Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 12 additions & 26 deletions dataloom-backend/app/api/endpoints/projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,38 +93,24 @@ async def save_project(
commit_message: str,
db: Session = Depends(database.get_db),
):
"""Save project changes as a checkpoint.
"""Save the current working dataset as a checkpoint.

Replays all pending transformations from the change log onto the original
file and creates a checkpoint record marking the save point.
The working copy already reflects the user's latest accepted transforms, so
checkpoint creation should preserve that current dataset and only update the
checkpoint/log metadata for pending actions.
"""
project = get_project_or_404(project_id, db)

# Load original file for replaying transformations
original_path = get_original_path(project.file_path)
df = read_csv_safe(original_path)

# Get all unapplied logs for this project
logs = (
db.query(models.ProjectChangeLog)
.filter(
models.ProjectChangeLog.project_id == project_id,
models.ProjectChangeLog.applied == False, # noqa: E712
if project.file_path == str(original_path):
logger.error(
"Project working copy unexpectedly points at original file: id=%s path=%s",
project_id,
project.file_path,
)
.order_by(models.ProjectChangeLog.timestamp)
.all()
)

# Replay each logged transformation on the original
for log in logs:
df = apply_logged_transformation(df, log.action_type, log.action_details)

# Write transformations to the working copy (.copy.csv), not the original dataset.
# original_path must remain immutable as the baseline used for transformation replay.
assert project.file_path != str(original_path), (
"Invariant violation: attempted to write transformed data to original_path."
)
save_csv_safe(df, project.file_path)
raise HTTPException(status_code=500, detail="Project working copy is misconfigured")

df = read_csv_safe(project.file_path)

# Create checkpoint (marks logs as applied)
checkpoint = create_checkpoint(db, project_id, commit_message)
Expand Down
52 changes: 52 additions & 0 deletions dataloom-backend/tests/test_save_revert.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,19 @@
"""Tests for save and revert logic in the project service."""

import asyncio
import shutil

import pandas as pd

from app import models
from app.api.endpoints.projects import save_project
from app.services.project_service import (
create_checkpoint,
create_project,
log_transformation,
)
from app.services.transformation_service import add_column, rename_column
from app.utils.pandas_helpers import read_csv_safe, save_csv_safe


class TestCheckpoint:
Expand Down Expand Up @@ -41,3 +50,46 @@ def test_checkpoint_message(self, db):

checkpoint = create_checkpoint(db, project.project_id, "My save message")
assert checkpoint.message == "My save message"


class TestSaveEndpointRegressions:
    def test_second_save_preserves_previously_checkpointed_transforms(self, db, tmp_path):
        """Regression: a second save must not discard transforms captured by the first checkpoint."""
        source_csv = tmp_path / "sample.csv"
        working_csv = tmp_path / "sample_copy.csv"

        # Seed an original dataset and clone it into the project's working copy.
        seed = {
            "name": ["Alice", "Bob"],
            "age": [30, 25],
            "city": ["New York", "Los Angeles"],
        }
        pd.DataFrame(seed).to_csv(source_csv, index=False)
        shutil.copy2(source_csv, working_csv)

        project = create_project(db, "Cumulative Save", str(working_csv), "Regression for repeated saves")

        # First transform: rename column 1 to "years", persist it, and log the action.
        df = read_csv_safe(project.file_path)
        save_csv_safe(rename_column(df, 1, "years"), project.file_path)
        log_transformation(
            db,
            project.project_id,
            "renameCol",
            {"rename_col_params": {"col_index": 1, "new_name": "years"}},
        )

        after_first = asyncio.run(save_project(project.project_id, "first checkpoint", db))
        assert after_first["columns"] == ["name", "years", "city"]

        # Second transform: append a "country" column on top of the checkpointed state.
        df = read_csv_safe(project.file_path)
        save_csv_safe(add_column(df, 3, "country"), project.file_path)
        log_transformation(
            db,
            project.project_id,
            "addCol",
            {"add_col_params": {"index": 3, "name": "country"}},
        )

        # The second save must keep the earlier rename AND the new column,
        # both in the endpoint response and in the working file on disk.
        after_second = asyncio.run(save_project(project.project_id, "second checkpoint", db))
        expected = ["name", "years", "city", "country"]
        assert after_second["columns"] == expected
        assert read_csv_safe(project.file_path).columns.tolist() == expected
Loading