Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions dataloom-backend/app/api/endpoints/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Data profiling endpoint — column statistics and quality scoring."""

import uuid

from fastapi import APIRouter, Depends
from sqlmodel import Session

from app import database
from app.api.dependencies import get_project_or_404
from app.utils.logging import get_logger
from app.utils.pandas_helpers import (
compute_quality_score,
get_column_profile,
read_file_safe,
)

logger = get_logger(__name__)
router = APIRouter()


@router.get("/{project_id}/profile")
async def get_dataset_profile(
    project_id: uuid.UUID,
    db: Session = Depends(database.get_db),
):
    """Produce a statistical profile of the dataset attached to a project.

    Loads the project's file, then reports row/column counts, the number
    of fully duplicated rows, a composite 0-100 quality score (penalising
    nulls and duplicates), and per-column statistics (dtype, null stats,
    unique count, numeric distribution).

    Args:
        project_id: UUID of the project to profile.

    Returns:
        JSON with total_rows, total_columns, duplicate_rows,
        quality_score (0-100), and per-column statistics.
    """
    project = get_project_or_404(project_id, db)
    frame = read_file_safe(project.file_path)

    row_count = len(frame)
    col_count = len(frame.columns)
    # df.duplicated() flags every repeat of an earlier row; sum() counts them.
    dup_count = int(frame.duplicated().sum())
    score = compute_quality_score(frame)
    column_stats = get_column_profile(frame)

    logger.info(
        "Profile generated: project=%s rows=%d cols=%d quality=%.1f",
        project_id,
        row_count,
        col_count,
        score,
    )

    return {
        "project_id": str(project_id),
        "total_rows": row_count,
        "total_columns": col_count,
        "duplicate_rows": dup_count,
        "quality_score": score,
        "columns": column_stats,
    }
2 changes: 1 addition & 1 deletion dataloom-backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Settings(BaseSettings):
database_url: str
upload_dir: str = "uploads"
max_upload_size_bytes: int = 10_485_760 # 10 MB
allowed_extensions: list[str] = [".csv"]
allowed_extensions: list[str] = [".csv", ".tsv", ".xlsx", ".xls", ".json", ".parquet"]
cors_origins: list[str] = ["http://localhost:3200"]
debug: bool = False

Expand Down
3 changes: 2 additions & 1 deletion dataloom-backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from app.api.endpoints import projects, transformations, user_logs
from app.api.endpoints import profiling, projects, transformations, user_logs
from app.config import get_settings
from app.exceptions import AppException, app_exception_handler
from app.services.transformation_service import TransformationError
Expand Down Expand Up @@ -59,6 +59,7 @@ async def transformation_error_handler(request: Request, exc: TransformationErro
# Mount the feature routers on the app.
# NOTE(review): projects, transformations, and profiling all share the
# /projects prefix — confirm their route paths do not overlap, since
# FastAPI resolves overlapping paths by registration order.
app.include_router(projects.router, prefix="/projects", tags=["projects"])
app.include_router(transformations.router, prefix="/projects", tags=["transformations"])
app.include_router(user_logs.router, prefix="/logs", tags=["user_logs"])
app.include_router(profiling.router, prefix="/projects", tags=["profiling"])

if __name__ == "__main__":
import uvicorn
Expand Down
50 changes: 39 additions & 11 deletions dataloom-backend/app/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,49 +8,77 @@

logger = get_logger(__name__)

SUPPORTED_FORMATS = {".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"}


def store_upload(file) -> tuple[Path, Path]:
    """Persist an uploaded file and create a working copy of it.

    The upload is written under a sanitized name, then duplicated as a
    sibling file named <stem>_copy<ext> that transformation operations
    may mutate, keeping the stored original pristine.

    Supports: CSV, Excel (.xlsx/.xls), JSON, Parquet, TSV.

    Args:
        file: The FastAPI UploadFile object.

    Returns:
        Tuple of (original_path, copy_path).

    Raises:
        ValueError: If the file extension is not supported.
    """
    safe_name = sanitize_filename(file.filename)
    suffix = Path(safe_name).suffix.lower()

    # Reject unsupported extensions before touching the filesystem.
    if suffix not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unsupported file format '{suffix}'. Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}"
        )

    original_path = resolve_upload_path(safe_name)
    with open(original_path, "wb+") as f:
        shutil.copyfileobj(file.file, f)

    # Build the copy name by inserting _copy before the extension
    # (e.g. data.xlsx -> data_copy.xlsx); with_name avoids the substring
    # pitfalls of str.replace on the full path.
    copy_path = original_path.with_name(f"{original_path.stem}_copy{suffix}")
    shutil.copy2(original_path, copy_path)

    logger.info("Stored upload: original=%s, copy=%s", original_path, copy_path)
    return original_path, copy_path


def get_original_path(copy_path: str) -> Path:
def get_original_path(copy_path: str | Path) -> Path:
"""Derive the original file path from a working copy path.

Args:
copy_path: Path to the _copy.csv working file.
copy_path: Path to the working copy file (str or Path).

Returns:
Path to the original CSV file.
Path to the original file.
"""
return Path(copy_path.replace("_copy.csv", ".csv"))


def delete_project_files(copy_path: str) -> None:
copy_path = Path(copy_path)
suffix = copy_path.suffix
# Remove the _copy suffix from the stem
stem = copy_path.stem
if stem.endswith("_copy"):
original_stem = stem[: -len("_copy")]
else:
# Fallback: return as-is to avoid silent data loss
logger.warning("copy_path does not end with _copy: %s", copy_path)
original_stem = stem
return copy_path.with_name(f"{original_stem}{suffix}")


def delete_project_files(copy_path: str | Path) -> None:
"""Delete both the working copy and original file for a project.

Args:
copy_path: Path to the _copy.csv working file.
copy_path: Path to the working copy file.
"""
original_path = get_original_path(copy_path)

Expand Down
Loading
Loading