Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions dataloom-backend/app/api/endpoints/profiling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
"""Data profiling endpoint — column statistics and quality scoring."""

import uuid

from fastapi import APIRouter, Depends
from sqlmodel import Session

from app import database
from app.api.dependencies import get_project_or_404
from app.utils.logging import get_logger
from app.utils.pandas_helpers import (
compute_quality_score,
get_column_profile,
read_file_safe,
)

logger = get_logger(__name__)
router = APIRouter()


@router.get("/{project_id}/profile")
async def get_dataset_profile(
    project_id: uuid.UUID,
    db: Session = Depends(database.get_db),
):
    """Produce a statistical profile of the dataset attached to a project.

    Loads the project's file, then reports row/column counts, the number
    of fully duplicated rows, a composite 0-100 quality score (penalising
    nulls and duplicates), and per-column statistics (dtype, null stats,
    unique count, numeric distribution).

    Args:
        project_id: UUID of the project to profile.

    Returns:
        JSON with total_rows, total_columns, duplicate_rows,
        quality_score (0-100), and per-column statistics.
    """
    project = get_project_or_404(project_id, db)
    frame = read_file_safe(project.file_path)

    row_count = len(frame)
    col_count = len(frame.columns)
    # df.duplicated() flags every repeat of an earlier row; sum() counts them.
    dup_count = int(frame.duplicated().sum())
    score = compute_quality_score(frame)
    column_stats = get_column_profile(frame)

    logger.info(
        "Profile generated: project=%s rows=%d cols=%d quality=%.1f",
        project_id,
        row_count,
        col_count,
        score,
    )

    return {
        "project_id": str(project_id),
        "total_rows": row_count,
        "total_columns": col_count,
        "duplicate_rows": dup_count,
        "quality_score": score,
        "columns": column_stats,
    }
2 changes: 1 addition & 1 deletion dataloom-backend/app/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class Settings(BaseSettings):
database_url: str
upload_dir: str = "uploads"
max_upload_size_bytes: int = 10_485_760 # 10 MB
allowed_extensions: list[str] = [".csv"]
allowed_extensions: list[str] = [".csv", ".tsv", ".xlsx", ".xls", ".json", ".parquet"]
cors_origins: list[str] = ["http://localhost:3200"]
debug: bool = False

Expand Down
3 changes: 2 additions & 1 deletion dataloom-backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

from app.api.endpoints import projects, transformations, user_logs
from app.api.endpoints import profiling, projects, transformations, user_logs
from app.config import get_settings
from app.exceptions import AppException, app_exception_handler
from app.services.transformation_service import TransformationError
Expand Down Expand Up @@ -59,6 +59,7 @@ async def transformation_error_handler(request: Request, exc: TransformationErro
# Mount the feature routers on the app.
# NOTE(review): projects, transformations, and profiling all share the
# /projects prefix — confirm their route paths do not overlap, since
# FastAPI resolves overlapping paths by registration order.
app.include_router(projects.router, prefix="/projects", tags=["projects"])
app.include_router(transformations.router, prefix="/projects", tags=["transformations"])
app.include_router(user_logs.router, prefix="/logs", tags=["user_logs"])
app.include_router(profiling.router, prefix="/projects", tags=["profiling"])

if __name__ == "__main__":
import uvicorn
Expand Down
50 changes: 39 additions & 11 deletions dataloom-backend/app/services/file_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,49 +8,77 @@

logger = get_logger(__name__)

SUPPORTED_FORMATS = {".csv", ".xlsx", ".xls", ".json", ".parquet", ".tsv"}


def store_upload(file) -> tuple[Path, Path]:
    """Persist an uploaded file and create a working copy of it.

    The upload is written under a sanitized name, then duplicated as a
    sibling file named <stem>_copy<ext> that transformation operations
    may mutate, keeping the stored original pristine.

    Supports: CSV, Excel (.xlsx/.xls), JSON, Parquet, TSV.

    Args:
        file: The FastAPI UploadFile object.

    Returns:
        Tuple of (original_path, copy_path).

    Raises:
        ValueError: If the file extension is not supported.
    """
    safe_name = sanitize_filename(file.filename)
    suffix = Path(safe_name).suffix.lower()

    # Reject unsupported extensions before touching the filesystem.
    if suffix not in SUPPORTED_FORMATS:
        raise ValueError(
            f"Unsupported file format '{suffix}'. Supported formats: {', '.join(sorted(SUPPORTED_FORMATS))}"
        )

    original_path = resolve_upload_path(safe_name)
    with open(original_path, "wb+") as f:
        shutil.copyfileobj(file.file, f)

    # Build the copy name by inserting _copy before the extension
    # (e.g. data.xlsx -> data_copy.xlsx); with_name avoids the substring
    # pitfalls of str.replace on the full path.
    copy_path = original_path.with_name(f"{original_path.stem}_copy{suffix}")
    shutil.copy2(original_path, copy_path)

    logger.info("Stored upload: original=%s, copy=%s", original_path, copy_path)
    return original_path, copy_path


def get_original_path(copy_path: str) -> Path:
def get_original_path(copy_path: str | Path) -> Path:
"""Derive the original file path from a working copy path.

Args:
copy_path: Path to the _copy.csv working file.
copy_path: Path to the working copy file (str or Path).

Returns:
Path to the original CSV file.
Path to the original file.
"""
return Path(copy_path.replace("_copy.csv", ".csv"))


def delete_project_files(copy_path: str) -> None:
copy_path = Path(copy_path)
suffix = copy_path.suffix
# Remove the _copy suffix from the stem
stem = copy_path.stem
if stem.endswith("_copy"):
original_stem = stem[: -len("_copy")]
else:
# Fallback: return as-is to avoid silent data loss
logger.warning("copy_path does not end with _copy: %s", copy_path)
original_stem = stem
return copy_path.with_name(f"{original_stem}{suffix}")


def delete_project_files(copy_path: str | Path) -> None:
"""Delete both the working copy and original file for a project.

Args:
copy_path: Path to the _copy.csv working file.
copy_path: Path to the working copy file.
"""
original_path = get_original_path(copy_path)

Expand Down
Loading
Loading