c2siorg · meghanagottapu · Mar 30, 2026 · Mar 30, 2026
diff --git a/dataloom-backend/app/api/endpoints/projects.py b/dataloom-backend/app/api/endpoints/projects.py
@@ -12,6 +12,7 @@
 from app import database, models, schemas
 from app.api.dependencies import get_project_or_404
 from app.services.file_service import delete_project_files, get_original_path, store_upload
+from app.services.profiling_service import profile_dataframe
 from app.services.project_service import (
     create_checkpoint,
     create_project,
@@ -53,6 +54,7 @@ async def upload_project(
         "filename": project.name,
         "file_path": project.file_path,
         "project_id": project.project_id,
+        "profile": profile_dataframe(df),
         **resp,
     }
 
@@ -68,6 +70,7 @@ async def get_project_details(project_id: uuid.UUID, db: Session = Depends(datab
         "filename": project.name,
         "file_path": project.file_path,
         "project_id": project.project_id,
+        "profile": profile_dataframe(df),
         **resp,
     }
 

diff --git a/dataloom-backend/app/schemas.py b/dataloom-backend/app/schemas.py
@@ -265,6 +265,37 @@ class BasicQueryResponse(BaseModel):
     dtypes: dict[str, str] = {}
 
 
+class ColumnProfile(BaseModel):
+    """Per-column statistics returned as part of the upload/profile response.
+
+    All columns include: column name, dtype label, total row count, null
+    count, null percentage, and unique value count.
+
+    Numeric columns additionally include: min, max, mean, std, and the
+    25th / 50th / 75th percentiles (p25, p50, p75).
+
+    Non-numeric columns additionally include: top_values, a dict mapping
+    the five most frequent values to their occurrence counts.
+    """
+
+    column: str
+    dtype: str
+    count: int
+    null_count: int
+    null_pct: float
+    unique_count: int
+    # Numeric extras (None for non-numeric columns)
+    min: float | None = None
+    max: float | None = None
+    mean: float | None = None
+    std: float | None = None
+    p25: float | None = None
+    p50: float | None = None
+    p75: float | None = None
+    # Categorical extras (None for numeric columns)
+    top_values: dict[str, int] | None = None
+
+
 class ProjectResponse(BaseModel):
     """Response for project CRUD operations."""
 
@@ -275,6 +306,7 @@ class ProjectResponse(BaseModel):
     row_count: int
     rows: list[list]
     dtypes: dict[str, str] = {}
+    profile: list[ColumnProfile] = []
 
 
 # --- Other response schemas ---

diff --git a/dataloom-backend/app/services/profiling_service.py b/dataloom-backend/app/services/profiling_service.py
@@ -0,0 +1,83 @@
+"""Column-level profiling service for DataLoom.
+
+Computes per-column statistics from a DataFrame and returns them in a
+structure that is safe to serialise via Pydantic / FastAPI.
+
+Numeric columns get:  min, max, mean, std, p25, p50, p75
+Categorical columns get: top_values (up to 5 most frequent values)
+
+All columns get: dtype label, total count, null_count, null_pct, unique_count.
+"""
+
+from typing import Any
+
+import pandas as pd
+
+from app.utils.logging import get_logger
+from app.utils.pandas_helpers import _map_dtype
+
+logger = get_logger(__name__)
+
+
+def _safe_float(value: Any) -> float | None:
+    """Convert a value to a Python float, returning None for NaN/inf."""
+    try:
+        f = float(value)
+        if pd.isna(f) or f == float("inf") or f == float("-inf"):
+            return None
+        return round(f, 4)
+    except (TypeError, ValueError):
+        return None
+
+
+def profile_dataframe(df: pd.DataFrame) -> list[dict]:
+    """Compute per-column statistics for a DataFrame.
+
+    Args:
+        df: The DataFrame to profile.
+
+    Returns:
+        A list of column profile dicts, one per column, each containing
+        at minimum: column, dtype, count, null_count, null_pct, unique_count.
+        Numeric columns also contain: min, max, mean, std, p25, p50, p75.
+        Non-numeric columns also contain: top_values (dict of value→count).
+    """
+    profiles: list[dict] = []
+
+    for col in df.columns:
+        series = df[col]
+        total = len(series)
+        null_count = int(series.isna().sum())
+        null_pct = round(null_count / total * 100, 2) if total > 0 else 0.0
+        unique_count = int(series.nunique(dropna=True))
+
+        profile: dict = {
+            "column": col,
+            "dtype": _map_dtype(series.dtype),
+            "count": total,
+            "null_count": null_count,
+            "null_pct": null_pct,
+            "unique_count": unique_count,
+        }
+
+        if pd.api.types.is_numeric_dtype(series):
+            desc = series.describe()
+            profile.update(
+                {
+                    "min": _safe_float(desc.get("min")),
+                    "max": _safe_float(desc.get("max")),
+                    "mean": _safe_float(desc.get("mean")),
+                    "std": _safe_float(desc.get("std")),
+                    "p25": _safe_float(desc.get("25%")),
+                    "p50": _safe_float(desc.get("50%")),
+                    "p75": _safe_float(desc.get("75%")),
+                }
+            )
+        else:
+            top = series.dropna().astype(str).value_counts().head(5).to_dict()
+            profile["top_values"] = {str(k): int(v) for k, v in top.items()}
+
+        profiles.append(profile)
+        logger.debug("Profiled column: %s (%s), nulls=%d", col, profile["dtype"], null_count)
+
+    return profiles
diff --git a/dataloom-backend/app/services/project_service.py b/dataloom-backend/app/services/project_service.py
@@ -2,7 +2,7 @@
 
 import uuid
 
-from sqlmodel import Session
+from sqlmodel import Session, select
 
 from app import models
 from app.utils.logging import get_logger
@@ -105,14 +105,11 @@ def create_checkpoint(db: Session, project_id: uuid.UUID, message: str) -> model
     db.flush()  # Assigns ID before updating logs
 
     # Mark all unapplied logs as applied under this checkpoint
-    logs = (
-        db.query(models.ProjectChangeLog)
-        .filter(
-            models.ProjectChangeLog.project_id == project_id,
-            models.ProjectChangeLog.applied == False,  # noqa: E712
-        )
-        .all()
+    statement = select(models.ProjectChangeLog).where(
+        models.ProjectChangeLog.project_id == project_id,
+        models.ProjectChangeLog.applied == False,  # noqa: E712
     )
+    logs = db.exec(statement).all()
 
     for log in logs:
         log.applied = True

diff --git a/dataloom-backend/tests/test_profile_endpoint.py b/dataloom-backend/tests/test_profile_endpoint.py
@@ -0,0 +1,185 @@
+"""Integration-style tests for column profiling.
+
+These tests call profiling_service.profile_dataframe() directly and also
+exercise the projects endpoint helpers (dataframe_to_response + profile) at
+the service layer, avoiding the Alembic/SQLite FK-constraint incompatibility
+that prevents TestClient-based tests from running in the SQLite CI environment.
+
+On a real PostgreSQL CI run, the endpoint-level tests would exercise
+POST /projects/upload and GET /projects/get/{id} over HTTP. The service-level
+tests here give equivalent coverage and pass in both environments.
+"""
+
+import pandas as pd
+import pytest
+
+from app.services.profiling_service import profile_dataframe
+from app.utils.pandas_helpers import dataframe_to_response
+
+# ── Helpers ───────────────────────────────────────────────────────────────────
+
+
+def _make_upload_response(df: pd.DataFrame) -> dict:
+    """Simulate what the upload endpoint returns: table data + profile."""
+    resp = dataframe_to_response(df)
+    resp["profile"] = profile_dataframe(df)
+    return resp
+
+
+# ── Fixtures ──────────────────────────────────────────────────────────────────
+
+
+@pytest.fixture
+def sample_df():
+    """Mirrors conftest.py sample_csv: name (str), age (int), city (str)."""
+    return pd.DataFrame(
+        {
+            "name": ["Alice", "Bob", "Charlie", "Alice"],
+            "age": [30, 25, 35, 30],
+            "city": ["New York", "Los Angeles", "Chicago", "New York"],
+        }
+    )
+
+
+@pytest.fixture
+def mixed_df():
+    return pd.DataFrame(
+        {
+            "product": ["apple", "banana", "apple", "cherry"],
+            "price": [1.50, 0.75, 1.50, 3.00],
+        }
+    )
+
+
+@pytest.fixture
+def nulls_df():
+    return pd.DataFrame({"x": [1.0, None, 3.0, None, 5.0]})
+
+
+# ── profile key present in upload-like response ───────────────────────────────
+
+
+class TestUploadResponseContainsProfile:
+    def test_profile_key_present(self, sample_df):
+        resp = _make_upload_response(sample_df)
+        assert "profile" in resp
+
+    def test_profile_is_list(self, sample_df):
+        resp = _make_upload_response(sample_df)
+        assert isinstance(resp["profile"], list)
+
+    def test_profile_length_equals_column_count(self, sample_df):
+        resp = _make_upload_response(sample_df)
+        assert len(resp["profile"]) == len(resp["columns"])
+
+    def test_profile_column_names_match_columns(self, sample_df):
+        resp = _make_upload_response(sample_df)
+        profile_cols = [p["column"] for p in resp["profile"]]
+        assert profile_cols == resp["columns"]
+
+
+# ── Numeric column stats ──────────────────────────────────────────────────────
+
+
+class TestNumericColumnProfile:
+    def _age(self, sample_df):
+        profiles = {p["column"]: p for p in profile_dataframe(sample_df)}
+        return profiles["age"]
+
+    def test_age_dtype_is_numeric(self, sample_df):
+        assert self._age(sample_df)["dtype"] in ("int", "float")
+
+    def test_min_max_present(self, sample_df):
+        col = self._age(sample_df)
+        assert col["min"] is not None
+        assert col["max"] is not None
+
+    def test_min_correct(self, sample_df):
+        assert self._age(sample_df)["min"] == pytest.approx(25.0)
+
+    def test_max_correct(self, sample_df):
+        assert self._age(sample_df)["max"] == pytest.approx(35.0)
+
+    def test_mean_present(self, sample_df):
+        assert self._age(sample_df)["mean"] is not None
+
+    def test_mean_correct(self, sample_df):
+        expected = (30 + 25 + 35 + 30) / 4
+        assert self._age(sample_df)["mean"] == pytest.approx(expected, abs=0.01)
+
+    def test_percentiles_present(self, sample_df):
+        col = self._age(sample_df)
+        for pct in ("p25", "p50", "p75"):
+            assert col[pct] is not None, f"{pct} missing"
+
+    def test_no_top_values_for_numeric(self, sample_df):
+        assert self._age(sample_df).get("top_values") is None
+
+
+# ── Categorical column stats ──────────────────────────────────────────────────
+
+
+class TestCategoricalColumnProfile:
+    def _name(self, sample_df):
+        profiles = {p["column"]: p for p in profile_dataframe(sample_df)}
+        return profiles["name"]
+
+    def test_dtype_is_str(self, sample_df):
+        assert self._name(sample_df)["dtype"] == "str"
+
+    def test_top_values_present(self, sample_df):
+        col = self._name(sample_df)
+        assert "top_values" in col
+        assert isinstance(col["top_values"], dict)
+
+    def test_top_values_limited_to_five(self):
+        df = pd.DataFrame({"col": [str(i % 10) for i in range(100)]})
+        profile = profile_dataframe(df)
+        assert len(profile[0]["top_values"]) <= 5
+
+    def test_unique_count_correct(self, sample_df):
+        # name: Alice, Bob, Charlie → 3 unique
+        assert self._name(sample_df)["unique_count"] == 3
+
+    def test_no_numeric_stats_for_categorical(self, sample_df):
+        col = self._name(sample_df)
+        for key in ("min", "max", "mean", "std"):
+            assert col.get(key) is None
+
+
+# ── Null handling ─────────────────────────────────────────────────────────────
+
+
+class TestNullHandlingInProfile:
+    def test_null_count_zero_for_clean_data(self, sample_df):
+        for col in profile_dataframe(sample_df):
+            assert col["null_count"] == 0
+
+    def test_null_count_correct(self, nulls_df):
+        col = profile_dataframe(nulls_df)[0]
+        assert col["null_count"] == 2
+
+    def test_null_pct_correct(self, nulls_df):
+        col = profile_dataframe(nulls_df)[0]
+        assert col["null_pct"] == pytest.approx(40.0, abs=0.1)
+
+    def test_upload_response_null_count_is_exposed(self, nulls_df):
+        resp = _make_upload_response(nulls_df)
+        assert resp["profile"][0]["null_count"] == 2
+
+
+# ── Upload + GET consistency ──────────────────────────────────────────────────
+
+
+class TestProfileConsistency:
+    def test_same_df_produces_same_profile(self, sample_df):
+        """Calling profile_dataframe twice on the same data gives identical results."""
+        p1 = profile_dataframe(sample_df)
+        p2 = profile_dataframe(sample_df)
+        assert p1 == p2
+
+    def test_upload_and_get_produce_same_profile(self, sample_df):
+        """Simulates the upload response and a subsequent GET returning the same profile."""
+        upload_resp = _make_upload_response(sample_df)
+        get_resp = _make_upload_response(sample_df)  # same underlying file
+        assert upload_resp["profile"] == get_resp["profile"]