Skip to content

Commit 5fff32e

Browse files
authored
Merge pull request #2 from zimea/feat/aggregate-node-features
Feat/aggregate node features
2 parents 55e95f2 + f94b736 commit 5fff32e

File tree

7 files changed

+1590
-9
lines changed

7 files changed

+1590
-9
lines changed

docs/notebooks/multi_sample_comparison_of_node_features.ipynb

Lines changed: 1182 additions & 0 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ requires = [ "hatchling" ]
55
[project]
66
name = "spatial-sample-aggregation"
77
version = "0.0.1"
8-
description = "Aggregate spatial slides into sample-level statistyics"
8+
description = "Aggregate spatial slides into sample-level statistics"
99
readme = "README.md"
1010
license = { file = "LICENSE" }
1111
maintainers = [
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
1-
from .aggregate import basic_tool
1+
from .aggregate import aggregate_by_node
2+
from .compute_node_features import aggregate_by_group, compute_node_feature, get_neighbor_counts
Lines changed: 53 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
import pandas as pd
22
from anndata import AnnData
3+
from squidpy._constants._pkg_constants import Key
4+
from squidpy.gr._utils import _assert_categorical_obs, _assert_connectivity_key
5+
6+
from .compute_node_features import aggregate_by_group, compute_node_feature
37

48

59
def aggregate_by_edge(
6-
adata: AnnData, sample_key: str, annotation_key: str, use_edge_weight: bool = False
10+
adata: AnnData, library_key: str, annotation_key: str, use_edge_weight: bool = False
711
) -> pd.DataFrame:
812
"""
913
Aggregate spatial neighborhood graph taking into account neighbors
@@ -15,14 +19,56 @@ def aggregate_by_edge(
1519

1620

1721
def aggregate_by_node(
    adata: AnnData,
    *,
    library_key: str,
    cluster_key: str | None = None,  # TODO: annotation_key --> cluster_key to adapt to squidpy notation
    metric: str = "shannon",
    aggregation: str = "mean",  # TODO: new parameter --> check squidpy
    connectivity_key: str = "spatial_connectivities",  # TODO: new parameter
    key_added: str | None = None,
    **kwargs,
) -> None:
    """
    Compute a node-level metric and aggregate it by a sample group.

    Parameters
    ----------
    - adata: AnnData, input data
    - library_key: str, column in `adata.obs` to group by
    - cluster_key: Optional[str], cell type or similar annotation
    - metric: str, metric to compute ('shannon', 'degree', 'mean_distance')
    - aggregation: str, aggregation method ('mean', 'median', 'sum', 'none')
    - connectivity_key: str, adjacency matrix key
    - key_added: Optional[str], key under which aggregated results are stored in `adata.uns`. Defaults to `metric`.
    - kwargs: Additional parameters passed to metric computation functions.

    Returns
    -------
    - None (Results are stored in `adata.obs[key_added]` and the aggregated features are added in `adata.uns[key_added]` if aggregation is not None)
    """
    # Determine where to store the results (default to metric name)
    if key_added is None:
        key_added = metric

    # TODO: adapt to squidpy: connectivity_key = Key.obsp.spatial_conn(connectivity_key)
    # `cluster_key` is optional, so only validate it when provided; asserting on
    # None would raise even though the downstream aggregation handles None.
    if cluster_key is not None:
        _assert_categorical_obs(adata, cluster_key)
    _assert_connectivity_key(adata, connectivity_key)

    # Compute node-level feature
    node_features = compute_node_feature(
        adata, metric, connectivity_key=connectivity_key, cluster_key=cluster_key, library_key=library_key, **kwargs
    )

    # TODO: adapt to squidpy gr_utils _save_data(adata, attr="obs", key=Key.obs.feature(feature_column), data=node_features)
    adata.obs[key_added] = node_features  # TODO: store in obs here or in the individual functions

    # Aggregate the computed metric at the sample level
    aggregate_by_group(
        adata,
        library_key=library_key,
        node_feature_key=key_added,
        cluster_key=cluster_key,
        aggregation=aggregation,
        key_added=key_added,
    )
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
import numpy as np
2+
import pandas as pd
3+
import scipy
4+
from anndata import AnnData
5+
from scipy.stats import entropy
6+
from squidpy._utils import NDArrayA
7+
8+
9+
# TODO: this should go into squidpy/gr/_nhood.py
10+
def _get_neighbor_counts(
11+
data: NDArrayA,
12+
indices: NDArrayA,
13+
indptr: NDArrayA,
14+
cats: NDArrayA, # Array mapping cell indices to their types
15+
output: NDArrayA, # Shape: (n_cells, n_celltypes)
16+
) -> NDArrayA:
17+
indices_list = np.split(indices, indptr[1:-1])
18+
data_list = np.split(data, indptr[1:-1])
19+
for i in range(len(data_list)): # Iterate over cells
20+
cur_row = i # Each row corresponds to a cell
21+
cur_indices = indices_list[i]
22+
cur_data = data_list[i]
23+
for j, val in zip(cur_indices, cur_data, strict=False):
24+
cur_col = cats[j] # Column corresponds to cell type
25+
output[cur_row, cur_col] += val
26+
return output
27+
28+
29+
def get_neighbor_counts(
30+
adata, cluster_key="cell_type", connectivity_key="spatial_connectivities", key_added="composition_matrix"
31+
):
32+
"""Computes the number of each cell type in one-hop neighbors and stores it in adata.obsm['neighbor_counts']."""
33+
cats = adata.obs[cluster_key]
34+
mask = ~pd.isnull(cats).values
35+
cats = cats.loc[mask]
36+
if not len(cats):
37+
raise RuntimeError(f"After removing NaNs in `adata.obs[{cluster_key!r}]`, none remain.")
38+
39+
g = adata.obsp[connectivity_key]
40+
41+
if isinstance(g, scipy.sparse.coo_matrix):
42+
g = g.tocsr()
43+
g = g[mask, :][:, mask]
44+
n_cats = len(cats.cat.categories)
45+
46+
g_data = np.broadcast_to(1, shape=len(g.data))
47+
dtype = int if pd.api.types.is_bool_dtype(g.dtype) or pd.api.types.is_integer_dtype(g.dtype) else float
48+
output: NDArrayA = np.zeros((len(cats), n_cats), dtype=dtype)
49+
50+
neighbor_counts = _get_neighbor_counts(g_data, g.indices, g.indptr, cats.cat.codes.to_numpy(), output)
51+
52+
# adding the neighbor counts to adata.obsm
53+
# TODO: adapt to squidpy gr_utils _save_data(adata, attr="obsm", key=Key.obsm.feature(feature_column), data=node_features)
54+
adata.obsm[key_added] = neighbor_counts
55+
56+
return neighbor_counts
57+
58+
59+
def compute_node_feature(adata: AnnData, metric: str, connectivity_key: str, **kwargs) -> NDArrayA:
    """
    Compute a node-level feature based on the selected metric.

    Parameters
    ----------
    - adata: AnnData object
    - metric: str, the metric to compute ('shannon', 'degree', 'mean_distance')
    - connectivity_key: str, the key for the adjacency matrix in `adata.obsp`
    - kwargs: additional parameters for specific computations (e.g., `n_hops` for Shannon)

    Returns
    -------
    - np.ndarray: Node-level feature values indexed by cell ID, shaped (n_cells, 1)
    """
    # Dispatch table: metric name -> node-level feature function.
    dispatch = {
        "shannon": compute_shannon_diversity,
        "degree": calculate_degree,
        "mean_distance": calculate_mean_distance,
    }

    feature_fn = dispatch.get(metric)
    if feature_fn is None:
        raise ValueError(f"Unsupported metric: {metric}. Choose from 'shannon', 'degree', or 'mean_distance'")

    # Reshape to a column vector so every metric yields a consistent (n_cells, 1) array.
    values = feature_fn(adata, connectivity_key=connectivity_key, **kwargs)
    return values.reshape(-1, 1)
84+
85+
86+
def calculate_degree(adata: AnnData, connectivity_key: str, **kwargs) -> NDArrayA:
    """Compute the degree of each node as the row-sum of the adjacency matrix."""
    graph = adata.obsp[connectivity_key]
    return graph.sum(axis=1)
89+
90+
91+
def calculate_mean_distance(adata: AnnData, connectivity_key: str, **kwargs) -> NDArrayA:
    """Compute the mean distance to neighbors.

    NOTE(review): the mean is taken over *all* cells in the row — the dense
    zeros for non-neighbors are included, not just the connected neighbors.
    Confirm this is the intended semantics.
    """
    dense = adata.obsp[connectivity_key].toarray()
    return np.nanmean(dense, axis=1)
94+
95+
96+
def compute_shannon_diversity(
    adata: AnnData,
    connectivity_key: str = "spatial_connectivities",
    cluster_key: str = "cell_type",
    key_added: str = "composition_matrix",
    **kwargs,
) -> NDArrayA:
    """
    Compute Shannon diversity index for each node based on neighbor counts.

    Parameters
    ----------
    - adata: AnnData object
    - connectivity_key: str, key in adata.obsp corresponding to the adjacency matrix
    - cluster_key: str, column in adata.obs that contains categorical annotations (e.g., cell type)
    - key_added: str, obsm key under which the neighbor-count matrix is stored
    - kwargs: additional arguments (not used here but included for interface consistency)

    Returns
    -------
    - np.ndarray: Shannon diversity values indexed by cell ID; isolated nodes
      (no neighbors) are assigned 0.0 diversity
    """
    # Compute neighbor counts directly
    neighbor_counts = get_neighbor_counts(
        adata, cluster_key=cluster_key, connectivity_key=connectivity_key, key_added=key_added
    )

    # Normalize to probabilities. Guard isolated nodes (zero neighbor counts):
    # dividing by a zero row-sum would emit RuntimeWarnings and produce NaNs.
    totals = neighbor_counts.sum(axis=1, keepdims=True)
    probabilities = neighbor_counts / np.where(totals > 0, totals, 1)

    # Compute Shannon diversity (entropy), ignoring zero probabilities.
    # An all-zero row (isolated node) is defined to have zero diversity.
    def _row_entropy(p):
        p = p[p > 0]
        return entropy(p, base=2) if p.size else 0.0

    shannon_diversity = np.apply_along_axis(_row_entropy, 1, probabilities)

    return shannon_diversity.astype(np.float64)
129+
130+
131+
def aggregate_by_group(
    adata: AnnData,
    library_key: str,
    node_feature_key: str,
    cluster_key: str | None = None,
    aggregation: str = "mean",
    key_added: str = "aggregated_features",
) -> None:
    """
    Aggregate node-level features by a sample group and optionally by annotation.

    Parameters
    ----------
    - adata: AnnData object
    - library_key: str, column in `adata.obs` indicating the sample group
    - node_feature_key: str, column in `adata.obs` containing the node-level feature to aggregate
    - cluster_key: Optional[str], column in `adata.obs` for additional grouping (e.g., cell type)
    - aggregation: str, aggregation method ('mean', 'median', 'sum', None)
    - key_added: str, key under which results are stored in `adata.uns`

    Returns
    -------
    - None (Results are stored in `adata.uns[key_added]`)

    Raises
    ------
    - ValueError if a required column is missing from `adata.obs` or the
      aggregation method is not supported
    """
    if node_feature_key not in adata.obs.columns:
        raise ValueError(f"Column '{node_feature_key}' not found in adata.obs")

    if library_key not in adata.obs.columns:
        raise ValueError(f"Column '{library_key}' not found in adata.obs")

    if cluster_key and cluster_key not in adata.obs.columns:
        raise ValueError(f"Column '{cluster_key}' not found in adata.obs")

    # `None` means "skip aggregation": node-level values stay in `adata.obs` only.
    if aggregation is None:
        return

    # pandas accepts these method names directly in `.agg(...)`.
    supported_aggregations = {"mean", "median", "sum"}
    if aggregation not in supported_aggregations:
        raise ValueError(f"Unsupported aggregation method: {aggregation}")

    # Perform aggregation
    if cluster_key:
        aggregated = (
            adata.obs.groupby([library_key, cluster_key])[node_feature_key]
            .agg(aggregation)
            .unstack()  # Pivot so that cluster_key values become columns
        )
    else:
        aggregated = adata.obs.groupby(library_key)[node_feature_key].agg(aggregation)

    # TODO: adapt to squidpy save function
    adata.uns[key_added] = aggregated

tests/conftest.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import pytest
2+
import pandas as pd
3+
import numpy as np
4+
import scipy.sparse
5+
from anndata import AnnData
6+
7+
@pytest.fixture
def sample_adata():
    """Create a small AnnData object for testing: 2 samples with 10 cells each."""
    n_cells = 20

    # Observation table: repeating cell-type pattern, first 10 cells in S1,
    # next 10 in S2, plus a random node-level feature.
    obs = pd.DataFrame(
        {
            "cell_id": [f"cell_{i}" for i in range(n_cells)],
            "cell_type": ["A", "B", "C", "A", "B", "C", "A", "B", "C", "A"] * 2,
            "sample_id": ["S1"] * 10 + ["S2"] * 10,
            "node_feature": np.random.rand(n_cells),
        }
    ).set_index("cell_id")

    # One 10-cell connected component; the full graph is two disconnected
    # copies of it (one per sample) assembled block-diagonally.
    component = scipy.sparse.csr_matrix(
        [
            [0, 1, 0, 1, 0, 0, 0, 0, 0, 1],
            [1, 0, 1, 0, 1, 0, 0, 0, 0, 0],
            [0, 1, 0, 1, 0, 1, 0, 0, 0, 0],
            [1, 0, 1, 0, 0, 0, 1, 0, 0, 0],
            [0, 1, 0, 0, 0, 1, 0, 1, 0, 0],
            [0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
            [0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
            [0, 0, 0, 0, 1, 0, 1, 0, 1, 0],
            [0, 0, 0, 0, 0, 1, 0, 1, 0, 1],
            [1, 0, 0, 0, 0, 0, 0, 0, 1, 0],
        ]
    )
    adjacency_matrix = scipy.sparse.block_diag([component, component])

    # Assemble the AnnData object with categorical annotations, random spatial
    # coordinates, and the adjacency matrix.
    adata = AnnData(obs=obs)
    adata.obs["sample_id"] = adata.obs["sample_id"].astype("category")
    adata.obs["cell_type"] = adata.obs["cell_type"].astype("category")
    adata.obsm["spatial"] = np.random.rand(n_cells, 2)
    adata.obsp["spatial_connectivities"] = adjacency_matrix

    return adata

0 commit comments

Comments
 (0)