|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +import scipy |
| 4 | +from anndata import AnnData |
| 5 | +from scipy.stats import entropy |
| 6 | +from squidpy._utils import NDArrayA |
| 7 | + |
| 8 | + |
| 9 | +# TODO: this should go into squidpy/gr/_nhood.py |
| 10 | +def _get_neighbor_counts( |
| 11 | + data: NDArrayA, |
| 12 | + indices: NDArrayA, |
| 13 | + indptr: NDArrayA, |
| 14 | + cats: NDArrayA, # Array mapping cell indices to their types |
| 15 | + output: NDArrayA, # Shape: (n_cells, n_celltypes) |
| 16 | +) -> NDArrayA: |
| 17 | + indices_list = np.split(indices, indptr[1:-1]) |
| 18 | + data_list = np.split(data, indptr[1:-1]) |
| 19 | + for i in range(len(data_list)): # Iterate over cells |
| 20 | + cur_row = i # Each row corresponds to a cell |
| 21 | + cur_indices = indices_list[i] |
| 22 | + cur_data = data_list[i] |
| 23 | + for j, val in zip(cur_indices, cur_data, strict=False): |
| 24 | + cur_col = cats[j] # Column corresponds to cell type |
| 25 | + output[cur_row, cur_col] += val |
| 26 | + return output |
| 27 | + |
| 28 | + |
| 29 | +def get_neighbor_counts( |
| 30 | + adata, cluster_key="cell_type", connectivity_key="spatial_connectivities", key_added="composition_matrix" |
| 31 | +): |
| 32 | + """Computes the number of each cell type in one-hop neighbors and stores it in adata.obsm['neighbor_counts'].""" |
| 33 | + cats = adata.obs[cluster_key] |
| 34 | + mask = ~pd.isnull(cats).values |
| 35 | + cats = cats.loc[mask] |
| 36 | + if not len(cats): |
| 37 | + raise RuntimeError(f"After removing NaNs in `adata.obs[{cluster_key!r}]`, none remain.") |
| 38 | + |
| 39 | + g = adata.obsp[connectivity_key] |
| 40 | + |
| 41 | + if isinstance(g, scipy.sparse.coo_matrix): |
| 42 | + g = g.tocsr() |
| 43 | + g = g[mask, :][:, mask] |
| 44 | + n_cats = len(cats.cat.categories) |
| 45 | + |
| 46 | + g_data = np.broadcast_to(1, shape=len(g.data)) |
| 47 | + dtype = int if pd.api.types.is_bool_dtype(g.dtype) or pd.api.types.is_integer_dtype(g.dtype) else float |
| 48 | + output: NDArrayA = np.zeros((len(cats), n_cats), dtype=dtype) |
| 49 | + |
| 50 | + neighbor_counts = _get_neighbor_counts(g_data, g.indices, g.indptr, cats.cat.codes.to_numpy(), output) |
| 51 | + |
| 52 | + # adding the neighbor counts to adata.obsm |
| 53 | + # TODO: adapt to squidpy gr_utils _save_data(adata, attr="obsm", key=Key.obsm.feature(feature_column), data=node_features) |
| 54 | + adata.obsm[key_added] = neighbor_counts |
| 55 | + |
| 56 | + return neighbor_counts |
| 57 | + |
| 58 | + |
| 59 | +def compute_node_feature(adata: AnnData, metric: str, connectivity_key: str, **kwargs) -> NDArrayA: |
| 60 | + """ |
| 61 | + Compute a node-level feature based on the selected metric. |
| 62 | +
|
| 63 | + Parameters |
| 64 | + ---------- |
| 65 | + - adata: AnnData object |
| 66 | + - metric: str, the metric to compute ('shannon', 'degree', 'mean_distance') |
| 67 | + - connectivity_key: str, the key for the adjacency matrix in `adata.obsp` |
| 68 | + - kwargs: additional parameters for specific computations (e.g., `n_hops` for Shannon) |
| 69 | +
|
| 70 | + Returns |
| 71 | + ------- |
| 72 | + - np.ndarray: Node-level feature values indexed by cell ID |
| 73 | + """ |
| 74 | + node_feature_functions = { |
| 75 | + "shannon": compute_shannon_diversity, |
| 76 | + "degree": calculate_degree, |
| 77 | + "mean_distance": calculate_mean_distance, |
| 78 | + } |
| 79 | + |
| 80 | + if metric not in node_feature_functions: |
| 81 | + raise ValueError(f"Unsupported metric: {metric}. Choose from 'shannon', 'degree', or 'mean_distance'") |
| 82 | + |
| 83 | + return node_feature_functions[metric](adata, connectivity_key=connectivity_key, **kwargs).reshape(-1, 1) |
| 84 | + |
| 85 | + |
| 86 | +def calculate_degree(adata: AnnData, connectivity_key: str, **kwargs) -> NDArrayA: |
| 87 | + """Compute the degree of each node.""" |
| 88 | + return adata.obsp[connectivity_key].sum(axis=1) |
| 89 | + |
| 90 | + |
| 91 | +def calculate_mean_distance(adata: AnnData, connectivity_key: str, **kwargs) -> NDArrayA: |
| 92 | + """Compute the mean distance to neighbors.""" |
| 93 | + return np.nanmean(adata.obsp[connectivity_key].toarray(), axis=1) |
| 94 | + |
| 95 | + |
| 96 | +def compute_shannon_diversity( |
| 97 | + adata: AnnData, |
| 98 | + connectivity_key: str = "spatial_connectivities", |
| 99 | + cluster_key: str = "cell_type", |
| 100 | + key_added: str = "composition_matrix", |
| 101 | + **kwargs, |
| 102 | +) -> NDArrayA: |
| 103 | + """ |
| 104 | + Compute Shannon diversity index for each node based on neighbor counts. |
| 105 | +
|
| 106 | + Parameters |
| 107 | + ---------- |
| 108 | + - adata: AnnData object |
| 109 | + - connectivity_key: str, key in adata.obsp corresponding to the adjacency matrix |
| 110 | + - cluster_key: str, column in adata.obs that contains categorical annotations (e.g., cell type) |
| 111 | + - kwargs: additional arguments (not used here but included for interface consistency) |
| 112 | +
|
| 113 | + Returns |
| 114 | + ------- |
| 115 | + - np.ndarray: Shannon diversity values indexed by cell ID |
| 116 | + """ |
| 117 | + # Compute neighbor counts directly |
| 118 | + neighbor_counts = get_neighbor_counts( |
| 119 | + adata, cluster_key=cluster_key, connectivity_key=connectivity_key, key_added=key_added |
| 120 | + ) |
| 121 | + |
| 122 | + # Normalize to probabilities |
| 123 | + probabilities = neighbor_counts / neighbor_counts.sum(axis=1, keepdims=True) |
| 124 | + |
| 125 | + # Compute Shannon diversity (entropy), ignoring zero probabilities |
| 126 | + shannon_diversity = np.apply_along_axis(lambda p: entropy(p[p > 0], base=2), 1, probabilities) |
| 127 | + |
| 128 | + return shannon_diversity.astype(np.float64) |
| 129 | + |
| 130 | + |
| 131 | +def aggregate_by_group( |
| 132 | + adata: AnnData, |
| 133 | + library_key: str, |
| 134 | + node_feature_key: str, |
| 135 | + cluster_key: str | None = None, |
| 136 | + aggregation: str = "mean", |
| 137 | + key_added: str = "aggregated_features", |
| 138 | +) -> None: |
| 139 | + """ |
| 140 | + Aggregate node-level features by a sample group and optionally by annotation. |
| 141 | +
|
| 142 | + Parameters |
| 143 | + ---------- |
| 144 | + - adata: AnnData object |
| 145 | + - library_key: str, column in `adata.obs` indicating the sample group |
| 146 | + - node_feature_key: str, column in `adata.obs` containing the node-level feature to aggregate |
| 147 | + - cluster_key: Optional[str], column in `adata.obs` for additional grouping (e.g., cell type) |
| 148 | + - aggregation: str, aggregation method ('mean', 'median', 'sum', None) |
| 149 | + - key_added: str, key under which results are stored in `adata.uns` |
| 150 | +
|
| 151 | + Returns |
| 152 | + ------- |
| 153 | + - None (Results are stored in `adata.uns[output_key]`) |
| 154 | + """ |
| 155 | + if node_feature_key not in adata.obs.columns: |
| 156 | + raise ValueError(f"Column '{node_feature_key}' not found in adata.obs") |
| 157 | + |
| 158 | + if library_key not in adata.obs.columns: |
| 159 | + raise ValueError(f"Column '{library_key}' not found in adata.obs") |
| 160 | + |
| 161 | + if cluster_key and cluster_key not in adata.obs.columns: |
| 162 | + raise ValueError(f"Column '{cluster_key}' not found in adata.obs") |
| 163 | + |
| 164 | + # Select the aggregation function |
| 165 | + agg_methods = { |
| 166 | + "mean": "mean", |
| 167 | + "median": "median", |
| 168 | + "sum": "sum", |
| 169 | + } |
| 170 | + |
| 171 | + if aggregation is None: |
| 172 | + return |
| 173 | + |
| 174 | + if aggregation not in agg_methods: |
| 175 | + raise ValueError(f"Unsupported aggregation method: {aggregation}") |
| 176 | + |
| 177 | + # Perform aggregation |
| 178 | + if cluster_key: |
| 179 | + aggregated = ( |
| 180 | + adata.obs.groupby([library_key, cluster_key])[node_feature_key] |
| 181 | + .agg(agg_methods[aggregation]) |
| 182 | + .unstack() # Pivot so that annotation_key values become columns |
| 183 | + ) |
| 184 | + else: |
| 185 | + aggregated = adata.obs.groupby(library_key)[node_feature_key].agg(agg_methods[aggregation]) |
| 186 | + |
| 187 | + # TODO: adapt to squidpy save function |
| 188 | + adata.uns[key_added] = aggregated |
0 commit comments