nf-core · gregothebyteknight · Mar 11, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
@@ -8,7 +8,7 @@
 
 > Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311.
 
-## Base pipeliines
+## Base pipelines
 
 - [panpipes](https://doi.org/10.1101/2023.03.11.532085)
 
@@ -47,6 +47,13 @@
 
   > Cannoodt R, Zappia L, Morgan M, Deconinck L (2025). anndataR: AnnData interoperability in R. R package version 0.99.0
 
+- [scanpy](https://pubmed.ncbi.nlm.nih.gov/29409532/)
+
+  > Wolf, F. A., Angerer, P., & Theis, F. J. (2018). SCANPY: large-scale single-cell gene expression data analysis. Genome biology, 19(1), 15. https://doi.org/10.1186/s13059-017-1382-0
+
+- [scarches](https://pypi.org/project/scArches/)
+
+  > Lotfollahi, M., Naghipourfar, M., Luecken, M.D. et al. Mapping single-cell data to reference atlases by transfer learning. Nat Biotechnol 40, 121–130 (2022). https://doi.org/10.1038/s41587-021-01001-7
+
 ## Software packaging/containerisation tools
 
 - [Anaconda](https://anaconda.com)

@@ -59,6 +59,7 @@ Steps marked with the boat icon are not yet implemented. For the other steps, th
       - [BBKNN](https://github.com/Teichlab/bbknn)
       - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html)
       - [Seurat](https://satijalab.org/seurat/articles/integration_introduction)
+      - [PCA](https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.pca.html)
 3. Cell type annotation
    - [celltypist](https://www.celltypist.org/)
 4. Clustering and dimensionality reduction

@@ -0,0 +1,36 @@
+# EXPIMAP Pathway Database
+
+## Overview
+
+This directory contains pathway databases used by the EXPIMAP module for interpretable embedding and pathway analysis.
+
+## Files
+
+### pathways.gmt
+- **Format**: Gene Matrix Transposed (GMT)
+- **Content**: Reactome pathway gene sets
+- **Description**: Contains biological pathway gene sets from the Reactome database that are used to construct the latent space in EXPIMAP models. Each row represents a pathway with its associated genes.
+- **Source**: [Reactome Database](https://reactome.org/)
+- **Version/Date**: Please refer to Reactome documentation for version information
+- **Usage**: Used by the EXPIMAP module to guide the learning of interpretable latent representations in single-cell transcriptomics data
+
+## Usage in Pipeline
+
+The pathway file is automatically used by the EXPIMAP integration method. Users can specify a custom pathway file via the `expimap_gmt` parameter in the pipeline parameters, or use the default provided in this directory.
+
+Example:
+```bash
+nextflow run nf-core/scdownstream \
+  --input samplesheet.csv \
+  --outdir results \
+  --integration_methods expimap \
+  --expimap_gmt assets/databases/expimap/pathways.gmt
+```
+
+## Notes
+
+- The GMT format contains tab-separated values where each line represents one pathway:
+  - First column: pathway name
+  - Second column: pathway URL/description
+  - Remaining columns: genes in the pathway
+- The default Reactome pathways are suitable for most analyses of mammalian cells (especially human and mouse)
+- For custom pathway analysis, users can provide their own GMT files using the `--expimap_gmt` parameter
@@ -400,6 +400,24 @@ process {
         ]
     }
 
+    // Publishing rules for the SCANPY_PCA process: outputs go to
+    // <outdir>/combine/integrate/pca, and only when params.save_intermediates is set.
+    withName: SCANPY_PCA {
+        publishDir = [
+            path: { "${params.outdir}/combine/integrate/pca" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_intermediates,
+            // Returning null from saveAs skips publishing of versions.yml.
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
+    }
+
+    // Publishing rules for the EXPIMAP integration process: outputs go to
+    // <outdir>/combine/integrate/expimap, and only when params.save_intermediates is set.
+    // The selector is a regular expression so that it matches the process both
+    // under its declared name (SCARCHES_EXPIMAP) and when it is included under
+    // the alias EXPIMAP; a plain `EXPIMAP` selector only matches the alias.
+    withName: '(SCARCHES_)?EXPIMAP' {
+        publishDir = [
+            path: { "${params.outdir}/combine/integrate/expimap" },
+            mode: params.publish_dir_mode,
+            enabled: params.save_intermediates,
+            // Returning null from saveAs skips publishing of versions.yml.
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+        ]
+    }
+
     withName: '.*:SCIMILARITY:UNTAR' {
         publishDir = [
             enabled: false

@@ -35,6 +35,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d
       - [BBKNN](https://github.com/Teichlab/bbknn)
       - [Combat](https://scanpy.readthedocs.io/en/latest/api/generated/scanpy.pp.combat.html)
       - [Seurat](https://satijalab.org/seurat/articles/integration_introduction)
+      - [PCA](https://scanpy.readthedocs.io/en/stable/generated/scanpy.pp.pca.html)
+      - [Expimap](https://docs.scarches.org/en/latest/api/models.html#scarches.models.EXPIMAP)
 3. Cell type annotation
    - [celltypist](https://www.celltypist.org/)
    - [singleR](https://www.bioconductor.org/packages/release/bioc/html/SingleR.html)

@@ -56,6 +56,7 @@ workflow NFCORE_SCDOWNSTREAM {
     scvi_categorical_covariates   //   value: string
     scvi_continuous_covariates    //   value: string
     scimilarity_model             //   value: string
+    expimap_gmt                   //   value: string
     skip_liana                    //   value: boolean
     skip_rankgenesgroups          //   value: boolean
     base_embeddings               //   value: string
@@ -104,6 +105,7 @@ workflow NFCORE_SCDOWNSTREAM {
         scvi_categorical_covariates,
         scvi_continuous_covariates,
         scimilarity_model,
+        expimap_gmt,
         skip_liana,
         skip_rankgenesgroups,
         base_embeddings,
@@ -179,6 +181,7 @@ workflow {
         params.scvi_categorical_covariates,
         params.scvi_continuous_covariates,
         params.scimilarity_model,
+        params.expimap_gmt,
         params.skip_liana,
         params.skip_rankgenesgroups,
         params.base_embeddings,

@@ -14,7 +14,7 @@
 
 adata = ad.read_h5ad("${h5ad}")
 
-integration_methods = ["harmony", "scvi", "scanvi", "scimilarity", "seurat", "bbknn", "combat"]
+integration_methods = ["harmony", "scvi", "scanvi", "scimilarity", "seurat", "bbknn", "combat", "pca", "expimap"]
 
 for integration in integration_methods:
     embedding_key = f"X_{integration}"

@@ -4,6 +4,5 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - conda-forge::python=3.12.11
-  - conda-forge::pyyaml=6.0.2
-  - conda-forge::scanpy=1.11.2
+  - conda-forge::pyyaml=6.0.3
+  - conda-forge::scanpy=1.11.5
@@ -4,8 +4,8 @@ process SCANPY_PCA {
 
     conda "${moduleDir}/environment.yml"
     container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
-        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/16/168ecbbe27ccef766741ccbf937b0d2675be2e19b0565035e0719f1e9ea5ee95/data'
-        : 'community.wave.seqera.io/library/python_pyyaml_scanpy:b5509a698e9aae25'}"
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/fd/fd27aeaf160eaba9a58c029e08f1da74051aa292c2fb043a5dd68fddcde3af93/data'
+        : 'community.wave.seqera.io/library/pyyaml_scanpy:3c9e9f631f45553d'}"
 
     input:
     tuple val(meta), path(h5ad)

@@ -0,0 +1,9 @@
+# Conda environment definition; scArches and anndata are installed through pip.
+# Presumably this is the environment.yml of the SCARCHES_EXPIMAP module — verify.
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # NOTE(review): python=3.14 is very recent while anndata==0.9.2 is an old pin —
+  # confirm that this combination resolves and imports cleanly.
+  - python=3.14
+  - pip
+  - pip:
+    # NOTE(review): the expimap.py template also imports yaml, threadpoolctl,
+    # scanpy, pandas and torch, none of which are listed here — confirm they are
+    # all pulled in transitively by scArches.
+    - scArches==0.6.1
+    - anndata==0.9.2
@@ -0,0 +1,35 @@
+// Runs the expimap.py template on one h5ad file together with a reference
+// (gene program) file, emitting an h5ad, a pickled embedding and versions.yml.
+process SCARCHES_EXPIMAP {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    // NOTE(review): the Singularity URL below points at a Wave build *view* page,
+    // whereas SCANPY_PCA uses a community-cr-prod.seqera.io blob URL — confirm
+    // that this URL is directly pullable as an image.
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+            ? 'https://wave.seqera.io/view/builds/bd-e532922f69f9a648_1'
+            : 'community.wave.seqera.io/library/pip_scarches:7e8c7e577326f6ad'}"
+
+    input:
+    // Exactly one h5ad file per sample (arity: 1).
+    tuple val(meta), path(h5ad, arity: 1)
+    // Reference file handed to the template as "reference_model"; the template
+    // raises an error when this is empty.
+    tuple val(meta2), path(reference_model)
+    val(batch_col)
+    val(counts_layer)
+
+    output:
+    tuple val(meta), path("${prefix}.h5ad"), emit: h5ad
+    path "X_${prefix}.pkl", emit: obsm
+    path "versions.yml", emit: versions
+
+    script:
+    // prefix is deliberately not declared with `def` so that it is also visible
+    // in the output block above.
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // Guard against the output file overwriting the staged input file.
+    if ("${prefix}.h5ad" == "${h5ad}") {
+        error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
+    }
+    template('expimap.py')
+
+    stub:
+    // Creates empty placeholder files for every declared output.
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.h5ad
+    touch X_${prefix}.pkl
+    touch versions.yml
+    """
+}
@@ -0,0 +1,69 @@
+# Module metadata for SCARCHES_EXPIMAP; descriptions mirror what the process
+# definition and its expimap.py template actually read and write.
+name: "scarches_expimap"
+description: "Train EXPIMAP model from scArches for interpretable embedding and pathway analysis"
+keywords:
+  - scarches
+  - expimap
+  - interpretable embeddings
+  - pathway analysis
+  - single-cell
+tools:
+  - "scarches":
+      description: "Transfer learning for single-cell genomics"
+      homepage: "https://github.com/theislab/scarches"
+      documentation: "https://docs.scarches.org"
+      tool_dev_url: "https://github.com/theislab/scarches"
+      doi: "10.1038/s41587-021-01001-7"
+      licence: ["BSD-3-Clause"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample metadata
+        e.g. `[ id:'sample1' ]`
+
+  - h5ad:
+      type: file
+      pattern: "*.h5ad"
+      description: "Annotated data matrix (h5ad format)"
+
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference model metadata
+        e.g. `[ id:'reference' ]`
+
+  - reference_model:
+      type: file
+      pattern: "*.{gmt,gmx}"
+      description: "Gene program annotation file (GMT/GMX format) providing prior biological knowledge. The module raises an error if it is not provided"
+      required: true
+
+  - batch_col:
+      type: string
+      description: "Column name in adata.obs containing batch/condition information"
+
+  - counts_layer:
+      type: string
+      description: "Layer name containing count data. Use 'X' to take the counts from adata.X (expression matrix)"
+
+output:
+  - h5ad:
+      type: file
+      pattern: "*.h5ad"
+      description: "Output h5ad file with EXPIMAP embedding in obsm['X_emb']"
+
+  - obsm:
+      type: file
+      pattern: "X_*.pkl"
+      description: "Pickled pandas DataFrame containing the EXPIMAP latent embedding, indexed by observation names"
+
+  - versions:
+      type: file
+      pattern: "versions.yml"
+      description: "File containing software versions"
+
+authors:
+  - "@gregothebyteknight"
+maintainers:
+  - "@gregothebyteknight"
@@ -0,0 +1,69 @@
+#!/usr/bin/env python3
+
+import os
+import platform
+import yaml
+
+os.environ["MPLCONFIGDIR"] = "./tmp/mpl"
+os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba"
+
+import scarches as sca
+import pandas as pd
+import scanpy as sc
+import torch
+
+from threadpoolctl import threadpool_limits
+threadpool_limits(int("${task.cpus}"))
+torch.set_num_threads(int("${task.cpus}"))
+
+adata = sc.read_h5ad("${h5ad}")
+
+adata_processing = adata.copy()
+
+if "${counts_layer}" != "X":
+    adata_processing.X = adata.layers["${counts_layer}"]
+
+# Prior biological knowledge in form of gene programs
+if "${reference_model}":
+    sca.utils.add_annotations(adata_processing, "${reference_model}", min_genes=12, clean=True)
+else:
+    raise ValueError("Reference model is required for EXPIMAP. Please provide a path to the reference model.")
+
+# Initialization of the model with the reference network
+intr_cvae = sca.models.EXPIMAP(
+    adata=adata_processing,
+    condition_key="${batch_col}",
+    hidden_layer_sizes=[256, 256, 256],
+    recon_loss="nb"
+)
+
+# Train the model
+intr_cvae.train(
+    n_epochs=400,
+    alpha_epoch_anneal=100,
+    alpha=0.7,
+    alpha_kl=0.5,
+    use_early_stopping=True
+)
+
+# Extract the interpretable latent representation
+emb = intr_cvae.get_latent(only_active=True)
+adata.obsm['X_emb'] = emb  
+
+adata.write_h5ad("${prefix}.h5ad")
+df = pd.DataFrame(emb, index=adata.obs_names)
+df.to_pickle("X_${prefix}.pkl")
+
+# Versions
+versions = {
+    "${task.process}": {
+        "python": platform.python_version(),
+        "scanpy": sc.__version__,
+        "pandas": pd.__version__,
+        "scarches": sca.__version__
+    }
+}
+
+with open("versions.yml", "w") as f:
+    yaml.dump(versions, f)
+