Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
EBD_DOCX_FILE=./edi_energy_mirror/edi_energy_de/FV2410/Entscheidungsbaum-DiagrammeundCodelisten-informatorischeLesefassung3.5KonsolidierteLesefassungmitFehlerkorrekturenStand31.07.2024_20250403_20240403.docx
OUTPUT_DIR=./ebd_toolchain/machine-readable_entscheidungsbaumdiagramme/FV2404
KROKI_PORT=8125
KROKI_HOST=kroki
KROKI_HOST=kroki
# Optional: GitHub token to download AHB DB from Hochfrequenz/xml-migs-and-ahbs releases for Prüfidentifikator mapping
GITHUB_TOKEN=
# Optional: Format version filter for Pruefi mapping, e.g. FV2504, FV2610
FORMAT_VERSION=
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ classifiers = [
]
dependencies = [
"ebdamame>=1.0.0",
"rebdhuhn>=1.0.0",
"rebdhuhn>=1.2.0",
"click",
"pydantic-settings"
"pydantic-settings",
"fundamend[sqlmodels]>=0.34.0",
"py7zr",
# add all the dependencies here
]
dynamic = ["readme", "version"]
Expand All @@ -34,7 +36,7 @@ linting = ["pylint==4.0.5"]
spellcheck = ["codespell==2.4.2"]
test_packaging = ["build==1.4.2", "twine==6.2.0"]
tests = ["pytest==9.0.2"]
type_check = ["mypy==1.19.1"]
type_check = ["mypy==1.19.1", "types-requests"]

[project.urls]
Changelog = "https://github.com/Hochfrequenz/ebd_toolchain/releases"
Expand Down
43 changes: 38 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,59 +2,92 @@
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --cert=None --client-cert=None --index-url=None --pip-args=None pyproject.toml
# pip-compile --output-file=requirements.txt pyproject.toml
#
annotated-types==0.7.0
# via pydantic
brotli==1.2.0
# via py7zr
certifi==2026.2.25
# via requests
charset-normalizer==3.4.6
charset-normalizer==3.4.7
# via requests
click==8.3.1
# via
# ebd-toolchain (pyproject.toml)
# ebdamame
colorama==0.4.6
# via click
ebdamame==1.0.0
# via ebd-toolchain (pyproject.toml)
efoli==2.2.0
# via
# fundamend
# rebdhuhn
fundamend[sqlmodels]==0.34.1
# via ebd-toolchain (pyproject.toml)
idna==3.11
# via requests
inflate64==1.0.4
# via py7zr
lxml==6.0.2
# via
# python-docx
# rebdhuhn
# svgutils
more-itertools==10.8.0
# via ebdamame
multivolumefile==0.2.3
# via py7zr
networkx==3.6.1
# via rebdhuhn
psutil==7.2.2
# via py7zr
py7zr==1.1.0
# via ebd-toolchain (pyproject.toml)
pybcj==1.0.7
# via py7zr
pycryptodomex==3.23.0
# via py7zr
pydantic==2.12.5
# via
# ebdamame
# fundamend
# pydantic-settings
# rebdhuhn
# sqlmodel
pydantic-core==2.41.5
# via pydantic
pydantic-settings==2.13.1
# via ebd-toolchain (pyproject.toml)
pyppmd==1.3.1
# via py7zr
python-docx==1.2.0
# via ebdamame
python-dotenv==1.2.2
# via pydantic-settings
rebdhuhn==1.1.0
pytz==2026.1.post1
# via efoli
rebdhuhn==1.2.0
# via
# ebd-toolchain (pyproject.toml)
# ebdamame
requests==2.33.1
# via rebdhuhn
sqlalchemy==2.0.48
# via
# fundamend
# sqlmodel
sqlmodel==0.0.37
# via fundamend
svgutils==0.3.4
# via rebdhuhn
texttable==1.7.0
# via py7zr
typing-extensions==4.15.0
# via
# pydantic
# pydantic-core
# python-docx
# sqlalchemy
# typing-inspection
typing-inspection==0.4.2
# via
Expand Down
127 changes: 127 additions & 0 deletions src/ebd_toolchain/ahb_pruefi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""
This module downloads the AHB SQLite database from the xml-migs-and-ahbs GitHub release
and queries the EBD-to-Pruefidentifikator mapping from the v_ahbtabellen view.
"""

import json
import logging
import tempfile
from pathlib import Path

import py7zr
import requests
from efoli import EdifactFormatVersion
from rebdhuhn.models.ebd_table import EbdPruefidentifikator
from sqlmodel import Session, create_engine, text

_logger = logging.getLogger(__name__)


def download_ahb_db(github_token: str, target_dir: Path | None = None) -> Path:
    """
    Download the unencrypted AHB SQLite database from the latest xml-migs-and-ahbs release.

    Args:
        github_token: GitHub token with read access to Hochfrequenz/xml-migs-and-ahbs releases.
        target_dir: Directory to download and extract into. If None, a fresh temporary
            directory is created.

    Returns:
        Path to the extracted ``.db`` file.

    Raises:
        FileNotFoundError: If the latest release contains no unencrypted ``.db.7z`` asset,
            or no ``.db`` file is present after extraction.
        requests.HTTPError: If a GitHub API request fails.
    """
    headers = {
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
    }
    # Get the latest release metadata
    release_url = "https://api.github.com/repos/Hochfrequenz/xml-migs-and-ahbs/releases/latest"
    response = requests.get(release_url, headers=headers, timeout=30)
    response.raise_for_status()
    release_data = response.json()

    # Find the unencrypted .db.7z asset (not .encrypted.7z)
    db_asset = next(
        (
            asset
            for asset in release_data["assets"]
            if asset["name"].endswith(".db.7z") and ".encrypted." not in asset["name"]
        ),
        None,
    )
    if db_asset is None:
        raise FileNotFoundError("No unencrypted .db.7z asset found in the latest release")

    _logger.info("Downloading AHB database: %s", db_asset["name"])

    if target_dir is None:
        target_dir = Path(tempfile.mkdtemp(prefix="ahb_db_"))
    target_dir.mkdir(parents=True, exist_ok=True)

    archive_path = target_dir / db_asset["name"]
    # Stream the asset straight to disk instead of buffering the whole archive in memory;
    # release archives can be large, and requests only downloads lazily with stream=True.
    with requests.get(
        db_asset["url"],
        headers={**headers, "Accept": "application/octet-stream"},
        timeout=300,
        stream=True,
    ) as download_response:
        download_response.raise_for_status()
        with open(archive_path, "wb") as archive_file:
            for chunk in download_response.iter_content(chunk_size=1024 * 1024):
                archive_file.write(chunk)

    # Extract the .7z archive, then remove the archive itself
    with py7zr.SevenZipFile(archive_path, mode="r") as archive:
        archive.extractall(path=target_dir)
    archive_path.unlink()

    # Find the extracted .db file
    db_files = list(target_dir.glob("*.db"))
    if not db_files:
        raise FileNotFoundError(f"No .db file found after extracting {archive_path}")

    _logger.info("AHB database extracted to: %s", db_files[0])
    return db_files[0]


def get_ebd_to_pruefis_mapping(
    db_path: Path, format_version: str | None = None
) -> dict[str, list[EbdPruefidentifikator]]:
    """
    Build a mapping from EBD code (e.g. 'E_0003') to EbdPruefidentifikator objects
    by querying the v_ahbtabellen view of the AHB SQLite database.

    Filtering (GLOB on the qualifier pattern) and aggregation (GROUP BY +
    JSON_GROUP_ARRAY) happen entirely inside SQLite.

    Args:
        db_path: Path to the AHB SQLite database file.
        format_version: Format version filter (e.g. 'FV2610').
            Required to produce correct results — without it, duplicate EBD keys
            across format versions would collide.
    """
    engine = create_engine(f"sqlite:///{db_path}")

    # We use GLOB instead of REGEXP because SQLite does not ship a built-in REGEXP implementation.
    # REGEXP requires a user-defined function registered on the connection, which sqlmodel/sqlalchemy don't do.
    # GLOB supports character classes natively: [0-9] matches a single digit.
    sql = """
        SELECT format_version,
               qualifier AS ebd_key,
               JSON_GROUP_ARRAY(DISTINCT pruefidentifikator) AS pruefidentifikatoren
        FROM v_ahbtabellen
        WHERE qualifier GLOB 'E_[0-9][0-9][0-9][0-9]'
    """
    params: dict[str, str] = {}
    if format_version is not None:
        sql += " AND format_version = :fv"
        params["fv"] = format_version
    sql += " GROUP BY format_version, qualifier ORDER BY ebd_key"

    with Session(bind=engine) as session:
        rows = session.exec(text(sql), params=params).all()  # type: ignore[call-overload]

    mapping: dict[str, list[EbdPruefidentifikator]] = {}
    for fv, ebd_key, pruefis_json in rows:
        pruefi_ids: list[str] = json.loads(pruefis_json)
        entries = [
            EbdPruefidentifikator(format_version=EdifactFormatVersion(fv), pruefidentifikator=pi)
            for pi in pruefi_ids
        ]
        # When several format versions share an EBD key, merge without duplicating
        # pruefidentifikatoren that an earlier row already contributed.
        bucket = mapping.setdefault(ebd_key, [])
        known = {entry.pruefidentifikator for entry in bucket}
        bucket.extend(entry for entry in entries if entry.pruefidentifikator not in known)
        bucket.sort(key=lambda p: p.pruefidentifikator)

    return mapping
39 changes: 35 additions & 4 deletions src/ebd_toolchain/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Literal
from typing import Literal, Optional

import click
from ebdamame import (
Expand All @@ -38,16 +38,18 @@
get_ebd_docx_tables,
)
from ebdamame.docxtableconverter import DocxTableConverter
from pydantic import Field
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from rebdhuhn.graph_conversion import convert_table_to_graph
from rebdhuhn.graphviz import convert_dot_to_svg_kroki, convert_graph_to_dot
from rebdhuhn.kroki import DotToSvgConverter, Kroki
from rebdhuhn.models.ebd_graph import EbdGraph
from rebdhuhn.models.ebd_table import EbdTable, EbdTableMetaData
from rebdhuhn.models.ebd_table import EbdPruefidentifikator, EbdTable, EbdTableMetaData
from rebdhuhn.models.errors import GraphConversionError, PlantumlConversionError, SvgConversionError
from rebdhuhn.plantuml import convert_graph_to_plantuml

from ebd_toolchain.ahb_pruefi import download_ahb_db, get_ebd_to_pruefis_mapping

_logger = logging.getLogger(__name__)


Expand All @@ -68,8 +70,22 @@ class Settings(BaseSettings):

kroki_port: int = Field(alias="KROKI_PORT")
kroki_host: str = Field(alias="KROKI_HOST")
github_token: Optional[str] = Field(
default=None,
alias="GITHUB_TOKEN",
description="GitHub token to download AHB DB from Hochfrequenz/xml-migs-and-ahbs releases",
)
format_version: Optional[str] = Field(default=None, alias="FORMAT_VERSION")
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

@field_validator("github_token", "format_version", mode="before")
@classmethod
def empty_str_to_none(cls, v: object) -> object:
    """Map empty strings coming from .env files to None."""
    return None if v == "" else v


def _dump_puml(puml_path: Path, ebd_graph: EbdGraph) -> None:
plantuml_code = convert_graph_to_plantuml(ebd_graph)
Expand All @@ -87,7 +103,12 @@ def _dump_dot(dot_path: Path, ebd_graph: EbdGraph) -> None:

def _dump_svg(svg_path: Path, ebd_graph: EbdGraph, converter: DotToSvgConverter) -> None:
    """Render the EBD graph as SVG via Kroki and write the result to svg_path."""
    dot_code = convert_graph_to_dot(ebd_graph, ebd_link_template="?ebd={ebd_code}")
    metadata = ebd_graph.metadata
    svg_code = convert_dot_to_svg_kroki(
        dot_code,
        converter,
        release_info=metadata.release_information,
        pruefidentifikatoren=metadata.pruefidentifikatoren,
    )
    with open(svg_path, "w+", encoding="utf-8") as svg_file:
        svg_file.write(svg_code)

Expand Down Expand Up @@ -133,6 +154,15 @@ def _main(input_path: Path, output_path: Path, export_types: list[Literal["puml"
settings = Settings() # type: ignore[call-arg]
# read settings from environment variable/.env file
kroki_client = Kroki(kroki_host=f"http://{settings.kroki_host}:{settings.kroki_port}")

# Load EBD-to-Pruefidentifikator mapping from AHB database (if available)
ebd_to_pruefis: dict[str, list[EbdPruefidentifikator]] = {}
if settings.github_token:
click.secho("Downloading AHB database from xml-migs-and-ahbs...", fg="cyan")
db_path = download_ahb_db(settings.github_token)
click.secho(f"Loading EBD-to-Prüfi mapping from {db_path} (format_version={settings.format_version})")
ebd_to_pruefis = get_ebd_to_pruefis_mapping(db_path, format_version=settings.format_version)
click.secho(f"Loaded Prüfi mapping for {len(ebd_to_pruefis)} EBDs", fg="green")
if output_path.exists() and output_path.is_dir():
click.secho(f"The output directory '{output_path}' exists already. Will remove its content.", fg="yellow")
for item in output_path.iterdir():
Expand Down Expand Up @@ -188,6 +218,7 @@ def handle_known_error(error: Exception, ebd_key: str, category: ErrorCategory)
section=f"{ebd_kapitel.chapter}.{ebd_kapitel.section}.{ebd_kapitel.subsection}: {ebd_kapitel.section_title}",
)
ebd_table = converter.convert_docx_tables_to_ebd_table()
ebd_table.metadata.pruefidentifikatoren = ebd_to_pruefis.get(ebd_key)
except Exception as scraping_error: # pylint:disable=broad-except
handle_known_error(scraping_error, ebd_key, ErrorCategory.SCRAPING)
continue
Expand Down