Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
EBD_DOCX_FILE=./edi_energy_mirror/edi_energy_de/FV2410/Entscheidungsbaum-DiagrammeundCodelisten-informatorischeLesefassung3.5KonsolidierteLesefassungmitFehlerkorrekturenStand31.07.2024_20250403_20240403.docx
OUTPUT_DIR=./ebd_toolchain/machine-readable_entscheidungsbaumdiagramme/FV2404
KROKI_PORT=8125
KROKI_HOST=kroki
KROKI_HOST=kroki
# Optional: GitHub token to download AHB DB from Hochfrequenz/xml-migs-and-ahbs releases for Prüfidentifikator mapping
GITHUB_TOKEN=
# Optional: Format version filter for Pruefi mapping, e.g. FV2504, FV2610
FORMAT_VERSION=
8 changes: 5 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,11 @@ classifiers = [
]
dependencies = [
"ebdamame>=1.0.0",
"rebdhuhn>=1.0.0",
"rebdhuhn>=1.2.0",
"click",
"pydantic-settings"
"pydantic-settings",
"fundamend[sqlmodels]>=0.34.0",
"py7zr",
# add all the dependencies here
]
dynamic = ["readme", "version"]
Expand All @@ -34,7 +36,7 @@ linting = ["pylint==4.0.5"]
spellcheck = ["codespell==2.4.2"]
test_packaging = ["build==1.4.2", "twine==6.2.0"]
tests = ["pytest==9.0.2"]
type_check = ["mypy==1.19.1"]
type_check = ["mypy==1.19.1", "types-requests"]

[project.urls]
Changelog = "https://github.com/Hochfrequenz/ebd_toolchain/releases"
Expand Down
43 changes: 38 additions & 5 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,59 +2,92 @@
# This file is autogenerated by pip-compile with Python 3.12
# by the following command:
#
# pip-compile --cert=None --client-cert=None --index-url=None --pip-args=None pyproject.toml
# pip-compile --output-file=requirements.txt pyproject.toml
#
annotated-types==0.7.0
# via pydantic
brotli==1.2.0
# via py7zr
certifi==2026.2.25
# via requests
charset-normalizer==3.4.6
charset-normalizer==3.4.7
# via requests
click==8.3.1
# via
# ebd-toolchain (pyproject.toml)
# ebdamame
colorama==0.4.6
# via click
ebdamame==1.0.0
# via ebd-toolchain (pyproject.toml)
efoli==2.2.0
# via
# fundamend
# rebdhuhn
fundamend[sqlmodels]==0.34.1
# via ebd-toolchain (pyproject.toml)
idna==3.11
# via requests
inflate64==1.0.4
# via py7zr
lxml==6.0.2
# via
# python-docx
# rebdhuhn
# svgutils
more-itertools==10.8.0
# via ebdamame
multivolumefile==0.2.3
# via py7zr
networkx==3.6.1
# via rebdhuhn
psutil==7.2.2
# via py7zr
py7zr==1.1.0
# via ebd-toolchain (pyproject.toml)
pybcj==1.0.7
# via py7zr
pycryptodomex==3.23.0
# via py7zr
pydantic==2.12.5
# via
# ebdamame
# fundamend
# pydantic-settings
# rebdhuhn
# sqlmodel
pydantic-core==2.41.5
# via pydantic
pydantic-settings==2.13.1
# via ebd-toolchain (pyproject.toml)
pyppmd==1.3.1
# via py7zr
python-docx==1.2.0
# via ebdamame
python-dotenv==1.2.2
# via pydantic-settings
rebdhuhn==1.1.0
pytz==2026.1.post1
# via efoli
rebdhuhn==1.2.0
# via
# ebd-toolchain (pyproject.toml)
# ebdamame
requests==2.33.1
# via rebdhuhn
sqlalchemy==2.0.48
# via
# fundamend
# sqlmodel
sqlmodel==0.0.37
# via fundamend
svgutils==0.3.4
# via rebdhuhn
texttable==1.7.0
# via py7zr
typing-extensions==4.15.0
# via
# pydantic
# pydantic-core
# python-docx
# sqlalchemy
# typing-inspection
typing-inspection==0.4.2
# via
Expand Down
127 changes: 127 additions & 0 deletions src/ebd_toolchain/ahb_pruefi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""
This module downloads the AHB SQLite database from the xml-migs-and-ahbs GitHub release
and queries the EBD-to-Pruefidentifikator mapping from the v_ahbtabellen view.
"""

import json
import logging
import tempfile
from pathlib import Path

import py7zr
import requests
from efoli import EdifactFormatVersion
from rebdhuhn.models.ebd_table import EbdPruefidentifikator
from sqlmodel import Session, create_engine, text

_logger = logging.getLogger(__name__)


def download_ahb_db(github_token: str, target_dir: Path | None = None) -> Path:
    """
    Download the unencrypted AHB SQLite database from the latest xml-migs-and-ahbs release.

    Args:
        github_token: GitHub token with read access to Hochfrequenz/xml-migs-and-ahbs releases.
        target_dir: Directory to download and extract into. If None, a fresh temporary
            directory is created.

    Returns:
        Path to the extracted ``.db`` file.

    Raises:
        FileNotFoundError: If the latest release contains no unencrypted ``.db.7z`` asset,
            or no ``.db`` file is present after extraction.
        requests.HTTPError: If a GitHub API request fails.
    """
    headers = {
        "Authorization": f"Bearer {github_token}",
        "Accept": "application/vnd.github+json",
    }
    # Get the latest release metadata
    release_url = "https://api.github.com/repos/Hochfrequenz/xml-migs-and-ahbs/releases/latest"
    response = requests.get(release_url, headers=headers, timeout=30)
    response.raise_for_status()
    release_data = response.json()

    # Find the unencrypted .db.7z asset (not .encrypted.7z)
    db_asset = next(
        (
            asset
            for asset in release_data["assets"]
            if asset["name"].endswith(".db.7z") and ".encrypted." not in asset["name"]
        ),
        None,
    )
    if db_asset is None:
        raise FileNotFoundError("No unencrypted .db.7z asset found in the latest release")

    _logger.info("Downloading AHB database: %s", db_asset["name"])

    if target_dir is None:
        target_dir = Path(tempfile.mkdtemp(prefix="ahb_db_"))
    target_dir.mkdir(parents=True, exist_ok=True)

    archive_path = target_dir / db_asset["name"]
    # Stream the asset straight to disk instead of buffering the whole archive in memory;
    # release archives can be large, and requests only downloads lazily with stream=True.
    with requests.get(
        db_asset["url"],
        headers={**headers, "Accept": "application/octet-stream"},
        timeout=300,
        stream=True,
    ) as download_response:
        download_response.raise_for_status()
        with open(archive_path, "wb") as archive_file:
            for chunk in download_response.iter_content(chunk_size=1024 * 1024):
                archive_file.write(chunk)

    # Extract the .7z archive, then remove the archive itself
    with py7zr.SevenZipFile(archive_path, mode="r") as archive:
        archive.extractall(path=target_dir)
    archive_path.unlink()

    # Find the extracted .db file
    db_files = list(target_dir.glob("*.db"))
    if not db_files:
        raise FileNotFoundError(f"No .db file found after extracting {archive_path}")

    _logger.info("AHB database extracted to: %s", db_files[0])
    return db_files[0]


def get_ebd_to_pruefis_mapping(
    db_path: Path, format_version: str | None = None
) -> dict[str, list[EbdPruefidentifikator]]:
    """
    Build a mapping from EBD code (e.g. 'E_0003') to EbdPruefidentifikator objects
    by querying the v_ahbtabellen view of the AHB SQLite database.

    Filtering (GLOB on the qualifier pattern) and aggregation (GROUP BY +
    JSON_GROUP_ARRAY) happen entirely inside SQLite.

    Args:
        db_path: Path to the AHB SQLite database file.
        format_version: Format version filter (e.g. 'FV2610').
            Required to produce correct results — without it, duplicate EBD keys
            across format versions would collide.
    """
    engine = create_engine(f"sqlite:///{db_path}")

    # We use GLOB instead of REGEXP because SQLite does not ship a built-in REGEXP implementation.
    # REGEXP requires a user-defined function registered on the connection, which sqlmodel/sqlalchemy don't do.
    # GLOB supports character classes natively: [0-9] matches a single digit.
    sql = """
        SELECT format_version,
               qualifier AS ebd_key,
               JSON_GROUP_ARRAY(DISTINCT pruefidentifikator) AS pruefidentifikatoren
        FROM v_ahbtabellen
        WHERE qualifier GLOB 'E_[0-9][0-9][0-9][0-9]'
    """
    params: dict[str, str] = {}
    if format_version is not None:
        sql += " AND format_version = :fv"
        params["fv"] = format_version
    sql += " GROUP BY format_version, qualifier ORDER BY ebd_key"

    with Session(bind=engine) as session:
        rows = session.exec(text(sql), params=params).all()  # type: ignore[call-overload]

    mapping: dict[str, list[EbdPruefidentifikator]] = {}
    for fv, ebd_key, pruefis_json in rows:
        pruefi_ids: list[str] = json.loads(pruefis_json)
        entries = [
            EbdPruefidentifikator(format_version=EdifactFormatVersion(fv), pruefidentifikator=pi)
            for pi in pruefi_ids
        ]
        # When several format versions share an EBD key, merge without duplicating
        # pruefidentifikatoren that an earlier row already contributed.
        bucket = mapping.setdefault(ebd_key, [])
        known = {entry.pruefidentifikator for entry in bucket}
        bucket.extend(entry for entry in entries if entry.pruefidentifikator not in known)
        bucket.sort(key=lambda p: p.pruefidentifikator)

    return mapping
39 changes: 35 additions & 4 deletions src/ebd_toolchain/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Literal
from typing import Literal, Optional

import click
from ebdamame import (
Expand All @@ -38,16 +38,18 @@
get_ebd_docx_tables,
)
from ebdamame.docxtableconverter import DocxTableConverter
from pydantic import Field
from pydantic import Field, field_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
from rebdhuhn.graph_conversion import convert_table_to_graph
from rebdhuhn.graphviz import convert_dot_to_svg_kroki, convert_graph_to_dot
from rebdhuhn.kroki import DotToSvgConverter, Kroki
from rebdhuhn.models.ebd_graph import EbdGraph
from rebdhuhn.models.ebd_table import EbdTable, EbdTableMetaData
from rebdhuhn.models.ebd_table import EbdPruefidentifikator, EbdTable, EbdTableMetaData
from rebdhuhn.models.errors import GraphConversionError, PlantumlConversionError, SvgConversionError
from rebdhuhn.plantuml import convert_graph_to_plantuml

from ebd_toolchain.ahb_pruefi import download_ahb_db, get_ebd_to_pruefis_mapping

_logger = logging.getLogger(__name__)


Expand All @@ -68,8 +70,22 @@ class Settings(BaseSettings):

kroki_port: int = Field(alias="KROKI_PORT")
kroki_host: str = Field(alias="KROKI_HOST")
github_token: Optional[str] = Field(
default=None,
alias="GITHUB_TOKEN",
description="GitHub token to download AHB DB from Hochfrequenz/xml-migs-and-ahbs releases",
)
format_version: Optional[str] = Field(default=None, alias="FORMAT_VERSION")
model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")

@field_validator("github_token", "format_version", mode="before")
@classmethod
def empty_str_to_none(cls, v: object) -> object:
    """Map empty strings coming from .env files to None."""
    return None if v == "" else v


def _dump_puml(puml_path: Path, ebd_graph: EbdGraph) -> None:
plantuml_code = convert_graph_to_plantuml(ebd_graph)
Expand All @@ -87,7 +103,12 @@ def _dump_dot(dot_path: Path, ebd_graph: EbdGraph) -> None:

def _dump_svg(svg_path: Path, ebd_graph: EbdGraph, converter: DotToSvgConverter) -> None:
    """Render the EBD graph as SVG via Kroki and write the result to svg_path."""
    dot_code = convert_graph_to_dot(ebd_graph, ebd_link_template="?ebd={ebd_code}")
    metadata = ebd_graph.metadata
    svg_code = convert_dot_to_svg_kroki(
        dot_code,
        converter,
        release_info=metadata.release_information,
        pruefidentifikatoren=metadata.pruefidentifikatoren,
    )
    with open(svg_path, "w+", encoding="utf-8") as svg_file:
        svg_file.write(svg_code)

Expand Down Expand Up @@ -133,6 +154,15 @@ def _main(input_path: Path, output_path: Path, export_types: list[Literal["puml"
settings = Settings() # type: ignore[call-arg]
# read settings from environment variable/.env file
kroki_client = Kroki(kroki_host=f"http://{settings.kroki_host}:{settings.kroki_port}")

# Load EBD-to-Pruefidentifikator mapping from AHB database (if available)
ebd_to_pruefis: dict[str, list[EbdPruefidentifikator]] = {}
if settings.github_token:
click.secho("Downloading AHB database from xml-migs-and-ahbs...", fg="cyan")
db_path = download_ahb_db(settings.github_token)
click.secho(f"Loading EBD-to-Prüfi mapping from {db_path} (format_version={settings.format_version})")
ebd_to_pruefis = get_ebd_to_pruefis_mapping(db_path, format_version=settings.format_version)
click.secho(f"Loaded Prüfi mapping for {len(ebd_to_pruefis)} EBDs", fg="green")
if output_path.exists() and output_path.is_dir():
click.secho(f"The output directory '{output_path}' exists already. Will remove its content.", fg="yellow")
for item in output_path.iterdir():
Expand Down Expand Up @@ -188,6 +218,7 @@ def handle_known_error(error: Exception, ebd_key: str, category: ErrorCategory)
section=f"{ebd_kapitel.chapter}.{ebd_kapitel.section}.{ebd_kapitel.subsection}: {ebd_kapitel.section_title}",
)
ebd_table = converter.convert_docx_tables_to_ebd_table()
ebd_table.metadata.pruefidentifikatoren = ebd_to_pruefis.get(ebd_key)
except Exception as scraping_error: # pylint:disable=broad-except
handle_known_error(scraping_error, ebd_key, ErrorCategory.SCRAPING)
continue
Expand Down