Integration Paperless NGX to TruSpace #254

jzakotnik · 2025-11-10T10:33:01Z

jzakotnik
Nov 10, 2025
Maintainer

I played around a bit with Paperless-ngx, which seems to be a very nice and popular document management system (easy to install on a Raspberry Pi).

Purely as a proof of concept, I vibe-coded a small Python script that polls a TruSpace workspace and inserts new files into a Paperless folder. Works like a charm on a Raspberry Pi 💯

#!/usr/bin/env python3
"""
paperless_sync_truspace.py

TruSpace → Paperless-ngx bridge

- Lists documents from TruSpace:  /api/documents?workspace=...
- Detects new/updated by (docId, meta.version)
- Downloads file bytes via:       /api/documents/version/{cid}   (preferred on your instance)
  (falls back to several docId routes and then IPFS gateway if provided)
- Uploads to Paperless-ngx via:   /api/documents/post_document/
- Tracks progress in a local state file (no duplicate uploads)

ENV or flags:
  # TruSpace
  TRUSPACE_URL (default http://smartspace.local:8000)
  TRUSPACE_WORKSPACE_ID  (required if not passed via --workspace-id)
  TRUSPACE_CSRFTOKEN
  TRUSPACE_SESSIONID
  TRUSPACE_AUTH_TOKEN
  TRUSPACE_LOGIN_JSON
  TRUSPACE_IPFS_GATEWAY (optional, default http://smartspace.local:8080)

  # Paperless
  PAPERLESS_URL (required)
  PAPERLESS_TOKEN (required)
  PAPERLESS_STORAGE_PATH_ID (optional)

  # General
  PAPERLESS_INTERVAL (seconds, default 600)
  STATE_FILE (default .truspace_paperless_state.json)
  LOG_LEVEL (default INFO)
"""

import argparse
import json
import logging
import os
from pathlib import Path
import signal
import sys
import time
from typing import Dict, List, Optional

import requests

# Default state-file name; used as the fallback for --state-file / STATE_FILE.
STATE_FILE_NAME = ".truspace_paperless_state.json"
# Shutdown flag: set to True by the signal handler and checked by the
# polling, transfer and main loops so they can stop early.
STOP = False

# -------------------- signal handling --------------------
def _sigterm_handler(signum, frame):
    """Signal callback that requests a graceful shutdown.

    Raises the module-level ``STOP`` flag and logs the number of the signal
    that was received. ``frame`` is accepted because the signal API passes
    it, but it is not used.
    """
    global STOP
    STOP = True
    logging.info("Received signal %s; will exit after this iteration.", signum)

# Install the shutdown handler for SIGINT and SIGTERM when the module is loaded.
for sig in (signal.SIGINT, signal.SIGTERM):
    try:
        signal.signal(sig, _sigterm_handler)
    except Exception:
        # Best-effort: if registration fails the error is ignored and the
        # script simply runs without this handler for that signal.
        pass

# -------------------- state helpers --------------------
def load_state(state_path: Path) -> Dict[str, Dict]:
    """Load the sync state mapping from ``state_path``.

    Args:
        state_path: Location of the JSON state file.

    Returns:
        The decoded JSON object, or an empty dict when the file is missing,
        unreadable, not valid JSON, or valid JSON that is not an object.
        Callers look documents up with ``state.get(...)``, so anything other
        than a dict would break them later.
    """
    if not state_path.exists():
        return {}
    try:
        loaded = json.loads(state_path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both JSON decode errors and invalid UTF-8.
        logging.warning("State file corrupted; starting fresh.")
        return {}
    if not isinstance(loaded, dict):
        logging.warning("State file does not contain a JSON object; starting fresh.")
        return {}
    return loaded

def save_state(state_path: Path, state: Dict[str, Dict]) -> None:
    """Persist ``state`` as pretty-printed, key-sorted JSON.

    The JSON is written to a sibling file whose suffix is replaced by
    ``.tmp`` and that file is then moved over ``state_path``.
    """
    scratch = state_path.with_suffix(".tmp")
    serialized = json.dumps(state, indent=2, sort_keys=True)
    scratch.write_text(serialized, encoding="utf-8")
    scratch.replace(state_path)

# -------------------- paperless API --------------------
def upload_bytes_to_paperless(
    data: bytes,
    filename: str,
    base_url: str,
    token: str,
    storage_path_id: Optional[int] = None,
    timeout: int = 180,
    api_version: int = 9,
) -> Optional[str]:
    """
    POST one in-memory file to paperless-ngx's post_document endpoint.

    Returns the task identifier as reported by the server, or None when the
    request failed, the server answered with HTTP >= 400, or no identifier
    could be extracted. The identifier is looked for, in order, in:
      - a JSON object under "task_id", "id" or "uuid"
      - a JSON string
      - the raw response text (surrounding double quotes stripped)
    An empty body is logged and yields None.
    """
    endpoint = base_url.rstrip("/") + "/api/documents/post_document/"
    request_headers = {
        "Authorization": f"Token {token}",
        "Accept": f"application/json; version={api_version}",
    }
    # The storage path is the only optional form field that gets sent.
    form_fields = {} if storage_path_id is None else {"storage_path": str(storage_path_id)}
    attachment = {"document": (filename, data)}

    try:
        response = requests.post(
            endpoint,
            headers=request_headers,
            data=form_fields,
            files=attachment,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        logging.error("Paperless HTTP error uploading %s: %s", filename, exc)
        return None

    if response.status_code >= 400:
        logging.error("Paperless upload failed for %s: %s %s", filename, response.status_code, response.text[:500])
        return None

    # First choice: a structured JSON answer (object or bare string).
    try:
        parsed = response.json()
    except ValueError:
        parsed = None

    task_id: Optional[str] = None
    if isinstance(parsed, dict):
        task_id = parsed.get("task_id") or parsed.get("id") or parsed.get("uuid")
    elif isinstance(parsed, str):
        task_id = parsed.strip().strip('"')
    if task_id:
        return task_id

    # Second choice: whatever text the server sent back.
    raw_body = (response.text or "").strip()
    if not raw_body:
        logging.info("Paperless returned empty body for %s (HTTP %s).", filename, response.status_code)
        return None
    return raw_body.strip('"') or None

def _document_id_from_task(task: Dict) -> Optional[int]:
    """Pull a document id out of one task entry, or return None if there is none."""
    candidates = []
    result = task.get("result") or task.get("task_result")
    if isinstance(result, dict):
        candidates.extend((result.get("document_id"), result.get("id")))
    # Some task payloads carry the id outside of "result", possibly as a string.
    candidates.append(task.get("related_document"))
    for value in candidates:
        if isinstance(value, bool):
            continue
        if isinstance(value, int):
            return value
        if isinstance(value, str) and value.strip().isdigit():
            return int(value.strip())
    return None


def poll_paperless_task_for_document_id(
    base_url: str, token: str, task_id: str, timeout: int = 120, api_version: int = 9
) -> Optional[int]:
    """
    Poll /api/tasks/?task_id=<uuid> for up to `timeout` seconds.

    Accepted response shapes:
      - a JSON list of task entries, or a JSON object with a "results" list
      - the task state under "status" or "state"
      - the document id inside a "result"/"task_result" object
        ("document_id" or "id"), or under "related_document"

    Args:
        base_url: Paperless base URL.
        token: Paperless API token.
        task_id: Task identifier returned by the upload.
        timeout: Maximum number of seconds to keep polling.
        api_version: API version sent in the Accept header.

    Returns:
        The document id (int) once the task reports SUCCESS and an id is
        present; None on SUCCESS without an id, on FAILURE/REVOKED, on
        timeout, or when the STOP flag is raised.
    """
    url = base_url.rstrip("/") + "/api/tasks/"
    headers = {
        "Authorization": f"Token {token}",
        "Accept": f"application/json; version={api_version}",
    }
    deadline = time.time() + timeout
    last_state = None
    while time.time() < deadline and not STOP:
        try:
            r = requests.get(url, headers=headers, params={"task_id": task_id}, timeout=15)
        except requests.RequestException:
            # Transient network trouble: back off a little longer, then retry.
            time.sleep(3)
            continue

        task = None
        if r.status_code < 400:
            try:
                js = r.json()
            except ValueError:
                # Non-JSON body: treat like "no task info yet" and poll again.
                js = None
            entries = js.get("results") if isinstance(js, dict) else js
            if isinstance(entries, list) and entries and isinstance(entries[0], dict):
                task = entries[0]

        if task is not None:
            state = task.get("status") or task.get("state")
            last_state = state
            if state == "SUCCESS":
                return _document_id_from_task(task)
            if state in {"FAILURE", "REVOKED"}:
                logging.error("Paperless task %s ended in %s: %s", task_id, state, task)
                return None
        time.sleep(2)

    if last_state:
        logging.info("Paperless task %s last seen state: %s", task_id, last_state)
    return None

# -------------------- TruSpace API --------------------
def truspace_headers(base_url: str, csrftoken: Optional[str], jwt: Optional[str]) -> Dict[str, str]:
    """Build the HTTP headers sent with TruSpace API requests.

    Always includes Accept, X-Requested-With and a Referer derived from
    ``base_url``. The CSRF token (under both header spellings) and a Bearer
    Authorization header are added only when the respective value is truthy.
    """
    referer = base_url.rstrip("/") + "/"
    result = {
        "Accept": "application/json",
        "X-Requested-With": "XMLHttpRequest",
        "Referer": referer,
    }
    if csrftoken:
        result.update({"X-CSRFToken": csrftoken, "X-CSRF-Token": csrftoken})
    if jwt:
        # Redundant when cookies authenticate the session, but does no harm.
        result["Authorization"] = f"Bearer {jwt}"
    return result

def truspace_cookies(csrftoken, sessionid, auth_token, login) -> Dict[str, str]:
    """Assemble the cookie jar for TruSpace requests, skipping falsy values."""
    candidates = (
        ("csrftoken", csrftoken),
        ("sessionid", sessionid),
        ("auth_token", auth_token),
        ("login", login),
    )
    return {name: value for name, value in candidates if value}

def list_truspace_documents(
    base_url: str,
    workspace_id: str,
    cookies: Dict[str, str],
    headers: Dict[str, str],
    start: int = 0,
    limit: int = 50,
    search: str = "",
    timeout: int = 20,
) -> Dict:
    """
    Fetch one page of the document listing from /api/documents.

    Sends `from`, `limit`, `workspace` and `search` as query parameters,
    raises on an HTTP error status and returns the decoded JSON payload
    (expected to look like { count, data: [...] }).
    """
    query = {
        "from": str(start),
        "limit": str(limit),
        "workspace": workspace_id,
        "search": search,
    }
    response = requests.get(
        base_url.rstrip("/") + "/api/documents",
        params=query,
        headers=headers,
        cookies=cookies,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

def iterate_truspace_pages(
    base_url: str, workspace_id: str, cookies: Dict[str, str], headers: Dict[str, str], page_size: int = 50
) -> List[Dict]:
    """
    Fetch all documents for the workspace, page by page.

    The offset advances by the number of items actually returned rather than
    by `page_size`, so nothing is skipped if the server hands back fewer
    items per page than requested. Paging stops on an empty page or once the
    offset reaches the reported `count`; a missing or null `count` is treated
    as "everything collected so far", which ends paging after that page.

    Returns:
        The concatenated `data` entries of every page.
    """
    start = 0
    out: List[Dict] = []
    while True:
        payload = list_truspace_documents(base_url, workspace_id, cookies, headers, start=start, limit=page_size)
        data = payload.get("data") or []
        if not data:
            break
        out.extend(data)
        start += len(data)
        reported = payload.get("count")
        count = int(reported) if reported is not None else len(out)
        if start >= count:
            break
    return out

def discover_new_or_updated(items: List[Dict], state: Dict[str, Dict]) -> List[Dict]:
    """
    Decide which documents to transfer based on docId + meta.version.

    A document is selected when the state has no usable entry for it or the
    recorded version differs from the listed one. Document ids are normalised
    to strings: the state is persisted as JSON, where object keys are always
    strings, so a non-string id would otherwise never match its own entry
    after a restart. Entries that are not dicts, or that have no id, are
    skipped; a non-dict `meta` is treated as empty.

    Returns:
        A list of dicts with the keys "docId", "filename", "version", "cid".
    """
    to_transfer = []
    for it in items:
        if not isinstance(it, dict):
            continue
        raw_id = it.get("docId") or it.get("id")
        if not raw_id:
            continue
        doc_id = str(raw_id)

        meta = it.get("meta")
        if not isinstance(meta, dict):
            meta = {}
        version = str(meta.get("version") or "")
        # Fall back to a synthetic name so the upload always has a filename.
        filename = meta.get("filename") or it.get("filename") or f"{doc_id}.bin"
        cid = it.get("cid") or meta.get("cid")

        prev = state.get(doc_id)
        already_synced = (
            isinstance(prev, dict)
            and bool(prev)
            and str(prev.get("version", "")) == version
        )
        if not already_synced:
            to_transfer.append({"docId": doc_id, "filename": filename, "version": version, "cid": cid})
    return to_transfer

def download_truspace_document_file(
    base_url: str,
    doc_id: str,
    cookies: Dict[str, str],
    headers: Dict[str, str],
    timeout: int = 180,
    ipfs_gateway: Optional[str] = None,
    cid: Optional[str] = None,
) -> bytes:
    """
    Download the bytes of one TruSpace document.

    Candidate URLs are tried in this order and the first HTTP 200 wins:
      1. /api/documents/version/{cid}          (only when a CID is known)
      2. four docId-based routes under /api/documents and /api/document
      3. {ipfs_gateway}/ipfs/{cid}             (only with a gateway and a CID;
                                                sent without TruSpace headers/cookies)

    Raises requests.HTTPError listing every failed attempt when nothing works.
    """
    root = base_url.rstrip("/")
    binary_headers = dict(headers)
    binary_headers["Accept"] = "application/octet-stream"

    failures = []

    # Authenticated TruSpace routes: CID route first, then the docId variants.
    candidates = []
    if cid:
        candidates.append(f"{root}/api/documents/version/{cid}")
    candidates.extend(
        root + suffix
        for suffix in (
            f"/api/documents/{doc_id}/file",
            f"/api/documents/{doc_id}/download",
            f"/api/document/{doc_id}/file",
            f"/api/document/{doc_id}/download",
        )
    )
    for target in candidates:
        try:
            resp = requests.get(target, headers=binary_headers, cookies=cookies, timeout=timeout, stream=True)
            if resp.status_code == 200:
                return resp.content
            failures.append(f"{target} -> {resp.status_code} {resp.text[:120]!r}")
        except requests.RequestException as exc:
            failures.append(f"{target} -> {exc}")

    # Last resort: fetch the content straight from the IPFS gateway.
    if ipfs_gateway and cid:
        gateway_url = f"{ipfs_gateway.rstrip('/')}/ipfs/{cid}"
        try:
            resp = requests.get(gateway_url, timeout=timeout, stream=True)
            resp.raise_for_status()
            return resp.content
        except requests.RequestException as exc:
            failures.append(f"{gateway_url} -> {exc}")

    raise requests.HTTPError("All TruSpace download attempts failed:\n" + "\n".join(failures))

# -------------------- core sync --------------------
def run_once(
    truspace_base: str,
    workspace_id: str,
    cookies: Dict[str, str],
    headers: Dict[str, str],
    paperless_url: str,
    paperless_token: str,
    storage_path_id: Optional[int],
    state: Dict[str, Dict],
    state_path: Path,
) -> None:
    """Run one sync pass: list TruSpace documents and push new/updated ones to Paperless.

    For every document selected by `discover_new_or_updated` the file is
    downloaded, uploaded to Paperless and, once Paperless has handed back a
    task id, recorded in `state` (which is saved to `state_path` after each
    document). A document whose upload yields no task id is NOT recorded, so
    it is picked up again on the next pass instead of being silently marked
    as synced. Per-document errors are logged and do not abort the pass.

    Args:
        truspace_base: TruSpace base URL.
        workspace_id: Workspace whose documents are listed.
        cookies: Cookies sent with TruSpace requests.
        headers: Headers sent with TruSpace requests.
        paperless_url: Paperless base URL.
        paperless_token: Paperless API token.
        storage_path_id: Optional Paperless storage path id for uploads.
        state: In-memory sync state, mutated in place.
        state_path: File the state is persisted to.
    """
    # 1) list TruSpace docs
    all_items = iterate_truspace_pages(truspace_base, workspace_id, cookies, headers, page_size=50)

    # 2) find new/updated
    to_xfer = discover_new_or_updated(all_items, state)
    if not to_xfer:
        logging.info("No new/updated TruSpace documents.")
        return

    logging.info("Found %d new/updated document(s) to transfer.", len(to_xfer))

    # The gateway does not change between documents; read it once.
    ipfs_gateway = os.getenv("TRUSPACE_IPFS_GATEWAY", "http://smartspace.local:8080")

    # 3) transfer each
    for item in to_xfer:
        if STOP:
            break
        doc_id = item["docId"]
        filename = item["filename"]
        version = item["version"]
        cid = item.get("cid")
        try:
            logging.info("Downloading TruSpace document %s (%s) ...", doc_id, filename)
            blob = download_truspace_document_file(
                truspace_base, doc_id, cookies, headers,
                ipfs_gateway=ipfs_gateway,
                cid=cid,
            )
            logging.info("Uploading to Paperless: %s", filename)
            task_id = upload_bytes_to_paperless(
                data=blob,
                filename=filename,
                base_url=paperless_url,
                token=paperless_token,
                storage_path_id=storage_path_id,
            )
            if not task_id:
                # No task id means the upload failed or could not be confirmed.
                # Leave the state untouched so this version is retried later.
                logging.warning(
                    "Paperless did not return a task id for %s; will retry on the next run.",
                    filename,
                )
                continue

            logging.info("Paperless accepted upload (task %s). Polling ...", task_id)
            doc_no = poll_paperless_task_for_document_id(paperless_url, paperless_token, task_id)
            if doc_no is not None:
                logging.info("Paperless ingested #%s from %s", doc_no, filename)
            else:
                logging.info("Paperless ingestion complete (document id not available yet).")

            # Paperless accepted the upload: remember this version to avoid re-uploading it.
            state[doc_id] = {"version": version, "filename": filename, "cid": cid}
            save_state(state_path, state)

        except requests.HTTPError as e:
            logging.error("Download failed for %s: %s", doc_id, e)
        except Exception as e:
            logging.exception("Failed processing %s: %s", doc_id, e)

# -------------------- CLI --------------------
def parse_args() -> argparse.Namespace:
    """Parse command-line flags, using environment variables as defaults.

    Integer options take their environment value as a string default and let
    argparse apply `type=int`, so a malformed value produces a normal usage
    error instead of a traceback while the parser is being built. Empty
    values for PAPERLESS_STORAGE_PATH_ID / PAPERLESS_INTERVAL count as unset.

    Returns:
        The populated argparse namespace.
    """
    p = argparse.ArgumentParser(description="Sync new/updated TruSpace documents to Paperless-ngx.")
    # TruSpace
    p.add_argument("--truspace-url", default=os.getenv("TRUSPACE_URL", "http://smartspace.local:8000"))
    p.add_argument("--workspace-id", default=os.getenv("TRUSPACE_WORKSPACE_ID"))
    p.add_argument("--csrftoken", default=os.getenv("TRUSPACE_CSRFTOKEN"))
    p.add_argument("--sessionid", default=os.getenv("TRUSPACE_SESSIONID"))
    p.add_argument("--auth-token", default=os.getenv("TRUSPACE_AUTH_TOKEN"))
    p.add_argument("--login", default=os.getenv("TRUSPACE_LOGIN_JSON"))
    # Paperless
    p.add_argument("--url", default=os.getenv("PAPERLESS_URL"), help="Paperless base URL")
    p.add_argument("--token", default=os.getenv("PAPERLESS_TOKEN"), help="Paperless API token")
    # String defaults are run through `type` by argparse; None stays None.
    p.add_argument("--storage-path-id", type=int, default=os.getenv("PAPERLESS_STORAGE_PATH_ID") or None)
    # General
    p.add_argument("--interval", type=int, default=os.getenv("PAPERLESS_INTERVAL") or "600")
    p.add_argument("--state-file", default=os.getenv("STATE_FILE", STATE_FILE_NAME))
    p.add_argument("--log-level", default=os.getenv("LOG_LEVEL", "INFO"))
    return p.parse_args()

def main():
    """Entry point: validate configuration, then sync in a loop until stopped.

    Exits with status 2 when the workspace id or the Paperless URL/token is
    missing. Otherwise runs `run_once` repeatedly, logging (not propagating)
    any exception from an iteration, and waits `--interval` seconds between
    iterations in slices of at most 5 seconds so a stop request is noticed
    promptly.
    """
    args = parse_args()
    chosen_level = getattr(logging, args.log_level.upper(), logging.INFO)
    logging.basicConfig(level=chosen_level, format="%(asctime)s %(levelname)s %(message)s")

    # Mandatory settings.
    if not args.workspace_id:
        print("Error: --workspace-id or TRUSPACE_WORKSPACE_ID is required.", file=sys.stderr)
        sys.exit(2)
    if not (args.url and args.token):
        print("Error: Paperless --url and --token are required (or env vars).", file=sys.stderr)
        sys.exit(2)

    headers = truspace_headers(args.truspace_url, args.csrftoken, args.auth_token)
    cookies = truspace_cookies(args.csrftoken, args.sessionid, args.auth_token, args.login)

    state_path = Path(args.state_file).expanduser().resolve()
    state = load_state(state_path)

    logging.info(
        "Starting bridge. TruSpace=%s Workspace=%s Paperless=%s Interval=%ss",
        args.truspace_url, args.workspace_id, args.url, args.interval,
    )

    # Everything run_once needs stays the same from one iteration to the next.
    sync_kwargs = dict(
        truspace_base=args.truspace_url,
        workspace_id=args.workspace_id,
        cookies=cookies,
        headers=headers,
        paperless_url=args.url,
        paperless_token=args.token,
        storage_path_id=args.storage_path_id,
        state=state,
        state_path=state_path,
    )

    while not STOP:
        try:
            run_once(**sync_kwargs)
        except Exception as exc:
            logging.exception("Iteration failed: %s", exc)

        # Sleep in short slices so the STOP flag is re-checked regularly.
        seconds_left = args.interval
        while seconds_left > 0:
            if STOP:
                break
            time.sleep(min(5, seconds_left))
            seconds_left -= 5

    logging.info("Exiting.")

# Start the sync loop only when the file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Integration Paperless NGX to TruSpace #254

Uh oh!

{{title}}

Uh oh!

Replies: 0 comments

Select a reply

Uh oh!

Integration Paperless NGX to TruSpace #254

Uh oh!

jzakotnik Nov 10, 2025 Maintainer

Replies: 0 comments

jzakotnik
Nov 10, 2025
Maintainer