aws-docs-history/aws_docs.py at main · crowecawcaw/aws-docs-history · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Shared AWS documentation utilities used by the crawler and discovery tools."""

from __future__ import annotations

from dataclasses import dataclass
import posixpath
from urllib.parse import urlparse, urlunparse


# Root URL (scheme + host) of the AWS documentation site.
DOCS_BASE_URL = "https://docs.aws.amazon.com"
# Default locale identifier.
# NOTE(review): not referenced in the visible part of this module — confirm how callers use it.
DEFAULT_LOCALE = "en_us"
# Network-location component parsed out of DOCS_BASE_URL ("docs.aws.amazon.com").
DOCS_NETLOC = urlparse(DOCS_BASE_URL).netloc


@dataclass(frozen=True)
class ServiceGuide:
    """Represent a single documentation guide for a service.

    The dataclass is frozen, so instances are immutable and hashable.
    """

    # Display name of the guide (presumably human-readable — confirm against callers).
    title: str
    # URL of the guide; ``ServiceScope.start_urls`` collects this field.
    url: str
    # Prefix associated with the guide; ``ServiceScope.allowed_prefixes`` collects
    # and de-duplicates this field. Presumably a URL path prefix such as the value
    # returned by ``derive_allowed_prefix`` — confirm against callers.
    allowed_prefix: str


@dataclass(frozen=True)
class ServiceScope:
    """Default crawl scope of one AWS service, expressed as a tuple of guides."""

    guides: tuple[ServiceGuide, ...]

    @property
    def start_urls(self) -> tuple[str, ...]:
        """Return the ``url`` of every guide, in the order the guides are stored."""
        urls = [entry.url for entry in self.guides]
        return tuple(urls)

    @property
    def allowed_prefixes(self) -> tuple[str, ...]:
        """Return the distinct ``allowed_prefix`` values of the guides, sorted."""
        unique_prefixes = set()
        for entry in self.guides:
            unique_prefixes.add(entry.allowed_prefix)
        return tuple(sorted(unique_prefixes))


def normalise_url(url: str) -> str:
    """Normalise a URL so that equivalent spellings compare equal.

    The fragment and the query string are dropped, and the path is cleaned of
    redundant segments (``.``, ``..`` and repeated slashes). An empty path
    becomes ``/``, and a trailing slash on the original path is preserved.

    Args:
        url: The URL to normalise.

    Returns:
        The normalised URL, keeping the original scheme, netloc and params.
    """

    parsed = urlparse(url)
    original_path = parsed.path or "/"
    cleaned_path = posixpath.normpath(original_path)

    # ``posixpath.normpath`` keeps exactly two leading slashes because POSIX
    # gives ``//`` an implementation-defined meaning. URL paths have no such
    # special case, so collapse them or ``//a/b`` and ``/a/b`` would differ.
    if cleaned_path.startswith("//"):
        cleaned_path = "/" + cleaned_path.lstrip("/")

    # ``normpath`` strips the trailing slash; restore it, since a directory
    # URL and a file URL are not interchangeable.
    if original_path.endswith("/") and not cleaned_path.endswith("/"):
        cleaned_path += "/"

    cleaned = parsed._replace(path=cleaned_path, fragment="", query="")
    return urlunparse(cleaned)


def derive_allowed_prefix(url: str) -> str:
    """Return the folder portion of ``url``'s path to use as a crawl prefix.

    The result always starts and ends with ``/``. A path that already ends in
    ``/`` is returned as-is (rooted at a single leading slash); otherwise the
    last path segment is dropped. An empty path yields ``/``.
    """

    raw_path = urlparse(url).path
    if not raw_path:
        return "/"

    # Force exactly one leading slash regardless of how the path was written.
    rooted = "/" + raw_path.lstrip("/")
    if rooted.endswith("/"):
        return rooted

    parent = posixpath.dirname(rooted)
    if parent in ("", "/"):
        return "/"
    return parent + "/"