-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathaws_docs.py
More file actions
63 lines (41 loc) · 1.73 KB
/
aws_docs.py
File metadata and controls
63 lines (41 loc) · 1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
"""Shared AWS documentation utilities used by the crawler and discovery tools."""
from __future__ import annotations
from dataclasses import dataclass
import posixpath
from urllib.parse import urlparse, urlunparse
# Root URL of the AWS documentation site (scheme + host, no trailing slash).
DOCS_BASE_URL = "https://docs.aws.amazon.com"
# Default locale identifier. NOTE(review): how callers apply it is not visible
# in this module -- confirm against the crawler/discovery tools.
DEFAULT_LOCALE = "en_us"
# Host component of DOCS_BASE_URL, parsed once at import time.
DOCS_NETLOC = urlparse(DOCS_BASE_URL).netloc
@dataclass(frozen=True)
class ServiceGuide:
    """One documentation guide belonging to a service.

    Instances are immutable (frozen dataclass), so they compare by value
    and are hashable.
    """

    # Human-readable name of the guide.
    title: str
    # Address of the guide.
    url: str
    # Prefix string associated with this guide.
    allowed_prefix: str
@dataclass(frozen=True)
class ServiceScope:
    """Default crawl scope of a single AWS service, given as its guides."""

    # Guides that make up the scope; order is preserved by ``start_urls``.
    guides: tuple[ServiceGuide, ...]

    @property
    def start_urls(self) -> tuple[str, ...]:
        """Return the URL of every guide, in the same order as ``guides``."""
        urls = [entry.url for entry in self.guides]
        return tuple(urls)

    @property
    def allowed_prefixes(self) -> tuple[str, ...]:
        """Return the distinct guide prefixes, sorted ascending."""
        # A set removes duplicates when several guides share a prefix.
        distinct = {entry.allowed_prefix for entry in self.guides}
        ordered = sorted(distinct)
        return tuple(ordered)
def normalise_url(url: str) -> str:
    """Return ``url`` in a canonical form suitable for comparison.

    The fragment and the query string are removed, and the path is cleaned
    of ``.``/``..`` segments and repeated slashes.  An empty path becomes
    ``/``.  Scheme, host and path parameters are passed through as parsed.

    A path that denotes a directory -- it ends in ``/``, ``/.`` or ``/..``
    -- keeps a trailing slash in the result.
    """
    parsed = urlparse(url)
    raw_path = parsed.path or "/"
    cleaned_path = posixpath.normpath(raw_path)
    # normpath deliberately preserves exactly two leading slashes (a POSIX
    # rule); that distinction is meaningless for URLs, so collapse them.
    if cleaned_path.startswith("//"):
        cleaned_path = "/" + cleaned_path.lstrip("/")
    # normpath drops the trailing slash; put it back for directory paths.
    # A final "." or ".." segment also resolves to a directory.
    if raw_path.endswith(("/", "/.", "/..")) and not cleaned_path.endswith("/"):
        cleaned_path += "/"
    cleaned = parsed._replace(path=cleaned_path, fragment="", query="")
    return urlunparse(cleaned)
def derive_allowed_prefix(url: str) -> str:
    """Return the folder portion of the path of ``url`` as a crawl prefix.

    The result always starts and ends with ``/``.  A path that already ends
    in ``/`` is taken to be the folder itself; otherwise the last path
    segment is dropped.  A URL with no path yields ``/``.
    """
    # Force exactly one leading slash; an empty path collapses to "/".
    rooted = "/" + urlparse(url).path.lstrip("/")
    if rooted.endswith("/"):
        return rooted
    parent = posixpath.dirname(rooted)
    if parent == "/":
        return parent
    # dirname never leaves a trailing slash on a non-root folder.
    return parent + "/"