-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb-scrape-example.toml
More file actions
127 lines (96 loc) · 3.06 KB
/
web-scrape-example.toml
File metadata and controls
127 lines (96 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# PDFScalpel Web Extraction Configuration Example
# Use this file with: pdfscalpel extract web --config web-scrape-example.toml
[web_extraction]
# Root URL of the API or web service to pull pages from.
base_url = "https://api.example.com/INFOIWANT"
# Query-string parameter that carries the page number (default: "page").
page_param = "page"
# Which pages to fetch. Accepts ranges and lists:
# "1-10", "1,3,5-7", "1-5,10,15-20"
pages = "1-50"
# Alternative to base_url + page_param: a full URL template with a
# {page} placeholder, for patterns the query-parameter form cannot express.
# url_template = "https://api.example.com/challenge/{page}.png"
# Where the assembled PDF is written.
output_file = "web_extraction.pdf"
# Title stored in the resulting PDF's metadata.
title = "API Content Archive"
# Downloads smaller than this many bytes are treated as junk and skipped.
min_image_size = 1500
# When true, probe the server for available pages ('pages' is ignored).
auto_discover = false
# Upper bound on how many pages auto-discovery will probe.
max_discovery_pages = 200
# Auto-discovery gives up after this many consecutive missing pages.
discovery_gap = 5
# Import cookies from a local browser profile (firefox, chrome, edge, chromium).
# cookies_from_browser = "firefox"
# Only import browser cookies scoped to this domain.
# browser_domain = "example.com"
# Reuse previously downloaded pages instead of fetching them again.
resume_from_cache = true
# Directory that holds the download cache.
# cache_dir = ".web_cache"
[retry]
# How many times a failed page download is attempted again.
max_retries = 5
# Delay before the first retry, in milliseconds.
retry_delay_ms = 3000
# Grow the delay exponentially after each failed attempt.
exponential_backoff = true
# Per-request timeout, in seconds.
timeout_seconds = 30
[rate_limit]
# Minimum pause between consecutive requests, in milliseconds.
base_delay_ms = 2000
# Random amount added to or subtracted from each pause; randomizing the
# cadence helps avoid server-side rate limiting.
jitter_ms = 1000
[cookies]
# Cookies sent with every request, written as name = value pairs.
# Fill these in when not importing cookies from a browser.
# session_id = "your_session_cookie_value"
# auth_token = "your_auth_token"
# PHPSESSID = "abc123def456"
# Example configurations for different use cases:
# ===================================
# Example 1: Paginated API with Query Parameters
# ===================================
# [web_extraction]
# base_url = "https://api.docs.example.com/pages"
# page_param = "p"
# pages = "1-25"
# title = "Documentation Archive"
#
# [retry]
# max_retries = 3
# retry_delay_ms = 2000
#
# [rate_limit]
# base_delay_ms = 1500
# jitter_ms = 500
# ===================================
# Example 2: Authenticated Content with Cookies
# ===================================
# [web_extraction]
# base_url = "https://secure.example.com/images"
# pages = "1-50"
# title = "Secure Content Archive"
#
# [cookies]
# session = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..."
# csrf_token = "abc123def456"
# ===================================
# Example 3: Auto-Discovery Mode
# ===================================
# [web_extraction]
# base_url = "https://cdn.example.com/slides/presentation"
# url_template = "https://cdn.example.com/slides/presentation/{page}.jpg"
# auto_discover = true
# max_discovery_pages = 300
# discovery_gap = 10
# title = "Presentation Slides"
# min_image_size = 5000
#
# [rate_limit]
# base_delay_ms = 500
# jitter_ms = 200