-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathweb-scrape-example.toml
More file actions
127 lines (96 loc) · 3.06 KB
/
web-scrape-example.toml
File metadata and controls
127 lines (96 loc) · 3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# PDFScalpel Web Extraction Configuration Example
# Use this file with: pdfscalpel extract web --config web-scrape-example.toml
[web_extraction]
# Root URL of the API or web service to pull pages from.
base_url = "https://api.example.com/INFOIWANT"
# Query-string parameter that carries the page number (default: "page").
page_param = "page"
# Which pages to fetch. Accepts ranges and lists:
# "1-10", "1,3,5-7", "1-5,10,15-20"
pages = "1-50"
# Alternative to base_url + page_param: a full URL template with a
# {page} placeholder, for patterns the query-parameter form cannot express.
# url_template = "https://api.example.com/challenge/{page}.png"
# Where the assembled PDF is written.
output_file = "web_extraction.pdf"
# Title stored in the resulting PDF's metadata.
title = "API Content Archive"
# Downloads smaller than this many bytes are treated as junk and skipped.
min_image_size = 1500
# When true, probe the server for available pages ('pages' is ignored).
auto_discover = false
# Upper bound on how many pages auto-discovery will probe.
max_discovery_pages = 200
# Auto-discovery gives up after this many consecutive missing pages.
discovery_gap = 5
# Import cookies from a local browser profile (firefox, chrome, edge, chromium).
# cookies_from_browser = "firefox"
# Only import browser cookies scoped to this domain.
# browser_domain = "example.com"
# Reuse previously downloaded pages instead of fetching them again.
resume_from_cache = true
# Directory that holds the download cache.
# cache_dir = ".web_cache"
[retry]
# How many times a failed page download is attempted again.
max_retries = 5
# Delay before the first retry, in milliseconds.
retry_delay_ms = 3000
# Grow the delay exponentially after each failed attempt.
exponential_backoff = true
# Per-request timeout, in seconds.
timeout_seconds = 30
[rate_limit]
# Minimum pause between consecutive requests, in milliseconds.
base_delay_ms = 2000
# Random amount added to or subtracted from each pause; randomizing the
# cadence helps avoid server-side rate limiting.
jitter_ms = 1000
[cookies]
# Cookies sent with every request, written as name = value pairs.
# Fill these in when not importing cookies from a browser.
# session_id = "your_session_cookie_value"
# auth_token = "your_auth_token"
# PHPSESSID = "abc123def456"
# Example configurations for different use cases:
# ===================================
# Example 1: Paginated API with Query Parameters
# ===================================
# [web_extraction]
# base_url = "https://api.docs.example.com/pages"
# page_param = "p"
# pages = "1-25"
# title = "Documentation Archive"
#
# [retry]
# max_retries = 3
# retry_delay_ms = 2000
#
# [rate_limit]
# base_delay_ms = 1500
# jitter_ms = 500
# ===================================
# Example 2: Authenticated Content with Cookies
# ===================================
# [web_extraction]
# base_url = "https://secure.example.com/images"
# pages = "1-50"
# title = "Secure Content Archive"
#
# [cookies]
# session = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9..."
# csrf_token = "abc123def456"
# ===================================
# Example 3: Auto-Discovery Mode
# ===================================
# [web_extraction]
# base_url = "https://cdn.example.com/slides/presentation"
# url_template = "https://cdn.example.com/slides/presentation/{page}.jpg"
# auto_discover = true
# max_discovery_pages = 300
# discovery_gap = 10
# title = "Presentation Slides"
# min_image_size = 5000
#
# [rate_limit]
# base_delay_ms = 500
# jitter_ms = 200