-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
50 lines (43 loc) · 1.93 KB
/
scraper.py
File metadata and controls
50 lines (43 loc) · 1.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
###
# Scrape domain looking for all internal links
import html
import re
import time
from urllib.parse import urldefrag, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup
# Root of the site being crawled; hrefs are resolved and compared against it.
url = "https://terem.tech/"
# Every internal URL discovered so far (doubles as the crawl queue).
linkstoParse = set()
# URLs that have already been fetched.
parsedLinks = set()

# Finds the href attribute of an <a> tag wherever it sits among the tag's
# attributes; the value may be double-quoted, single-quoted or unquoted.
_HREF_RE = re.compile(
    r"""<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))""",
    re.IGNORECASE,
)


def processLinks(link):
    """Add the target of an anchor tag to ``linkstoParse`` if it is internal.

    ``link`` is anything whose ``str()`` is the HTML of an ``<a>`` element
    (a BeautifulSoup tag or a plain string).  The href is kept only when it
    is an absolute http(s) URL on the same host as ``url`` or a
    root-relative path such as ``/contact``.  Kept links are normalised
    (entities unescaped, fragment removed, joined to ``url`` without a
    doubled slash), added to ``linkstoParse`` and printed the first time
    they are seen.

    Ignored: input without an ``<a href=...>``, external hosts,
    protocol-relative links to other hosts, non-http schemes (``mailto:``,
    ``javascript:`` ...), targets whose path ends in ``webm``, and
    document-relative hrefs (``page.html``, ``#top``) because the URL of
    the page they came from is not known here.

    Returns None; the result is the side effect on ``linkstoParse``.
    """
    found = _HREF_RE.search(str(link))
    if found is None:
        return

    # Exactly one of the three alternatives (", ', unquoted) captured a value.
    raw = next(group for group in found.groups() if group is not None)
    # Serialised HTML escapes "&" as "&amp;"; undo that to get the real URL.
    href = html.unescape(raw).strip()

    try:
        if not href.startswith("/") and not urlsplit(href).scheme:
            return  # document-relative, fragment-only or empty href
        # urljoin handles "/path", "//host/path" and absolute URLs alike and
        # never produces the "//" that plain string concatenation did.
        target = urldefrag(urljoin(url, href)).url
        parts = urlsplit(target)
        site = urlsplit(url)
    except ValueError:
        return  # malformed URL (e.g. a broken IPv6 literal)

    if parts.scheme not in ("http", "https"):
        return
    # Compare hosts rather than using a substring test, so an external URL
    # that merely mentions the site in its query string is not kept.
    if parts.netloc.lower() != site.netloc.lower():
        return
    if parts.path.endswith("webm"):
        return  # skip video files

    if target not in linkstoParse:
        # haven't added it before
        linkstoParse.add(target)
        print(target)
def crawlsite(link, max_links=5, delay=1):
    """Crawl outward from ``link`` and return every internal URL found.

    The seed is added to ``linkstoParse``; then, pass by pass, every link
    that has not been fetched yet is downloaded, parsed with BeautifulSoup
    and each of its ``<a>`` tags is handed to ``processLinks``, which may
    add new entries to ``linkstoParse``.

    Args:
        link: URL to start from.
        max_links: no new pass is started once at least this many links
            have been discovered (checked between passes, so a single pass
            may overshoot it).  ``None`` removes the cap.  Defaults to 5.
        delay: seconds to wait before each request, to avoid hammering the
            site.  Defaults to 1.

    Returns:
        A list of all URLs in ``linkstoParse`` when crawling stops.
    """
    print(f"Site to crawl: {link}")
    linkstoParse.add(link)

    # Keep going while something is still unfetched and the cap allows it.
    while (max_links is None or len(linkstoParse) < max_links) and any(
        str(page) not in parsedLinks for page in linkstoParse
    ):
        # Iterate over a snapshot: processLinks adds to linkstoParse while
        # we are looping.
        snapshot = list(linkstoParse)
        print("Parsed: " + str(len(parsedLinks)))
        print("Links: " + str(len(linkstoParse)))
        print("Copy: " + str(len(snapshot)))

        for page in snapshot:
            if str(page) in parsedLinks:
                continue  # already fetched; no request, so no need to wait
            # Mark as visited before the request so a failing URL is not
            # retried forever.
            parsedLinks.add(str(page))
            time.sleep(delay)
            try:
                response = requests.get(page, timeout=30)
            except requests.RequestException as err:
                # One unreachable page should not abort the whole crawl.
                print(f"Failed to fetch {page}: {err}")
                continue
            soup = BeautifulSoup(response.content, 'lxml')
            # find all new links on this page, process
            for anchor in soup.find_all('a'):
                processLinks(anchor)

        print("Len of parsedLinks: " + str(len(parsedLinks)))
        print("Len of linkstoParse: " + str(len(linkstoParse)))

    return list(linkstoParse)