From 5b98b8f7f5ab274cff19aaadb5c2d31eac60bb2c Mon Sep 17 00:00:00 2001
From: sebclick
Date: Wed, 8 Aug 2012 21:51:41 +0200
Subject: [PATCH 1/2] Fix Issue #3

Error in the initialization of the response-code counter: a status code
seen for the first time must be counted as 1, not 0.
---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 4b3eb97..c281107 100755
--- a/main.py
+++ b/main.py
@@ -139,7 +139,7 @@ def exclude_url(exclude, link):
 		if response.getcode() in responseCode:
 			responseCode[response.getcode()]+=1
 		else:
-			responseCode[response.getcode()] = 0
+			responseCode[response.getcode()] = 1
 		if response.getcode()==200:
 			msg = response.read()
 		else:

From 7d8a610bafdaa1b631262ab3dfa837a6170ef081 Mon Sep 17 00:00:00 2001
From: sebclick
Date: Wed, 8 Aug 2012 22:21:27 +0200
Subject: [PATCH 2/2] Better handling of URLs that do not return 200 OK
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

URLs that do not return 200 OK are no longer listed in the sitemap.
URLs that do not return 200 OK are only checked once (because they are
added to the crawled set).
---
 main.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index c281107..1a07693 100755
--- a/main.py
+++ b/main.py
@@ -133,6 +133,7 @@ def exclude_url(exclude, link):
 
 	crawling = tocrawl.pop()
 	url = urlparse(crawling)
+	crawled.add(crawling)
 	try:
 		request = Request(crawling, headers={"User-Agent":'Sitemap crawler'})
 		response = urlopen(request)
@@ -143,7 +144,8 @@ def exclude_url(exclude, link):
 		if response.getcode()==200:
 			msg = response.read()
 		else:
-			msg = ""
+			response.close()
+			continue
 
 		response.close()
 	except Exception as e:
@@ -151,9 +153,9 @@ def exclude_url(exclude, link):
 		logging.debug ("{1} ==> {0}".format(e, crawling))
 		continue
 
-
+	print ("<url><loc>"+url.geturl()+"</loc></url>", file=output_file)
+	output_file.flush()
 	links = linkregex.findall(msg)
-	crawled.add(crawling)
 	for link in links:
 		link = link.decode("utf-8")
 		if link.startswith('/'):
@@ -173,7 +175,6 @@ def exclude_url(exclude, link):
 		target_extension = os.path.splitext(parsed_link.path)[1][1:]
 
 		if (link not in crawled) and (link not in tocrawl) and (domain_link == target_domain) and can_fetch(arg.parserobots, rp, link,arg.debug) and ("javascript:" not in link) and (target_extension not in arg.skipext) and (exclude_url(arg.exclude, link)):
-			print ("<url><loc>"+link+"</loc></url>", file=output_file)
 			tocrawl.add(link)
 
 print (footer, file=output_file)
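
Note: the sketch below is not part of the patches. It is a minimal standalone
illustration of the fetch step as it behaves once both patches are applied,
assuming Python 3's urllib.request and the responseCode dict and crawled set
from main.py; the helper name fetch_page and its signature are hypothetical,
introduced only for illustration.

    from urllib.request import Request, urlopen

    def fetch_page(crawling, responseCode, crawled):
        # Patch 2/2: mark the URL as visited before fetching, so a URL that
        # fails or returns a non-200 status is never re-queued and re-checked.
        crawled.add(crawling)
        try:
            request = Request(crawling, headers={"User-Agent": "Sitemap crawler"})
            response = urlopen(request)
        except Exception:
            return None  # main.py logs and continues here
        code = response.getcode()
        if code in responseCode:
            responseCode[code] += 1
        else:
            responseCode[code] = 1  # patch 1/2: a first occurrence counts as 1, not 0
        if code != 200:
            response.close()
            return None  # patch 2/2: non-200 URLs are skipped, not written to the sitemap
        msg = response.read()
        response.close()
        return msg  # only a 200 response is parsed for links and written to the sitemap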