From 321d9f54dd2f0b0392f3c4c37720a53449537527 Mon Sep 17 00:00:00 2001
From: Bjoern Buerger
Date: Fri, 19 Oct 2007 19:38:38 +0000
Subject: * let linkchecker ignore robots.txt restrictions

git-svn-id: https://svn.pengutronix.de/svn/ptxdist/trunks/ptxdist-trunk@7555 33e552b5-05e3-0310-8538-816dae2090ed
---
 plugins/url_check_ng/robotparser2.py.diff   | 44 +++++++++++++++++++++++++++++
 plugins/url_check_ng/robotparser2.py.readme |  1 +
 2 files changed, 45 insertions(+)
 create mode 100644 plugins/url_check_ng/robotparser2.py.diff
 create mode 100644 plugins/url_check_ng/robotparser2.py.readme

diff --git a/plugins/url_check_ng/robotparser2.py.diff b/plugins/url_check_ng/robotparser2.py.diff
new file mode 100644
index 000000000..feb8a77ba
--- /dev/null
+++ b/plugins/url_check_ng/robotparser2.py.diff
@@ -0,0 +1,44 @@
+linkchecker patch
+-----------------
+
+The following patch is a quick hack to prevent robots.txt
+restrictions from taking effect. This is somewhat ugly, but
+we need to *know exactly* whether a source package is accessible
+or has moved, so we explicitly do not want to honor those
+restrictions. Unfortunately, the --robots-txt option is
+enabled by default and cannot be disabled from the
+command line.
+
+Since this hack is generally a bad thing, please behave
+responsibly: don't run url_checks too often (or at
+least ask the mirror maintainers for approval to do so).
+A server maintainer who doesn't want spiders to index
+his machine might have his reasons... ;-)
+
+bbu, 20071019
+
+----------------------------------------------------------------------------
+
+--- robotparser2.py.orig	2007-10-19 14:59:06.000000000 +0200
++++ robotparser2.py	2007-10-19 15:03:28.000000000 +0200
+@@ -344,19 +344,9 @@
+         if not isinstance(url, str):
+             url = url.encode("ascii", "ignore")
+         if self.disallow_all:
+-            return False
++            return True
+         if self.allow_all:
+             return True
+-        # search for given user agent matches
+-        # the first match counts
+-        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
+-        for entry in self.entries:
+-            if entry.applies_to(useragent):
+-                return entry.allowance(url)
+-        # try the default entry last
+-        if self.default_entry is not None:
+-            return self.default_entry.allowance(url)
+-        # agent not found ==> access granted
+         return True
+
+     def get_crawldelay (self, useragent):
diff --git a/plugins/url_check_ng/robotparser2.py.readme b/plugins/url_check_ng/robotparser2.py.readme
new file mode 100644
index 000000000..a05042197
--- /dev/null
+++ b/plugins/url_check_ng/robotparser2.py.readme
@@ -0,0 +1 @@
+/usr/share/pycentral/linkchecker/site-packages/linkcheck
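
The hunk above short-circuits the per-agent robots.txt evaluation so that every
URL is reported as fetchable. A minimal sketch of the same idea, written against
Python's standard urllib.robotparser rather than linkchecker's bundled
robotparser2.py (the class name, example URL and user agent below are
illustrative only, not part of the patch):

    # Illustrative sketch -- NOT part of the ptxdist patch. It reproduces the
    # "always allow" behaviour with the standard library robot parser.
    from urllib import robotparser

    class IgnoreRobotsParser(robotparser.RobotFileParser):
        """Report every URL as fetchable, regardless of robots.txt rules."""

        def can_fetch(self, useragent, url):
            # Equivalent to the patched allowance check: whether the site
            # disallows everything or lists per-agent rules, the answer is
            # always "access granted".
            return True

    if __name__ == "__main__":
        rp = IgnoreRobotsParser()
        rp.set_url("https://example.org/robots.txt")
        rp.read()
        # Prints True even if robots.txt disallows this path.
        print(rp.can_fetch("url_check_ng", "https://example.org/src/foo-1.0.tar.gz"))

Subclassing like this only helps if you control the call site; since linkchecker
instantiates its parser internally and offers no command-line switch to disable
the robots.txt check, the patch edits the bundled robotparser2.py in place
instead.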