diff options
-rw-r--r-- | plugins/url_check_ng/robotparser2.py.diff | 44 | ||||
-rw-r--r-- | plugins/url_check_ng/robotparser2.py.readme | 1 |
2 files changed, 45 insertions, 0 deletions
diff --git a/plugins/url_check_ng/robotparser2.py.diff b/plugins/url_check_ng/robotparser2.py.diff new file mode 100644 index 000000000..feb8a77ba --- /dev/null +++ b/plugins/url_check_ng/robotparser2.py.diff @@ -0,0 +1,44 @@ +linkchecker patch +----------------- + +The following patch is a quick hack to prevent robots.txt +restrictions from taking effect. This is somewhat ugly, but +we need to *know exactly* if a source package is accessible +or has moved. So we explicitly don't want to honor those +restrictions. Unfortunately, the --robots-txt option is +enabled by default and you cannot disable it from the +command line. + +Since this hack is generally a bad thing, please behave +responsibly: Don't run url_checks too often (or at +least ask the mirror maintainers for approval to do so). +A server maintainer who doesn't want spiders to index +his machine might have his reasons... ;-) + +bbu, 20071019 + +---------------------------------------------------------------------------- + +--- robotparser2.py.orig 2007-10-19 14:59:06.000000000 +0200 ++++ robotparser2.py 2007-10-19 15:03:28.000000000 +0200 +@@ -344,19 +344,9 @@ + if not isinstance(url, str): + url = url.encode("ascii", "ignore") + if self.disallow_all: +- return False ++ return True + if self.allow_all: + return True +- # search for given user agent matches +- # the first match counts +- url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/" +- for entry in self.entries: +- if entry.applies_to(useragent): +- return entry.allowance(url) +- # try the default entry last +- if self.default_entry is not None: +- return self.default_entry.allowance(url) +- # agent not found ==> access granted + return True + + def get_crawldelay (self, useragent): diff --git a/plugins/url_check_ng/robotparser2.py.readme b/plugins/url_check_ng/robotparser2.py.readme new file mode 100644 index 000000000..a05042197 --- /dev/null +++ b/plugins/url_check_ng/robotparser2.py.readme @@ -0,0 +1 @@ 
+/usr/share/pycentral/linkchecker/site-packages/linkcheck |