diff options
author | Bjoern Buerger <b.buerger@pengutronix.de> | 2007-10-19 19:38:38 +0000 |
---|---|---|
committer | Bjoern Buerger <b.buerger@pengutronix.de> | 2007-10-19 19:38:38 +0000 |
commit | 321d9f54dd2f0b0392f3c4c37720a53449537527 (patch) | |
tree | 4ab1c3582625dbc67837faa2f20a37e7f36951b7 /plugins | |
parent | 3a5a2e69d36bc21d69cb4acdcb67ea44540bd518 (diff) | |
download | ptxdist-321d9f54dd2f0b0392f3c4c37720a53449537527.tar.gz ptxdist-321d9f54dd2f0b0392f3c4c37720a53449537527.tar.xz |
* let linkchecker ignore robots.txt restrictions
git-svn-id: https://svn.pengutronix.de/svn/ptxdist/trunks/ptxdist-trunk@7555 33e552b5-05e3-0310-8538-816dae2090ed
Diffstat (limited to 'plugins')
-rw-r--r-- | plugins/url_check_ng/robotparser2.py.diff | 44 | ||||
-rw-r--r-- | plugins/url_check_ng/robotparser2.py.readme | 1 |
2 files changed, 45 insertions, 0 deletions
diff --git a/plugins/url_check_ng/robotparser2.py.diff b/plugins/url_check_ng/robotparser2.py.diff new file mode 100644 index 000000000..feb8a77ba --- /dev/null +++ b/plugins/url_check_ng/robotparser2.py.diff @@ -0,0 +1,44 @@ +linkchecker patch +----------------- + +The following patch is a quick hack to prevent robots.txt +restrictions from taking effect. This is somewhat ugly, but +we need to *know exactly* if a source package is accessible +or has moved. So we explicitly don't want to honor those +restrictions. Unfortunately, the --robots-txt option is +enabled by default and you cannot disable it from the +command line. + +Since this hack is generally a bad thing, please behave +responsibly: Don't run url_checks too often (or at +least ask the mirror maintainers for approval to do so). +A server maintainer who doesn't want spiders to index +his machine might have his reasons... ;-) + +bbu, 20071019 + +---------------------------------------------------------------------------- + +--- robotparser2.py.orig 2007-10-19 14:59:06.000000000 +0200 ++++ robotparser2.py 2007-10-19 15:03:28.000000000 +0200 +@@ -344,19 +344,9 @@ + if not isinstance(url, str): + url = url.encode("ascii", "ignore") + if self.disallow_all: +- return False ++ return True + if self.allow_all: + return True +- # search for given user agent matches +- # the first match counts +- url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/" +- for entry in self.entries: +- if entry.applies_to(useragent): +- return entry.allowance(url) +- # try the default entry last +- if self.default_entry is not None: +- return self.default_entry.allowance(url) +- # agent not found ==> access granted + return True + + def get_crawldelay (self, useragent): diff --git a/plugins/url_check_ng/robotparser2.py.readme b/plugins/url_check_ng/robotparser2.py.readme new file mode 100644 index 000000000..a05042197 --- /dev/null +++ b/plugins/url_check_ng/robotparser2.py.readme @@ -0,0 +1 @@ 
+/usr/share/pycentral/linkchecker/site-packages/linkcheck |