author     Bjoern Buerger <b.buerger@pengutronix.de>   2007-10-19 19:38:38 +0000
committer  Bjoern Buerger <b.buerger@pengutronix.de>   2007-10-19 19:38:38 +0000
commit     321d9f54dd2f0b0392f3c4c37720a53449537527 (patch)
tree       4ab1c3582625dbc67837faa2f20a37e7f36951b7 /plugins
parent     3a5a2e69d36bc21d69cb4acdcb67ea44540bd518 (diff)
download   ptxdist-321d9f54dd2f0b0392f3c4c37720a53449537527.tar.gz
           ptxdist-321d9f54dd2f0b0392f3c4c37720a53449537527.tar.xz
* let linkchecker ignore robots.txt restrictions
git-svn-id: https://svn.pengutronix.de/svn/ptxdist/trunks/ptxdist-trunk@7555 33e552b5-05e3-0310-8538-816dae2090ed
Diffstat (limited to 'plugins')
-rw-r--r--  plugins/url_check_ng/robotparser2.py.diff    44
-rw-r--r--  plugins/url_check_ng/robotparser2.py.readme   1
2 files changed, 45 insertions, 0 deletions
diff --git a/plugins/url_check_ng/robotparser2.py.diff b/plugins/url_check_ng/robotparser2.py.diff
new file mode 100644
index 000000000..feb8a77ba
--- /dev/null
+++ b/plugins/url_check_ng/robotparser2.py.diff
@@ -0,0 +1,44 @@
+linkchecker patch
+-----------------
+
+The following patch is a quick hack to prevent robots.txt
+restrictions from taking effect. This is somewhat ugly, but
+we need to *know exactly* whether a source package is still
+accessible or has moved, so we explicitly do not want to
+honor those restrictions. Unfortunately, the --robots-txt
+option is enabled by default and cannot be disabled from
+the command line.
+
+Since this hack is generally a bad thing, please behave
+responsibly: don't run url_checks too often (or at least
+ask the mirror maintainers for approval before doing so).
+A server maintainer who doesn't want spiders to index
+his machine might have his reasons... ;-)
+
+bbu, 20071019
+
+----------------------------------------------------------------------------
+
+--- robotparser2.py.orig 2007-10-19 14:59:06.000000000 +0200
++++ robotparser2.py 2007-10-19 15:03:28.000000000 +0200
+@@ -344,19 +344,9 @@
+         if not isinstance(url, str):
+             url = url.encode("ascii", "ignore")
+         if self.disallow_all:
+-            return False
++            return True
+         if self.allow_all:
+             return True
+-        # search for given user agent matches
+-        # the first match counts
+-        url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
+-        for entry in self.entries:
+-            if entry.applies_to(useragent):
+-                return entry.allowance(url)
+-        # try the default entry last
+-        if self.default_entry is not None:
+-            return self.default_entry.allowance(url)
+-        # agent not found ==> access granted
+         return True
+
+     def get_crawldelay (self, useragent):
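
For reference, this is roughly how the patched can_fetch() reads once the
hunk above is applied; the def line is not part of the hunk and is assumed
here from the standard robotparser API:

    # Sketch of linkcheck/robotparser2.py's can_fetch() after the patch
    # (signature assumed).  Every URL is now reported as fetchable, no
    # matter what the server's robots.txt says.
    def can_fetch(self, useragent, url):
        if not isinstance(url, str):
            url = url.encode("ascii", "ignore")
        if self.disallow_all:
            return True   # patched: the stock code returned False here
        if self.allow_all:
            return True
        return True       # all per-user-agent entry matching removed
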
diff --git a/plugins/url_check_ng/robotparser2.py.readme b/plugins/url_check_ng/robotparser2.py.readme
new file mode 100644
index 000000000..a05042197
--- /dev/null
+++ b/plugins/url_check_ng/robotparser2.py.readme
@@ -0,0 +1 @@
+/usr/share/pycentral/linkchecker/site-packages/linkcheck
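
The readme above records where the patched robotparser2.py lives. As a
hedged alternative sketch (not part of this commit): the same effect could
in principle be achieved at runtime by overriding the parser class from the
url_check_ng plugin instead of patching the installed file. The module path
linkcheck.robotparser2 and the class name RobotFileParser are assumptions
drawn from the patch context and may differ between linkchecker versions:

    # Hypothetical runtime override -- verify names against the installed
    # linkchecker before relying on this.
    from linkcheck import robotparser2  # assumed module location

    def _always_allow(self, useragent, url):
        # Mirror the patched behaviour: treat every URL as fetchable and
        # ignore robots.txt entirely.
        return True

    # Assumed class name from the upstream robotparser API.
    robotparser2.RobotFileParser.can_fetch = _always_allow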