From 4ce8184e651e318de2328f2b47d62871cc3b43c8 Mon Sep 17 00:00:00 2001 From: Magnus Hagander Date: Wed, 8 Nov 2017 12:02:58 -0500 Subject: [PATCH] Explicitly exclude urls with .. in search crawling There were per-site exclusion rules configured, but the regexp was slightly incorrect. However, we should simply never crawl URLs like this unless they are normalized, so for now add them to the hardcoded exclusion rules. --- tools/search/crawler/lib/genericsite.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/search/crawler/lib/genericsite.py b/tools/search/crawler/lib/genericsite.py index 7e46ae5b..b74bb469 100644 --- a/tools/search/crawler/lib/genericsite.py +++ b/tools/search/crawler/lib/genericsite.py @@ -31,6 +31,8 @@ class GenericSiteCrawler(BaseSiteCrawler): self.queue.put((x, 0.5, False)) def exclude_url(self, url): + if ".." in url: + return True if self.robots and self.robots.block_url(url): return True for r in self.extra_excludes: