postgres-web/tools/search/crawler/lib/genericsite.py
Magnus Hagander 4ce8184e65 Explicitly exclude urls with .. in search crawling
There were per-site exclusion rules configured, but the regexp was
defined slightly incorrectly. However, we should simply never crawl
URLs like this unless they are normalized, so for now add them to the
hardcoded exclusion rules.
2017-11-08 12:04:36 -05:00
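
A minimal sketch (not part of the repository) of why unnormalized ".." URLs are worth rejecting outright: the same page can be reached under several spellings, and dot segments can also walk out of the intended tree, so it is simpler to crawl only the normalized form. The paths below are made up for illustration.

import posixpath

# Hypothetical paths: each ".." spelling either aliases a normalized
# path or escapes the intended directory tree entirely.
for path in ("/docs/current/../9.6/index.html",
             "/docs/current/static/../release.html",
             "/docs/../../etc/passwd"):
    print(path, "->", posixpath.normpath(path))

Rather than normalizing on the fly, the crawler below simply treats any URL containing ".." as excluded.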


import re

from basecrawler import BaseSiteCrawler
from parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
        # Never crawl URLs containing "..": we only want the normalized form
        if ".." in url:
            return True
        # Honor robots.txt
        if self.robots and self.robots.block_url(url):
            return True
        # Apply the per-site exclusion regexps
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
        # Queue every link found on the page, unless it has already been
        # crawled or is excluded.
        for l in self.resolve_links(self.page.links, url):
            if self.pages_crawled.has_key(l) or self.pages_crawled.has_key(l + "/"):
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
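
For illustration only, a self-contained sketch of the same three-step exclusion order used by exclude_url() above: the hardcoded ".." rule, then robots.txt, then the per-site suburlre regexps. The patterns, the robots stand-in and the sample URLs are all made up; the real class loads its patterns from the site_excludes table and its robots rules from RobotsParser.

import re

# Hypothetical patterns, standing in for suburlre rows from site_excludes.
extra_excludes = [re.compile(r"^/list/"), re.compile(r"\.pdf$")]

# Hypothetical stand-in for RobotsParser.block_url().
def robots_blocks(url):
    return url.startswith("/admin/")

def exclude_url(url):
    # Same order as GenericSiteCrawler.exclude_url()
    if ".." in url:
        return True
    if robots_blocks(url):
        return True
    return any(r.search(url) for r in extra_excludes)

for u in ("/docs/current/", "/docs/../etc/passwd", "/admin/login", "/list/2017-11/msg.pdf"):
    print(u, "->", "excluded" if exclude_url(u) else "crawl")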