postgres-web/tools/search/crawler/lib/genericsite.py

import re

from .basecrawler import BaseSiteCrawler
from .parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
        if ".." in url:
            return True
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False
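
    # Illustration (hypothetical, not from the original source): if the
    # site_excludes table held a suburlre of r"^/docs/7\.", then
    # exclude_url("/docs/7.4/index.html") would return True via the
    # extra_excludes regex search above, so links into that subtree would
    # never be queued.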

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
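

# Usage sketch (editor's addition, not part of the original module): a minimal
# illustration of how this class might be driven. It assumes a psycopg2
# connection for dbconn (suggested by the cursor() call and %(site)s parameter
# style above) and a crawl() entry point inherited from BaseSiteCrawler; both
# are assumptions inferred from this file, not confirmed by it. The hostname,
# dbname, siteid, and import path are placeholders.
#
#     import psycopg2
#     from lib.genericsite import GenericSiteCrawler
#
#     conn = psycopg2.connect("dbname=search")
#     crawler = GenericSiteCrawler("www.example.org", conn, siteid=1, https=True)
#     crawler.crawl()   # assumed BaseSiteCrawler entry point
#     conn.commit()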