postgres-web/tools/search/crawler/lib/genericsite.py

import re

from .basecrawler import BaseSiteCrawler
from .parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
        if ".." in url:
            return True
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False
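
    # Illustration (hypothetical, not from the original source): if the
    # site_excludes table held a suburlre of r"^/docs/7\.", then
    # exclude_url("/docs/7.4/index.html") would return True via the
    # extra_excludes regex search above, so links into that subtree would
    # never be queued.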

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
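

# Usage sketch (editor's addition, not part of the original module): a minimal
# illustration of how this class might be driven. It assumes a psycopg2
# connection for dbconn (suggested by the cursor() call and %(site)s parameter
# style above) and a crawl() entry point inherited from BaseSiteCrawler; both
# are assumptions inferred from this file, not confirmed by it. The hostname,
# dbname, siteid, and import path are placeholders.
#
#     import psycopg2
#     from lib.genericsite import GenericSiteCrawler
#
#     conn = psycopg2.connect("dbname=search")
#     crawler = GenericSiteCrawler("www.example.org", conn, siteid=1, https=True)
#     crawler.crawl()   # assumed BaseSiteCrawler entry point
#     conn.commit()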