import re

from .basecrawler import BaseSiteCrawler
from .parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
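    # Crawler for a "generic" site: the crawl is seeded with the root page and
    # every previously seen URL, and then follows links found on each crawled
    # page. URLs blocked by the site's robots.txt or matching a per-site
    # exclude regexp from the database are skipped.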
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

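    # A URL is excluded if it contains a parent-directory reference, is
    # blocked by robots.txt, or matches one of the per-site exclude regexps.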
    def exclude_url(self, url):
        if ".." in url:
            return True
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

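    # Queue a single URL for crawling, using the same queue parameters as the
    # seed pages in init_crawl().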
    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

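    # After a page has been crawled, queue any links found on it that have not
    # been crawled yet and are not excluded.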
    def post_process_page(self, url):
        for l in self.resolve_links(self.page.links, url):
            if l in self.pages_crawled or l + "/" in self.pages_crawled:
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
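
# Example usage (a minimal sketch, not part of the original module). It assumes
# dbconn is a psycopg2 connection to a database containing the site_excludes
# table, that the given site id exists there, and that BaseSiteCrawler provides
# the crawl() entry point:
#
#   import psycopg2
#
#   conn = psycopg2.connect("dbname=pgweb")
#   crawler = GenericSiteCrawler("www.example.org", conn, siteid=42)
#   crawler.crawl()
#   conn.commit()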