Mirror of https://github.com/postgres/pgweb.git

There were per-site exclusion rules configured for this, but the regexp was defined slightly incorrectly. However, we should simply never crawl URLs like this unless they are normalized, so for now just add them to the hardcoded exclusion rules.
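To illustrate the kind of URL this is about (the example path and the per-site regexp below are made up for illustration, not taken from the real site_excludes data): a link that resolves to something containing a ".." segment is rejected by the hardcoded rule in exclude_url further down, while per-site rules are plain regexps matched against the URL. A minimal sketch of that check:

import re

# Hypothetical per-site exclude pattern, standing in for a site_excludes row.
extra_excludes = [re.compile(r"^/docs/7\.")]

def exclude_url(url):
    # Hardcoded rule: never crawl URLs that were not normalized.
    if ".." in url:
        return True
    # Per-site rules: any matching regexp excludes the URL.
    return any(r.search(url) for r in extra_excludes)

print(exclude_url("/docs/current/../9.0/index.html"))  # True  - not normalized
print(exclude_url("/docs/7.4/index.html"))             # True  - per-site regexp
print(exclude_url("/docs/current/index.html"))         # False - crawled normally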
53 lines, 1.5 KiB, Python
import re

from basecrawler import BaseSiteCrawler
from parsers import RobotsParser


class GenericSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, https=False):
        super(GenericSiteCrawler, self).__init__(hostname, dbconn, siteid, https=https)

    def init_crawl(self):
        # Load robots.txt
        self.robots = RobotsParser("http://%s/robots.txt" % self.hostname)

        # We need to seed the crawler with every URL we've already seen, since
        # we don't recrawl the contents if they haven't changed.
        allpages = self.scantimes.keys()

        # Figure out if there are any excludes to deal with (beyond the
        # robots.txt ones)
        curs = self.dbconn.cursor()
        curs.execute("SELECT suburlre FROM site_excludes WHERE site=%(site)s", {
            'site': self.siteid,
        })
        self.extra_excludes = [re.compile(x) for x, in curs.fetchall()]

        # We *always* crawl the root page, of course
        self.queue.put(("/", 0.5, False))

        # Now do all the other pages
        for x in allpages:
            self.queue.put((x, 0.5, False))

    def exclude_url(self, url):
        if ".." in url:
            return True
        if self.robots and self.robots.block_url(url):
            return True
        for r in self.extra_excludes:
            if r.search(url):
                return True
        return False

    def queue_url(self, url):
        self.queue.put((url.strip(), 0.5, False))

    def post_process_page(self, url):
        for l in self.resolve_links(self.page.links, url):
            if self.pages_crawled.has_key(l) or self.pages_crawled.has_key(l + "/"):
                continue
            if self.exclude_url(l):
                continue
            self.queue_url(l)
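As an aside on the init_crawl code above: the list comprehension that builds extra_excludes unpacks each single-column row returned by the cursor directly (the trailing comma in "for x, in curs.fetchall()"). A small standalone sketch of that idiom, using made-up rows in place of the real cursor output:

import re

# Rows shaped the way a DB-API cursor returns them for a one-column SELECT.
rows = [(r"^/docs/7\.",), (r"\.pdf$",)]

# Same idiom as init_crawl: the trailing comma unpacks each one-tuple.
extra_excludes = [re.compile(x) for x, in rows]

print([bool(r.search("/docs/7.4/index.html")) for r in extra_excludes])  # [True, False]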