postgres-web/tools/search/crawler/lib/sitemapsite.py

import urllib
import xml.parsers.expat
import dateutil.parser
from lib.log import log
from lib.basecrawler import BaseSiteCrawler


class SitemapParser(object):
    """Expat-based parser collecting (url, priority, lastmod, internal)
    tuples from a sitemap.xml style document into self.urls."""

    def __init__(self):
        self.urls = []

    def parse(self, f, internal=False):
        self.parser = xml.parsers.expat.ParserCreate()
        self.currenturl = ""
        self.currentprio = 0
        self.currentlastmod = None
        self.geturl = False
        self.getprio = False
        self.getlastmod = False
        self.currstr = ""
        self.internal = internal
        self.parser.StartElementHandler = self.processelement
        self.parser.EndElementHandler = self.processendelement
        self.parser.CharacterDataHandler = self.processcharacterdata
        self.parser.ParseFile(f)

    def processelement(self, name, attrs):
        if name == "url":
            # New entry, reset the per-URL state
            self.currenturl = ""
            self.currentprio = 0
            self.currentlastmod = None
        elif name == "loc":
            self.geturl = True
            self.currstr = ""
        elif name == "priority":
            self.getprio = True
            self.currstr = ""
        elif name == "lastmod":
            self.getlastmod = True
            self.currstr = ""

    def processendelement(self, name):
        if name == "loc":
            self.geturl = False
            self.currenturl = self.currstr
        elif name == "priority":
            self.getprio = False
            self.currentprio = float(self.currstr)
        elif name == "lastmod":
            self.getlastmod = False
            self.currentlastmod = dateutil.parser.parse(self.currstr)
        elif name == "url":
            # Entry complete, store it along with the internal flag
            self.urls.append((self.currenturl, self.currentprio, self.currentlastmod, self.internal))

    def processcharacterdata(self, data):
        # Expat may deliver text in multiple chunks, so accumulate
        if self.geturl or self.getprio or self.getlastmod:
            self.currstr += data
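
# A short usage sketch (added illustration, not part of the original module):
# parse() accepts any file-like object with a read() method, so a sitemap can
# be fed from memory just as easily as from an HTTP response:
#
#   from StringIO import StringIO
#   p = SitemapParser()
#   p.parse(StringIO('<urlset><url><loc>https://example.org/a</loc>'
#                    '<priority>0.5</priority></url></urlset>'))
#   # p.urls == [("https://example.org/a", 0.5, None, False)]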


class SitemapSiteCrawler(BaseSiteCrawler):
    def __init__(self, hostname, dbconn, siteid, serverip, https=False):
        super(SitemapSiteCrawler, self).__init__(hostname, dbconn, siteid, serverip, https)

    def init_crawl(self):
        # Fetch the sitemap. We ignore robots.txt in this case, and
        # assume it's always under /sitemap.xml.
        u = urllib.urlopen("https://%s/sitemap.xml" % self.hostname)
        p = SitemapParser()
        p.parse(u)
        u.close()

        # Attempt to fetch a sitemap_internal.xml. This is used to index
        # pages on our internal search engine that we don't want on
        # Google. They should also be excluded from default search
        # results (unless searching with a specific suburl).
        u = urllib.urlopen("https://%s/sitemap_internal.xml" % self.hostname)
        if u.getcode() == 200:
            p.parse(u, True)
        u.close()

        for url, prio, lastmod, internal in p.urls:
            # Strip the scheme and hostname, leaving just the local path
            # (8 characters is the length of "https://").
            url = url[len(self.hostname) + 8:]
            if lastmod:
                if url in self.scantimes:
                    if lastmod < self.scantimes[url]:
                        # Not modified since the last scan, so don't reload.
                        # Stick it in the list of scanned pages though, to
                        # make sure we don't remove it.
                        self.pages_crawled[url] = 1
                        continue
            self.queue.put((url, prio, internal))

        log("About to crawl %s pages from sitemap" % self.queue.qsize())

    # Stub functions used when crawling, ignored here
    def queue_url(self, url):
        pass

    def post_process_page(self, url):
        pass
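

# A small self-test sketch (an addition, not part of the original module):
# fetch and dump a site's sitemap using SitemapParser alone, bypassing the
# crawler and its database dependencies.
# Usage: python sitemapsite.py www.postgresql.org
if __name__ == "__main__":
    import sys

    if len(sys.argv) != 2:
        print "Usage: %s <hostname>" % sys.argv[0]
        sys.exit(1)

    u = urllib.urlopen("https://%s/sitemap.xml" % sys.argv[1])
    p = SitemapParser()
    p.parse(u)
    u.close()
    for url, prio, lastmod, internal in p.urls:
        print "%s prio=%s lastmod=%s internal=%s" % (url, prio, lastmod, internal)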