mirror of
https://github.com/postgres/pgweb.git
synced 2025-08-06 09:57:57 +00:00

Replaces the old search code with something that's not quite as much spaghetti (i.e. not evolved over too much time), and more stable (actual error handling instead of random crashes). Crawlers are now also multithreaded, to deal with higher latency to some sites.
168 lines
5.1 KiB
Python
import datetime
import httplib
import sys
import threading
import time
from Queue import Queue

from lib.log import log
from lib.parsers import ArchivesParser


class MultiListCrawler(object):
    def __init__(self, lists, conn, status_interval=30, commit_interval=500):
        self.lists = lists
        self.conn = conn
        self.status_interval = status_interval
        self.commit_interval = commit_interval

        self.queue = Queue()
        self.counter = 0
        self.counterlock = threading.RLock()
        self.stopevent = threading.Event()
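        # All worker threads share this one connection and counter. The
        # counter is only touched under counterlock, and each worker opens
        # its own cursor; sharing the connection like this assumes the
        # driver (presumably psycopg2, given the pyformat parameters used
        # below) allows one connection to be used from multiple threads.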

    def crawl(self, full=False, month=None):
        # Each thread can independently run on one month, so we can get
        # a reasonable spread. Therefore, submit them as separate jobs
        # to the queue.
        for listid, listname in self.lists:
            if full:
                # Generate a sequence of everything to index
                for year in range(1997, datetime.datetime.now().year + 1):
                    for month in range(1, 13):
                        self.queue.put((listid, listname, year, month, -1))
            elif month:
                # Do one specific month
                pieces = month.split("-")
                if len(pieces) != 2:
                    print "Month format is <y>-<m>, cannot parse '%s'" % month
                    sys.exit(1)
                try:
                    pieces = [int(x) for x in pieces]
                except ValueError:
                    print "Month format is <y>-<m>, cannot convert '%s' to integers" % month
                    sys.exit(1)
                self.queue.put((listid, listname, pieces[0], pieces[1], -1))
            else:
                # In an incremental scan, we check the current month and the
                # previous one, but only for new messages.
                curs = self.conn.cursor()
                curr = datetime.date.today()
                if curr.month == 1:
                    prev = datetime.date(curr.year - 1, 12, 1)
                else:
                    prev = datetime.date(curr.year, curr.month - 1, 1)

                for d in curr, prev:
                    # Figure out what the highest indexed message in this
                    # month is.
                    curs.execute("SELECT max(msgnum) FROM messages WHERE list=%(list)s AND year=%(year)s AND month=%(month)s", {
                        'list': listid,
                        'year': d.year,
                        'month': d.month,
                    })
                    x = curs.fetchall()
                    if x[0][0]:
                        maxmsg = x[0][0]
                    else:
                        maxmsg = -1
                    self.queue.put((listid, listname, d.year, d.month, maxmsg))

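        # At this point the queue holds one work item per list and month, as
        # a tuple of (listid, listname, year, month, maxmsg), where maxmsg is
        # the highest message number already indexed (-1 means start at 0).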
        for x in range(5):
            t = threading.Thread(name="Indexer %s" % x,
                                 target=self.crawl_from_queue)
            t.daemon = True
            t.start()

        t = threading.Thread(name="statusthread", target=self.status_thread)
        t.daemon = True
        t.start()

        # XXX: need to find a way to deal with all threads crashed and
        # not done here yet!
        self.queue.join()
        self.stopevent.set()

        return self.counter

    def status_thread(self):
        lastcommit = 0
        starttime = time.time()
        while not self.stopevent.is_set():
            self.stopevent.wait(self.status_interval)
            nowtime = time.time()
            with self.counterlock:
                log("Indexed %s messages so far (%s active threads, %s months still queued, %.1f msg/sec)" % (
                    self.counter,
                    threading.active_count() - 2,  # main thread + status thread
                    self.queue.qsize(),
                    self.counter / (nowtime - starttime),
                ))
                # Commit once every commit_interval messages (500 by default)
                if self.counter - lastcommit > self.commit_interval:
                    lastcommit = self.counter
                    self.conn.commit()
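        # Anything indexed after the last interval commit is not committed
        # here; the final commit is left to whoever calls crawl().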

    def crawl_from_queue(self):
        while not self.stopevent.is_set():
            (listid, listname, year, month, maxmsg) = self.queue.get()
            self.crawl_month(listid, listname, year, month, maxmsg)
            self.queue.task_done()
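        # Once the queue is drained, workers block forever in queue.get();
        # they are daemon threads, so they go away when the program exits
        # after queue.join() has returned.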

    def crawl_month(self, listid, listname, year, month, maxmsg):
        currentmsg = maxmsg
        while True:
            currentmsg += 1
            try:
                if not self.crawl_single_message(listid, listname, year, month, currentmsg):
                    break
            except Exception, e:
                log("Exception when crawling %s/%s/%s/%s - %s" % (
                    listname, year, month, currentmsg, e))
                # Continue on to try the next message

    def crawl_single_message(self, listid, listname, year, month, msgnum):
        curs = self.conn.cursor()
        h = httplib.HTTPConnection(host="archives.postgresql.org",
                                   port=80,
                                   strict=True,
                                   timeout=10)
        # Messages are fetched from the web archives, at URLs like
        # /pgsql-hackers/2011-01/msg00123.php
        url = "/%s/%04d-%02d/msg%05d.php" % (
            listname,
            year,
            month,
            msgnum)
        h.putrequest("GET", url)
        h.putheader("User-agent", "pgsearch/0.2")
        h.putheader("Connection", "close")
        h.endheaders()
        resp = h.getresponse()
        txt = resp.read()
        h.close()

        if resp.status == 404:
            # Past the end of the month
            return False
        elif resp.status != 200:
            raise Exception("%s/%s/%s/%s returned status %s" % (listname, year, month, msgnum, resp.status))

        # Else we have the message!
        p = ArchivesParser()
        if not p.parse(txt):
            log("Failed to parse %s/%s/%s/%s" % (listname, year, month, msgnum))
            # We return True to move on to the next message anyway!
            return True
        curs.execute("INSERT INTO messages (list, year, month, msgnum, date, subject, author, txt, fti) VALUES (%(listid)s, %(year)s, %(month)s, %(msgnum)s, %(date)s, %(subject)s, %(author)s, %(txt)s, setweight(to_tsvector('pg', %(subject)s), 'A') || to_tsvector('pg', %(txt)s))", {
            'listid': listid,
            'year': year,
            'month': month,
            'msgnum': msgnum,
            'date': p.date,
            'subject': p.subject[:127],
            'author': p.author[:127],
            'txt': p.body,
        })
        with self.counterlock:
            self.counter += 1

        return True
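
A minimal sketch of how this class might be driven, assuming a psycopg2
connection and a "lists" table with (id, name) columns; the module path,
table name and connection string below are illustrative assumptions, not
part of this commit:

# hypothetical driver script
import psycopg2

from lib.multilistcrawler import MultiListCrawler  # assumed module path

if __name__ == "__main__":
    conn = psycopg2.connect("dbname=search")  # assumed connection string
    curs = conn.cursor()
    curs.execute("SELECT id, name FROM lists")  # assumed lists table

    crawler = MultiListCrawler(curs.fetchall(), conn)
    n = crawler.crawl(full=False)  # incremental: current + previous month
    # crawl() only commits every commit_interval messages, so commit the tail
    conn.commit()
    print "Indexed %s messages" % n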