Remove indexer for old archives

We have been using the new archives for many years now, so remove this
instead of trying to port it to python3.
Magnus Hagander
2019-01-19 20:23:16 +01:00
parent 07d83eae42
commit e0fcc082ee
4 changed files with 0 additions and 358 deletions


@@ -1,168 +0,0 @@
import datetime
import httplib
from Queue import Queue
import threading
import sys
import time

from lib.log import log
from lib.parsers import ArchivesParser


class MultiListCrawler(object):
    def __init__(self, lists, conn, status_interval=30, commit_interval=500):
        self.lists = lists
        self.conn = conn
        self.status_interval = status_interval
        self.commit_interval = commit_interval
        self.queue = Queue()
        self.counter = 0
        self.counterlock = threading.RLock()
        self.stopevent = threading.Event()

    def crawl(self, full=False, month=None):
        # Each thread can independently run on one month, so we can get
        # a reasonable spread. Therefore, submit them as separate jobs
        # to the queue.
        for listid, listname in self.lists:
            if full:
                # Generate a sequence of everything to index
                for year in range(1997, datetime.datetime.now().year + 1):
                    for month in range(1, 13):
                        self.queue.put((listid, listname, year, month, -1))
            elif month:
                # Do one specific month
                pieces = month.split("-")
                if len(pieces) != 2:
                    print("Month format is <y>-<m>, cannot parse '%s'" % month)
                    sys.exit(1)
                try:
                    pieces = [int(x) for x in pieces]
                except:
                    print("Month format is <y>-<m>, cannot convert '%s' to integers" % month)
                    sys.exit(1)
                self.queue.put((listid, listname, pieces[0], pieces[1], -1))
            else:
                # In an incremental scan, we check the current month and the
                # previous one, but only for new messages.
                curs = self.conn.cursor()
                curr = datetime.date.today()
                if curr.month == 1:
                    prev = datetime.date(curr.year - 1, 12, 1)
                else:
                    prev = datetime.date(curr.year, curr.month - 1, 1)
                for d in curr, prev:
                    # Figure out what the highest indexed page in this
                    # month is.
                    curs.execute("SELECT max(msgnum) FROM messages WHERE list=%(list)s AND year=%(year)s AND month=%(month)s", {
                        'list': listid,
                        'year': d.year,
                        'month': d.month,
                    })
                    x = curs.fetchall()
                    if x[0][0] is not None:
                        maxmsg = x[0][0]
                    else:
                        maxmsg = -1
                    self.queue.put((listid, listname, d.year, d.month, maxmsg))

        for x in range(5):
            t = threading.Thread(name="Indexer %s" % x,
                                 target=lambda: self.crawl_from_queue())
            t.daemon = True
            t.start()

        t = threading.Thread(name="statusthread", target=lambda: self.status_thread())
        t.daemon = True
        t.start()

        # XXX: need to find a way to deal with all threads crashed and
        # not done here yet!
        self.queue.join()
        self.stopevent.set()

        return self.counter

    def status_thread(self):
        lastcommit = 0
        starttime = time.time()
        while not self.stopevent.is_set():
            self.stopevent.wait(self.status_interval)
            nowtime = time.time()
            with self.counterlock:
                log("Indexed %s messages so far (%s active threads, %s months still queued, %.1f msg/sec)" % (
                    self.counter,
                    threading.active_count() - 2,  # main thread + status thread
                    self.queue.qsize(),
                    self.counter / (nowtime - starttime),
                ))
                # Commit every commit_interval (default 500) messages
                if self.counter - lastcommit > self.commit_interval:
                    lastcommit = self.counter
                    self.conn.commit()

    def crawl_from_queue(self):
        while not self.stopevent.is_set():
            (listid, listname, year, month, maxmsg) = self.queue.get()
            self.crawl_month(listid, listname, year, month, maxmsg)
            self.queue.task_done()

    def crawl_month(self, listid, listname, year, month, maxmsg):
        currentmsg = maxmsg
        while True:
            currentmsg += 1
            try:
                if not self.crawl_single_message(listid, listname, year, month, currentmsg):
                    break
            except Exception as e:
                log("Exception when crawling %s/%s/%s/%s - %s" % (
                    listname, year, month, currentmsg, e))
                # Continue on to try the next message

    def crawl_single_message(self, listid, listname, year, month, msgnum):
        curs = self.conn.cursor()
        h = httplib.HTTPConnection(host="archives.postgresql.org",
                                   port=80,
                                   strict=True,
                                   timeout=10)
        url = "/%s/%04d-%02d/msg%05d.php" % (
            listname,
            year,
            month,
            msgnum)
        h.putrequest("GET", url)
        h.putheader("User-agent", "pgsearch/0.2")
        h.putheader("Connection", "close")
        h.endheaders()
        resp = h.getresponse()
        txt = resp.read()
        h.close()

        if resp.status == 404:
            # Past the end of the month
            return False
        elif resp.status != 200:
            raise Exception("%s/%s/%s/%s returned status %s" % (listname, year, month, msgnum, resp.status))

        # Else we have the message!
        p = ArchivesParser()
        if not p.parse(txt):
            log("Failed to parse %s/%s/%s/%s" % (listname, year, month, msgnum))
            # We return True to move on to the next message anyway!
            return True

        curs.execute("INSERT INTO messages (list, year, month, msgnum, date, subject, author, txt, fti) VALUES (%(listid)s, %(year)s, %(month)s, %(msgnum)s, %(date)s, %(subject)s, %(author)s, %(txt)s, setweight(to_tsvector('pg', %(subject)s), 'A') || to_tsvector('pg', %(txt)s))", {
            'listid': listid,
            'year': year,
            'month': month,
            'msgnum': msgnum,
            'date': p.date,
            'subject': p.subject[:127],
            'author': p.author[:127],
            'txt': p.body,
        })
        with self.counterlock:
            self.counter += 1
        return True


@@ -58,85 +58,6 @@ class GenericHtmlParser(HTMLParser):
        return self.pagedata.read()


class ArchivesParser(object):
    rematcher = re.compile("<!--X-Subject: ([^\n]*) -->.*<!--X-From-R13: ([^\n]*) -->.*<!--X-Date: ([^\n]*) -->.*<!--X-Body-of-Message-->(.*)<!--X-Body-of-Message-End-->", re.DOTALL)
    hp = HTMLParser()

    def __init__(self):
        self.subject = None
        self.author = None
        self.date = None
        self.body = None

    def parse(self, contents):
        contents = lossy_unicode(contents)
        match = self.rematcher.search(contents)
        if not match:
            return False
        self.subject = self.hp.unescape(match.group(1))
        self.author = self.almost_rot13(self.hp.unescape(match.group(2)))
        if not self.parse_date(self.hp.unescape(match.group(3))):
            return False
        self.body = self.hp.unescape(match.group(4))
        return True

    _date_multi_re = re.compile(' \((\w+\s\w+|)\)$')
    _date_trailing_envelope = re.compile('\s+\(envelope.*\)$')

    def parse_date(self, d):
        # For some reason, we have dates that look like this:
        # http://archives.postgresql.org/pgsql-bugs/1999-05/msg00018.php
        # Looks like an mhonarc bug, but let's just remove that trailing
        # stuff here to be sure...
        if self._date_trailing_envelope.search(d):
            d = self._date_trailing_envelope.sub('', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem comes from the space within the
        # parentheses, or from the contents of the parentheses being
        # completely empty.
        if self._date_multi_re.search(d):
            d = self._date_multi_re.sub('', d)

        # Isn't it wonderful to get a string with a trailing quote but no
        # leading quote? MUAs are weird...
        if d.endswith('"') and not d.startswith('"'):
            d = d[:-1]

        # We also have "known incorrect timezone specs".
        if d.endswith('MST7MDT'):
            d = d[:-4]
        elif d.endswith('METDST'):
            d = d[:-3]
        elif d.endswith('"MET'):
            d = d[:-4] + "MET"

        try:
            self.date = dateutil.parser.parse(d)
        except ValueError:
            log("Failed to parse date '%s'" % d)
            return False

        if self.date.utcoffset():
            # We have some messages with completely incorrect UTC offsets,
            # so we need to reject those too
            if self.date.utcoffset() > timedelta(hours=12) or self.date.utcoffset() < timedelta(hours=-12):
                log("Failed to parse date '%s', timezone offset out of range." % d)
                return False

        return True

    # Semi-hacked rot13, because the one used by mhonarc is broken.
    # So we copy the brokenness here.
    # This code is from MHonArc/ewhutil.pl, mrot13()
    _arot13_trans = dict(list(zip(list(map(ord,
                                           '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[abcdefghijklmnopqrstuvwxyz')),
                                  'NOPQRSTUVWXYZ[@ABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm')))

    def almost_rot13(self, s):
        return str(s).translate(self._arot13_trans)


class RobotsParser(object):
    def __init__(self, url):
        try:


@@ -1,62 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lib.log import log
from lib.archives import MultiListCrawler
from lib.threadwrapper import threadwrapper

from ConfigParser import ConfigParser
from optparse import OptionParser
import psycopg2
import sys
import time


def doit(opt):
    cp = ConfigParser()
    cp.read("search.ini")
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(cp.get("search", "db"))
    curs = conn.cursor()

    if opt.list:
        # Multiple lists can be specified with a comma separator (no spaces)
        curs.execute("SELECT id,name FROM lists WHERE name=ANY(%(names)s)", {
            'names': opt.list.split(','),
        })
    else:
        curs.execute("SELECT id,name FROM lists WHERE active ORDER BY id")
    listinfo = [(id, name) for id, name in curs.fetchall()]

    c = MultiListCrawler(listinfo, conn, opt.status_interval, opt.commit_interval)
    n = c.crawl(opt.full, opt.month)

    # Update total counts
    curs.execute("WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list")
    # Indicate when we crawled
    curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
    conn.commit()

    log("Indexed %s messages" % n)
    time.sleep(1)


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-l", "--list", dest='list', help="Crawl only this list")
    parser.add_option("-m", "--month", dest='month', help="Crawl only this month")
    parser.add_option("-f", "--full", dest='full', action="store_true", help="Make a full crawl")
    parser.add_option("-t", "--status-interval", dest='status_interval', help="Seconds between status updates")
    parser.add_option("-c", "--commit-interval", dest='commit_interval', help="Messages between each commit")
    (opt, args) = parser.parse_args()

    if opt.full and opt.month:
        print("Can't use both full and specific month!")
        sys.exit(1)

    # Assign default values
    opt.status_interval = opt.status_interval and int(opt.status_interval) or 30
    opt.commit_interval = opt.commit_interval and int(opt.commit_interval) or 500

    threadwrapper(doit, opt)


@@ -1,49 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lib.log import log

from ConfigParser import ConfigParser
import psycopg2
import urllib
import simplejson as json


if __name__ == "__main__":
    cp = ConfigParser()
    cp.read("search.ini")
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(cp.get("search", "db"))
    curs = conn.cursor()

    u = urllib.urlopen("http://%s/community/lists/listinfo/" % cp.get("search", "web"))
    obj = json.load(u)
    u.close()

    # We don't care about the groups here, just the lists!
    curs.execute("SELECT id, name, grp, active FROM lists")
    lists = curs.fetchall()
    for id, name, groupid, active in lists:
        thislist = [x for x in obj['lists'] if x['id'] == id]
        if len(thislist) == 0:
            log("List %s should be removed, do that manually!" % name)
        else:
            # Compare contents of list
            l = thislist[0]
            if l['name'] != name:
                log("Renaming list %s -> %s" % (name, l['name']))
                curs.execute("UPDATE lists SET name=%(name)s WHERE id=%(id)s", l)
            if thislist[0]['active'] != active:
                log("Changing active flag for %s to %s" % (l['name'], l['active']))
                curs.execute("UPDATE lists SET active=%(active)s WHERE id=%(id)s", l)
            if thislist[0]['groupid'] != groupid:
                log("Changing group for %s to %s" % (l['name'], l['groupid']))
                curs.execute("UPDATE lists SET grp=%(groupid)s WHERE id=%(id)s", l)

    for l in obj['lists']:
        thislist = [x for x in lists if x[0] == l['id']]
        if len(thislist) == 0:
            log("Adding list %s" % l['name'])
            curs.execute("INSERT INTO lists (id, name, grp, active, pagecount) VALUES (%(id)s, %(name)s, %(groupid)s, %(active)s, 0)",
                         l)

    conn.commit()