Mirror of https://github.com/postgres/pgweb.git, synced 2025-08-15 21:34:46 +00:00
Remove indexer for old archives
We have been using the new archives for many years now, so remove this instead of trying to port it to python3.
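For context, a hypothetical sketch (not part of this commit) of what a Python 3 port would mostly have involved: swapping the Python-2-only modules the removed code relies on for their standard-library counterparts.

# Illustrative Python 3 replacements for the Python-2-only modules used by the removed indexer.
# This port was never written; the mapping is shown only to indicate what it would have required.
import http.client                      # replaces: import httplib
from queue import Queue                 # replaces: from Queue import Queue
from configparser import ConfigParser   # replaces: from ConfigParser import ConfigParser
from urllib.request import urlopen      # replaces: urllib.urlopen(...)
import json                             # replaces: import simplejson as json
from html import unescape               # replaces: HTMLParser().unescape(...)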
@@ -1,168 +0,0 @@
import datetime
import httplib
from Queue import Queue
import threading
import sys
import time

from lib.log import log
from lib.parsers import ArchivesParser


class MultiListCrawler(object):
    def __init__(self, lists, conn, status_interval=30, commit_interval=500):
        self.lists = lists
        self.conn = conn
        self.status_interval = status_interval
        self.commit_interval = commit_interval

        self.queue = Queue()
        self.counter = 0
        self.counterlock = threading.RLock()
        self.stopevent = threading.Event()

    def crawl(self, full=False, month=None):
        # Each thread can independently run on one month, so we can get
        # a reasonable spread. Therefore, submit them as separate jobs
        # to the queue.
        for listid, listname in self.lists:
            if full:
                # Generate a sequence of everything to index
                for year in range(1997, datetime.datetime.now().year + 1):
                    for month in range(1, 13):
                        self.queue.put((listid, listname, year, month, -1))
            elif month:
                # Do one specific month
                pieces = month.split("-")
                if len(pieces) != 2:
                    print("Month format is <y>-<m>, cannot parse '%s'" % month)
                    sys.exit(1)
                try:
                    pieces = [int(x) for x in pieces]
                except:
                    print("Month format is <y>-<m>, cannot convert '%s' to integers" % month)
                    sys.exit(1)
                self.queue.put((listid, listname, pieces[0], pieces[1], -1))
            else:
                # In incremental scan, we check the current month and the
                # previous one, but only for new messages.
                curs = self.conn.cursor()
                curr = datetime.date.today()
                if curr.month == 1:
                    prev = datetime.date(curr.year - 1, 12, 1)
                else:
                    prev = datetime.date(curr.year, curr.month - 1, 1)

                for d in curr, prev:
                    # Figure out what the highest indexed page in this
                    # month is.
                    curs.execute("SELECT max(msgnum) FROM messages WHERE list=%(list)s AND year=%(year)s AND month=%(month)s", {
                        'list': listid,
                        'year': d.year,
                        'month': d.month,
                    })
                    x = curs.fetchall()
                    if x[0][0] is not None:
                        maxmsg = x[0][0]
                    else:
                        maxmsg = -1
                    self.queue.put((listid, listname, d.year, d.month, maxmsg))

        for x in range(5):
            t = threading.Thread(name="Indexer %s" % x,
                                 target=lambda: self.crawl_from_queue())
            t.daemon = True
            t.start()

        t = threading.Thread(name="statusthread", target=lambda: self.status_thread())
        t.daemon = True
        t.start()

        # XXX: need to find a way to deal with all threads crashed and
        # not done here yet!
        self.queue.join()
        self.stopevent.set()

        return self.counter

    def status_thread(self):
        lastcommit = 0
        starttime = time.time()
        while not self.stopevent.is_set():
            self.stopevent.wait(self.status_interval)
            nowtime = time.time()
            with self.counterlock:
                log("Indexed %s messages so far (%s active threads, %s months still queued, %.1f msg/sec)" % (
                    self.counter,
                    threading.active_count() - 2,  # main thread + status thread
                    self.queue.qsize(),
                    self.counter / (nowtime - starttime),
                ))
                # Commit every 500 messages
                if self.counter - lastcommit > self.commit_interval:
                    lastcommit = self.counter
                    self.conn.commit()

    def crawl_from_queue(self):
        while not self.stopevent.is_set():
            (listid, listname, year, month, maxmsg) = self.queue.get()
            self.crawl_month(listid, listname, year, month, maxmsg)
            self.queue.task_done()

    def crawl_month(self, listid, listname, year, month, maxmsg):
        currentmsg = maxmsg
        while True:
            currentmsg += 1
            try:
                if not self.crawl_single_message(listid, listname, year, month, currentmsg):
                    break
            except Exception as e:
                log("Exception when crawling %s/%s/%s/%s - %s" % (
                    listname, year, month, currentmsg, e))
                # Continue on to try the next message

    def crawl_single_message(self, listid, listname, year, month, msgnum):
        curs = self.conn.cursor()
        h = httplib.HTTPConnection(host="archives.postgresql.org",
                                   port=80,
                                   strict=True,
                                   timeout=10)
        url = "/%s/%04d-%02d/msg%05d.php" % (
            listname,
            year,
            month,
            msgnum)
        h.putrequest("GET", url)
        h.putheader("User-agent", "pgsearch/0.2")
        h.putheader("Connection", "close")
        h.endheaders()
        resp = h.getresponse()
        txt = resp.read()
        h.close()

        if resp.status == 404:
            # Past the end of the month
            return False
        elif resp.status != 200:
            raise Exception("%s/%s/%s/%s returned status %s" % (listname, year, month, msgnum, resp.status))

        # Else we have the message!
        p = ArchivesParser()
        if not p.parse(txt):
            log("Failed to parse %s/%s/%s/%s" % (listname, year, month, msgnum))
            # We return true to move on to the next message anyway!
            return True
        curs.execute("INSERT INTO messages (list, year, month, msgnum, date, subject, author, txt, fti) VALUES (%(listid)s, %(year)s, %(month)s, %(msgnum)s, %(date)s, %(subject)s, %(author)s, %(txt)s, setweight(to_tsvector('pg', %(subject)s), 'A') || to_tsvector('pg', %(txt)s))", {
            'listid': listid,
            'year': year,
            'month': month,
            'msgnum': msgnum,
            'date': p.date,
            'subject': p.subject[:127],
            'author': p.author[:127],
            'txt': p.body,
        })
        with self.counterlock:
            self.counter += 1

        return True
@@ -58,85 +58,6 @@ class GenericHtmlParser(HTMLParser):
        return self.pagedata.read()


class ArchivesParser(object):
    rematcher = re.compile("<!--X-Subject: ([^\n]*) -->.*<!--X-From-R13: ([^\n]*) -->.*<!--X-Date: ([^\n]*) -->.*<!--X-Body-of-Message-->(.*)<!--X-Body-of-Message-End-->", re.DOTALL)
    hp = HTMLParser()

    def __init__(self):
        self.subject = None
        self.author = None
        self.date = None
        self.body = None

    def parse(self, contents):
        contents = lossy_unicode(contents)
        match = self.rematcher.search(contents)
        if not match:
            return False
        self.subject = self.hp.unescape(match.group(1))
        self.author = self.almost_rot13(self.hp.unescape(match.group(2)))
        if not self.parse_date(self.hp.unescape(match.group(3))):
            return False
        self.body = self.hp.unescape(match.group(4))
        return True

    _date_multi_re = re.compile(' \((\w+\s\w+|)\)$')
    _date_trailing_envelope = re.compile('\s+\(envelope.*\)$')

    def parse_date(self, d):
        # For some reason, we have dates that look like this:
        # http://archives.postgresql.org/pgsql-bugs/1999-05/msg00018.php
        # Looks like an mhonarc bug, but let's just remove that trailing
        # stuff here to be sure...
        if self._date_trailing_envelope.search(d):
            d = self._date_trailing_envelope.sub('', d)

        # We have a number of dates in the format
        # "<full datespace> +0200 (MET DST)"
        # or similar. The problem coming from the space within the
        # parenthesis, or if the contents of the parenthesis is
        # completely empty
        if self._date_multi_re.search(d):
            d = self._date_multi_re.sub('', d)
        # Isn't it wonderful with a string with a trailing quote but no
        # leading quote? MUA's are weird...
        if d.endswith('"') and not d.startswith('"'):
            d = d[:-1]

        # We also have "known incorrect timezone specs".
        if d.endswith('MST7MDT'):
            d = d[:-4]
        elif d.endswith('METDST'):
            d = d[:-3]
        elif d.endswith('"MET'):
            d = d[:-4] + "MET"

        try:
            self.date = dateutil.parser.parse(d)
        except ValueError:
            log("Failed to parse date '%s'" % d)
            return False

        if self.date.utcoffset():
            # We have some messages with completely incorrect utc offsets,
            # so we need to reject those too
            if self.date.utcoffset() > timedelta(hours=12) or self.date.utcoffset() < timedelta(hours=-12):
                log("Failed to parse date '%s', timezone offset out of range." % d)
                return False

        return True

    # Semi-hacked rot13, because the one used by mhonarc is broken.
    # So we copy the brokenness here.
    # This code is from MHonArc/ewhutil.pl, mrot13()
    _arot13_trans = dict(list(zip(list(map(ord,
                                           '@ABCDEFGHIJKLMNOPQRSTUVWXYZ[abcdefghijklmnopqrstuvwxyz')),
                                  'NOPQRSTUVWXYZ[@ABCDEFGHIJKLMnopqrstuvwxyzabcdefghijklm')))

    def almost_rot13(self, s):
        return str(s).translate(self._arot13_trans)


class RobotsParser(object):
    def __init__(self, url):
        try:
@@ -1,62 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lib.log import log
from lib.archives import MultiListCrawler
from lib.threadwrapper import threadwrapper
from ConfigParser import ConfigParser
from optparse import OptionParser
import psycopg2
import sys
import time


def doit(opt):
    cp = ConfigParser()
    cp.read("search.ini")
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(cp.get("search", "db"))

    curs = conn.cursor()

    if opt.list:
        # Multiple lists can be specified with a comma separator (no spaces)
        curs.execute("SELECT id,name FROM lists WHERE name=ANY(%(names)s)", {
            'names': opt.list.split(','),
        })
    else:
        curs.execute("SELECT id,name FROM lists WHERE active ORDER BY id")

    listinfo = [(id, name) for id, name in curs.fetchall()]
    c = MultiListCrawler(listinfo, conn, opt.status_interval, opt.commit_interval)
    n = c.crawl(opt.full, opt.month)

    # Update total counts
    curs.execute("WITH t AS (SELECT list,count(*) AS c FROM messages GROUP BY list) UPDATE lists SET pagecount=t.c FROM t WHERE id=t.list")
    # Indicate when we crawled
    curs.execute("UPDATE lastcrawl SET lastcrawl=CURRENT_TIMESTAMP")
    conn.commit()

    log("Indexed %s messages" % n)
    time.sleep(1)


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-l", "--list", dest='list', help="Crawl only this list")
    parser.add_option("-m", "--month", dest='month', help="Crawl only this month")
    parser.add_option("-f", "--full", dest='full', action="store_true", help="Make a full crawl")
    parser.add_option("-t", "--status-interval", dest='status_interval', help="Seconds between status updates")
    parser.add_option("-c", "--commit-interval", dest='commit_interval', help="Messages between each commit")

    (opt, args) = parser.parse_args()

    if opt.full and opt.month:
        print("Can't use both full and specific month!")
        sys.exit(1)

    # assign default values
    opt.status_interval = opt.status_interval and int(opt.status_interval) or 30
    opt.commit_interval = opt.commit_interval and int(opt.commit_interval) or 500

    threadwrapper(doit, opt)
@@ -1,49 +0,0 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from lib.log import log
from ConfigParser import ConfigParser
import psycopg2
import urllib
import simplejson as json

if __name__ == "__main__":
    cp = ConfigParser()
    cp.read("search.ini")
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(cp.get("search", "db"))
    curs = conn.cursor()

    u = urllib.urlopen("http://%s/community/lists/listinfo/" % cp.get("search", "web"))
    obj = json.load(u)
    u.close()

    # We don't care about the groups here, just the lists!
    curs.execute("SELECT id, name, grp, active FROM lists")
    lists = curs.fetchall()
    for id, name, groupid, active in lists:
        thislist = [x for x in obj['lists'] if x['id'] == id]
        if len(thislist) == 0:
            log("List %s should be removed, do that manually!" % name)
        else:
            # Compare contents of list
            l = thislist[0]
            if l['name'] != name:
                log("Renaming list %s -> %s" % (name, l['name']))
                curs.execute("UPDATE lists SET name=%(name)s WHERE id=%(id)s", l)

            if thislist[0]['active'] != active:
                log("Changing active flag for %s to %s" % (l['name'], l['active']))
                curs.execute("UPDATE lists SET active=%(active)s WHERE id=%(id)s", l)
            if thislist[0]['groupid'] != groupid:
                log("Changing group for %s to %s" % (l['name'], l['groupid']))
                curs.execute("UPDATE lists SET grp=%(groupid)s WHERE id=%(id)s", l)

    for l in obj['lists']:
        thislist = [x for x in lists if x[0] == l['id']]
        if len(thislist) == 0:
            log("Adding list %s" % l['name'])
            curs.execute("INSERT INTO lists (id, name, grp, active, pagecount) VALUES (%(id)s, %(name)s, %(groupid)s, %(active)s, 0)",
                         l)

    conn.commit()