postgres-web/tools/search/crawler/webcrawler.py
Magnus Hagander b8a2015be2 New set of web search crawlers and infrastructure
Replaces the old search code with something that's not quite as much
spaghetti (i.e., not evolved over too long a time), and more stable
(actual error handling instead of random crashes).

Crawlers are now also multithreaded to deal with higher latency to some
sites.
2012-01-21 15:27:06 +01:00
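
The commit message notes that the crawlers are multithreaded to cope with higher latency to some sites; the threading itself lives in the lib/ modules imported by this script (threadwrapper, GenericSiteCrawler, SitemapSiteCrawler), which are not shown here. Purely as a hypothetical illustration of the idea, a crawler can hand a shared work list to a small pool of worker threads so that one slow fetch does not stall the rest. The names crawl_pages, fetch and num_workers below are invented for this sketch and do not appear in the real code:

import threading

def crawl_pages(urls, fetch, num_workers=5):
    # Hypothetical helper: download every URL in 'urls' using a small
    # pool of threads. 'fetch' is a caller-supplied function that
    # retrieves and stores a single page.
    lock = threading.Lock()
    pending = list(urls)

    def worker():
        while True:
            # Pop the next URL under a lock so two threads never fetch
            # the same page; exit when the work list is empty.
            with lock:
                if not pending:
                    return
                url = pending.pop()
            fetch(url)

    threads = [threading.Thread(target=worker) for _ in range(num_workers)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()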

42 lines · 1.1 KiB · Python · Executable File
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from lib.log import log
from lib.genericsite import GenericSiteCrawler
from lib.sitemapsite import SitemapSiteCrawler
from lib.threadwrapper import threadwrapper

from ConfigParser import ConfigParser
import psycopg2
import time


def doit():
    # Return database text fields as unicode strings
    psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    conn = psycopg2.connect(cp.get("search", "db"))
    curs = conn.cursor()

    # Start by indexing the main website
    log("Starting indexing of main website")
    SitemapSiteCrawler("www.postgresql.org", conn, 1, cp.get("search", "frontendip")).crawl()
    conn.commit()

    # Skip id=1, which is the main site..
    curs.execute("SELECT id, hostname FROM sites WHERE id>1")
    for siteid, hostname in curs.fetchall():
        log("Starting indexing of %s" % hostname)
        GenericSiteCrawler(hostname, conn, siteid).crawl()
        conn.commit()

    # Refresh the cached per-site page counts
    curs.execute("WITH t AS (SELECT site,count(*) AS c FROM webpages GROUP BY site) UPDATE sites SET pagecount=t.c FROM t WHERE id=t.site")
    conn.commit()

    time.sleep(1)


if __name__ == "__main__":
    cp = ConfigParser()
    cp.read("search.ini")

    threadwrapper(doit)
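
For context, the __main__ block reads its settings from a search.ini file via ConfigParser, and the code above only consults the db and frontendip keys of a [search] section. A minimal file of that shape might look like the following; the values are placeholders for illustration, not the real deployment settings:

[search]
; DSN string passed straight to psycopg2.connect()
db=dbname=pgweb user=crawler host=localhost
; IP address handed to SitemapSiteCrawler for the main site
frontendip=127.0.0.1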