postgres-web/tools/ftp/spider_ftp.py
Magnus Hagander 0b4eff93e2 Exclude repositories from ftp browser
There is no need to browse into the repositories on the main website.
It's still possible to browse there directly on the ftp servers, of course,
for those who need to debug a repository install and the like.
2013-10-18 17:59:52 +02:00


#!/usr/bin/python
#
# spider_ftp.py - spider the ftp site and generate an output file with all
#                 the metadata we require, that can be transferred over to
#                 the master web server.
#
import sys
import os
from datetime import datetime
import cPickle as pickle
import codecs
import urllib2
# Directories, specified from the root of the ftp tree and down, that
# will be recursively excluded from the pickle.
exclude_roots = ['/repos', ]
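# Note that exclusion matching compares fn[rootlen:], i.e. the path with
# the ftp root prefix stripped, so entries must start with '/' and the
# root argument should be passed without a trailing slash.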
allnodes = {}
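# allnodes maps each directory path, relative to the ftp root ('' for the
# root itself), to a dict of entry name -> metadata dict: 't' is the node
# type ('d' directory, 'f' file, 'l' symlink), 's' the file size, 'd' the
# mtime for files or the link target for symlinks, and 'c' the inlined
# contents of README, CURRENT_MAINTAINER and .message files.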
def read_file(fn):
    f = codecs.open(fn, 'r', encoding='utf-8', errors='replace')
    t = f.read()
    f.close()
    return t
def parse_directory(dirname, rootlen):
    mynode = {}
    for f in os.listdir(dirname):
        if f.startswith(".") and not f == ".message": continue
        if f == "sync_timestamp": continue
        fn = os.path.join(dirname, f)
        if os.path.isdir(fn):
            # Can be a directory itself, or a symbolic link to a directory
            if os.path.islink(fn):
                # This is a symbolic link
                mynode[f] = {
                    't': 'l',
                    'd': os.readlink(fn),
                }
            else:
                # This is a subdirectory, recurse into it, unless it happens
                # to be on our exclude list.
                if not fn[rootlen:] in exclude_roots:
                    parse_directory(fn, rootlen)
                    mynode[f] = {
                        't': 'd',
                    }
        else:
            # This is a file
            stat = os.stat(fn)
            mynode[f] = {
                't': 'f',
                's': stat.st_size,
                'd': datetime.fromtimestamp(stat.st_mtime),
            }
            # Inline the contents of a few well-known informational files
            # so they are available directly from the pickle.
            if f == "README" or f == "CURRENT_MAINTAINER" or f == ".message":
                mynode[f]['c'] = read_file(fn)
    allnodes[dirname[rootlen:].strip("/")] = mynode
def Usage():
    print "Usage: spider_ftp.py <ftp_root> <pickle_file>"
    print ""
    print "If <pickle_file> starts with http[s]://, the file will be uploaded"
    print "to that URL instead of written to the filesystem."
    sys.exit(1)
if len(sys.argv) != 3: Usage()
parse_directory(sys.argv[1], len(sys.argv[1]))
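# Ship the result: either PUT the pickle to an http[s] URL (the server is
# expected to answer "OK" or "NOT CHANGED"), or write it to a local file.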
if sys.argv[2].startswith("http://") or sys.argv[2].startswith("https://"):
    o = urllib2.build_opener(urllib2.HTTPHandler)
    r = urllib2.Request(sys.argv[2], data=pickle.dumps(allnodes))
    r.add_header('Content-type', 'application/octet-stream')
    r.add_header('Host', 'www.postgresql.org')
    r.get_method = lambda: 'PUT'
    u = o.open(r)
    x = u.read()
    if x != "NOT CHANGED" and x != "OK":
        print "Failed to upload: %s" % x
        sys.exit(1)
else:
    # Write to a temporary file first and rename it into place, so a
    # reader never sees a half-written pickle.
    f = open(sys.argv[2] + ".tmp", "wb")
    pickle.dump(allnodes, f)
    f.close()
    os.rename(sys.argv[2] + ".tmp", sys.argv[2])
#pprint(allnodes)
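
For reference, here is a minimal sketch of how the receiving side might load and walk this pickle. It is an illustration only: the file name "ftpsite.pickle" and the traversal are assumptions, not the actual website code that consumes this data.

#!/usr/bin/python
# consumer_sketch.py - hypothetical consumer of the pickle produced above;
# not part of the real web server code.
import cPickle as pickle

with open("ftpsite.pickle", "rb") as f:  # hypothetical file name
    allnodes = pickle.load(f)

# Keys are directory paths relative to the ftp root ('' is the root).
for path, entries in sorted(allnodes.items()):
    print "/%s" % path
    for name, meta in sorted(entries.items()):
        if meta['t'] == 'd':
            print "    %s/" % name
        elif meta['t'] == 'l':
            print "    %s -> %s" % (name, meta['d'])
        else:
            print "    %s (%d bytes, %s)" % (name, meta['s'], meta['d'])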