postgres-web/tools/ftp/spider_ftp.py
Magnus Hagander 521920542b Get the information for the ftp browser from a file that is generated
on the ftp server, instead of crawling the directories directly. This
removes the requirement to sync almost 10GB worth of ftp site onto the
web server...

The pickle file for this is currently around 1MB, so it's not a huge
burden on the server. If it grows larger in the future, we may want to
re-think this and split it up, or put it in a database format or something
like that.
2010-06-10 14:46:16 +02:00
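
(For context, a minimal sketch of how the consuming side might read the
generated file; the actual wwwmaster code is not part of this commit, and
the path and directory key below are illustrative assumptions:)

    import cPickle as pickle

    # Load the pickle generated by spider_ftp.py (path is hypothetical)
    f = open("/usr/local/pgweb/ftp.pickle", "rb")
    allnodes = pickle.load(f)
    f.close()

    # Each key is a directory path relative to the ftp root
    node = allnodes["source/v9.0"]  # illustrative key
    for name, info in sorted(node.items()):
        if info['t'] == 'd':
            print name + "/"
        elif info['t'] == 'l':
            print "%s -> %s" % (name, info['d'])
        else:
            print "%s  %d bytes" % (name, info['s'])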

71 lines
1.5 KiB
Python
Executable File

#!/usr/bin/python
#
# spider_ftp.py - spider the ftp site and generate an output file with all
#                 the metadata we require, that can be transferred over to
#                 the wwwmaster server.
#

import sys
import os
from datetime import datetime
import cPickle as pickle
#from pprint import pprint

allnodes = {}
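# allnodes maps a directory path relative to the ftp root ("" for the
# root itself) to a dict describing that directory's entries. Each entry
# maps a filename to a dict with:
#   't' - entry type: 'f' (file), 'd' (directory) or 'l' (symbolic link)
#   's' - size in bytes (files only)
#   'd' - mtime as a datetime (files), or the link target (symlinks)
#   'c' - contents of README, CURRENT_MAINTAINER and .message files
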
def read_file(fn):
    f = open(fn, "r")
    t = f.read()
    f.close()
    return t

def parse_directory(dirname, rootlen):
    mynode = {}
    for f in os.listdir(dirname):
        if f.startswith(".") and not f == ".message": continue
        if f == "sync_timestamp": continue
        fn = os.path.join(dirname, f)
        if os.path.isdir(fn):
            # Can be a directory itself, or a symbolic link to a directory
            if os.path.islink(fn):
                # This is a symbolic link
                mynode[f] = {
                    't': 'l',
                    'd': os.readlink(fn),
                }
            else:
                # This is a subdirectory, recurse into it
                parse_directory(fn, rootlen)
                mynode[f] = {
                    't': 'd',
                }
        else:
            # This is a file
            stat = os.stat(fn)
            mynode[f] = {
                't': 'f',
                's': stat.st_size,
                'd': datetime.fromtimestamp(stat.st_mtime),
            }
            if f == "README" or f == "CURRENT_MAINTAINER" or f == ".message":
                mynode[f]['c'] = read_file(fn)
    allnodes[dirname[rootlen:].strip("/")] = mynode

def Usage():
    print "Usage: spider_ftp.py <ftp_root> <pickle_file>"
    sys.exit(1)

if len(sys.argv) != 3: Usage()

parse_directory(sys.argv[1], len(sys.argv[1]))

# Write to a temporary file, then rename it into place, so a reader
# never sees a partially written pickle.
f = open(sys.argv[2] + ".tmp", "wb")
pickle.dump(allnodes, f)
f.close()
os.rename(sys.argv[2] + ".tmp", sys.argv[2])

#pprint(allnodes)
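
# Example invocation (both paths are hypothetical, not from this commit):
#
#   ./spider_ftp.py /srv/ftp /usr/local/pgweb/ftp.pickle
#
# The resulting pickle file is then transferred to the wwwmaster server,
# which reads it instead of crawling the ftp tree itself.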