mirror of
https://github.com/postgres/pgweb.git
synced 2025-08-03 15:38:59 +00:00

on the ftp server, instead of crawling the directories directly. This removes the requirement to sync almost 10 GB worth of ftp site onto the web server... The pickle file for this is currently around 1 MB, so it's not a huge burden on the server. If it grows larger in the future, we may want to re-think this and split it up, or put it in a database format or something like that.
71 lines
1.5 KiB
Python
Executable File
71 lines
1.5 KiB
Python
Executable File
#!/usr/bin/python
|
|
|
|
#
|
|
# spider_ftp.py - spider the ftp site and generate an output file with all
|
|
# the metadata we require, that can be transferred over to
|
|
# the wwwmaster server.
|
|
#
|
|
|
|
import sys
|
|
import os
|
|
from datetime import datetime
|
|
import cPickle as pickle
|
|
#from pprint import pprint
|
|
|
|
# Accumulates the result of the spider run. parse_directory() adds one item
# per visited directory: the key is the directory path with the root prefix
# and surrounding slashes removed, the value is a dict mapping each entry
# name to its metadata ('t' type, plus 's'/'d'/'c' depending on the type).
allnodes = {}
|
|
|
|
def read_file(fn):
    """Return the entire contents of the file named ``fn``.

    The file is opened in text mode ("r"). Any error raised by open() or
    read() propagates to the caller.
    """
    # The with-block guarantees the handle is closed even if read() raises.
    with open(fn, "r") as f:
        return f.read()
|
|
|
|
def parse_directory(dirname, rootlen):
    """Record the entries of ``dirname`` in ``allnodes``, recursing into subdirectories.

    ``rootlen`` is the number of leading characters to cut from ``dirname``
    when building the key under which this directory is stored; the same
    value is passed unchanged to every recursive call.

    Each entry is stored as a small dict:
      * symlink to a directory: {'t': 'l', 'd': <link target>}
      * real subdirectory:      {'t': 'd'}
      * anything else:          {'t': 'f', 's': <size>, 'd': <mtime as datetime>}
        plus 'c' with the file contents for README, CURRENT_MAINTAINER
        and .message.
    """
    entries = {}
    for name in os.listdir(dirname):
        # Dot-files are ignored, with .message as the single exception
        if name.startswith(".") and name != ".message":
            continue
        # The sync marker is not part of the published tree
        if name == "sync_timestamp":
            continue

        path = os.path.join(dirname, name)
        if not os.path.isdir(path):
            # Not a directory (nor a link to one): treat it as a file
            info = os.stat(path)
            entry = {
                't': 'f',
                's': info.st_size,
                'd': datetime.fromtimestamp(info.st_mtime),
            }
            if name in ("README", "CURRENT_MAINTAINER", ".message"):
                # These files have their contents embedded in the metadata
                entry['c'] = read_file(path)
            entries[name] = entry
        elif os.path.islink(path):
            # Symbolic link pointing at a directory: store the target only,
            # do not descend into it
            entries[name] = {'t': 'l', 'd': os.readlink(path)}
        else:
            # Real subdirectory: it gets its own item in allnodes via the
            # recursive call, and is listed here by type only
            parse_directory(path, rootlen)
            entries[name] = {'t': 'd'}

    allnodes[dirname[rootlen:].strip("/")] = entries
|
|
|
|
def Usage():
    """Print the command-line synopsis and terminate with exit status 1."""
    # Single-argument parenthesised print behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print("Usage: spider_ftp.py <ftp_root> <pickle_file>")
    sys.exit(1)
|
|
|
|
# Script body: expects exactly two arguments, the ftp root directory and the
# path of the pickle file to produce.
if len(sys.argv) != 3:
    Usage()

# Walk the tree. The length of the root argument is passed along so that
# parse_directory() can cut that prefix off the keys it stores.
parse_directory(sys.argv[1], len(sys.argv[1]))

# Dump to "<pickle_file>.tmp" first and rename afterwards, so the target path
# is only replaced once the dump has completed. The with-block closes the
# handle even if pickle.dump() raises.
with open(sys.argv[2] + ".tmp", "wb") as f:
    pickle.dump(allnodes, f)
os.rename(sys.argv[2] + ".tmp", sys.argv[2])
|
|
|
|
#pprint(allnodes)
|