Get the information for the ftp browser from a file that is generated
on the ftp server, instead of crawling the directories directly. This
removes the requirement to sync almost 10GB worth of ftp site onto the
web server...

The pickle file for this is currently around 1MB, so it's not a huge
burden on the server. If it grows larger in the future, we may want to
rethink this and split it up, or put it in a database, or something
like that.
Magnus Hagander
2010-06-10 14:46:16 +02:00
parent 778ffdb158
commit 521920542b
3 changed files with 105 additions and 50 deletions
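
For reference, the pickle written by the new spider (see tools/ftp/spider_ftp.py
below) maps each path relative to the ftp root ("" for the root itself) to a
dict of entries keyed by name. A minimal sketch of the layout; the paths, sizes
and dates here are made-up examples:

    # Sketch only; every value below is hypothetical.
    from datetime import datetime
    allnodes = {
        '': {
            'source': {'t': 'd'},                    # subdirectory
            'README': {'t': 'f', 's': 1024,          # plain file with size,
                       'd': datetime(2010, 6, 10),   # mtime,
                       'c': 'file contents'},        # and inlined content
        },
        'source': {
            'v8.4.4': {'t': 'd'},
            'latest': {'t': 'l', 'd': 'v8.4.4'},     # symlink to a relative target
        },
    }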


@@ -1,5 +1,5 @@
 from django.shortcuts import render_to_response, get_object_or_404
-from django.http import HttpResponse, Http404, HttpResponseRedirect
+from django.http import HttpResponse, Http404, HttpResponseRedirect, HttpResponseServerError
 from django.template import TemplateDoesNotExist, loader, Context
 from django.contrib.auth.decorators import login_required
 from django.db import connection, transaction
@@ -8,6 +8,7 @@ from django.conf import settings
 import os
 from datetime import datetime
 import urlparse
+import cPickle as pickle
 
 from pgweb.util.decorators import ssl_required, nocache
 from pgweb.util.contexts import NavContext
@@ -19,46 +20,6 @@ from forms import *
 #######
 # FTP browser
 #######
-def _getfiledata(root, paths):
-	for path in paths:
-		fn = "%s/%s" % (root,path)
-		if not os.path.isfile(fn):
-			continue
-		stat = os.stat(fn)
-		yield {
-			'name': path,
-			'mtime': datetime.fromtimestamp(stat.st_mtime),
-			'size': stat.st_size,
-		}
-
-def _getdirectorydata(root, paths):
-	for path in paths:
-		fn = "%s/%s" % (root,path)
-		if not os.path.isdir(fn):
-			continue
-		if os.path.islink(fn):
-			# This is a link, so change the url to point directly
-			# to the link target. We'll just assume the link
-			# is safe. Oh, and links must be relative
-			yield {
-				'link': path,
-				'url': os.readlink(fn),
-			}
-		else:
-			yield {
-				'link': path,
-				'url': path,
-			}
-
-def _getfile(root, filename):
-	fn = "%s/%s" % (root,filename)
-	if os.path.isfile(fn):
-		f = open(fn)
-		r = f.read()
-		f.close()
-		return r
-	return None
-
 def ftpbrowser(request, subpath):
 	if subpath:
 		# An actual path has been selected. Fancy!
@@ -67,20 +28,35 @@ def ftpbrowser(request, subpath):
 			# Just claim it doesn't exist if the user tries to do this
 			# type of bad thing
 			raise Http404
-		fspath = os.path.join(settings.FTP_ROOT, subpath)
+		subpath = subpath.strip('/')
 	else:
-		fspath = settings.FTP_ROOT
+		subpath = ""
 
-	if not os.path.isdir(fspath):
+	# Pickle up the list of things we need
+	try:
+		f = open(settings.FTP_PICKLE, "rb")
+		allnodes = pickle.load(f)
+		f.close()
+	except Exception, e:
+		return HttpResponseServerError("Failed to load ftp site information: %s" % e)
+
+	if not allnodes.has_key(subpath):
 		raise Http404
 
-	everything = [n for n in os.listdir(fspath) if not n.startswith('.')]
+	node = allnodes[subpath]
+	del allnodes
 
-	directories = list(_getdirectorydata(fspath, everything))
+	# Add all directories
+	directories = [{'link': k, 'url': k} for k,v in node.items() if v['t'] == 'd']
+	# Add all symlinks (only directories supported)
+	directories.extend([{'link': k, 'url': v['d']} for k,v in node.items() if v['t'] == 'l'])
+
+	# Add a link to the parent directory
 	if subpath:
 		directories.append({'link':'[Parent Directory]', 'url':'..'})
 
-	files = list(_getfiledata(fspath, everything))
+	# Fetch files; the spider stores the mtime under 'd'
+	files = [{'name': k, 'mtime': v['d'], 'size': v['s']} for k,v in node.items() if v['t'] == 'f']
 
 	breadcrumbs = []
 	if subpath:
@@ -95,14 +71,21 @@ def ftpbrowser(request, subpath):
 			breadroot = pathpiece
 		breadcrumbs.append({'name': pathpiece, 'path': breadroot})
 
+	# Check if there are any "content files" we should render directly on the webpage
+	file_readme = node.has_key('README') and node['README']['c'] or None
+	file_message = node.has_key('.message') and node['.message']['c'] or None
+	file_maintainer = node.has_key('CURRENT_MAINTAINER') and node['CURRENT_MAINTAINER']['c'] or None
+	del node
+
 	return render_to_response('downloads/ftpbrowser.html', {
 		'basepath': subpath.rstrip('/'),
 		'directories': sorted(directories),
 		'files': sorted(files),
 		'breadcrumbs': breadcrumbs,
-		'readme': _getfile(fspath, 'README'),
-		'messagesfile': _getfile(fspath, '.messages'),
-		'maintainer': _getfile(fspath, 'CURRENT_MAINTAINER'),
+		'readme': file_readme,
+		'messagefile': file_message,
+		'maintainer': file_maintainer,
 	}, NavContext(request, 'download'))
 
 def _get_numeric_ip(request):


@@ -111,6 +111,8 @@ INSTALLED_APPS = [
 SITE_ROOT="http://www.postgresql.org"
 MASTERSITE_ROOT="http://wwwmaster.postgresql.org"
 
+FTP_PICKLE="/usr/local/pgweb/ftpsite.pickle"
+
 # Load local settings overrides
 from settings_local import *
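
Because settings_local is imported last, a deployment can override FTP_PICKLE
without editing this file; for example, a settings_local.py could contain
(hypothetical path):

    # settings_local.py override; path is made up
    FTP_PICKLE="/srv/pgweb/ftpsite.pickle"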

tools/ftp/spider_ftp.py (new executable file)

@@ -0,0 +1,70 @@
+#!/usr/bin/python
+#
+# spider_ftp.py - spider the ftp site and generate an output file with all
+#                 the metadata we require, that can be transferred over to
+#                 the wwwmaster server.
+#
+
+import sys
+import os
+from datetime import datetime
+import cPickle as pickle
+#from pprint import pprint
+
+allnodes = {}
+
+def read_file(fn):
+	f = open(fn, "r")
+	t = f.read()
+	f.close()
+	return t
+
+def parse_directory(dirname, rootlen):
+	mynode = {}
+	for f in os.listdir(dirname):
+		if f.startswith(".") and not f == ".message": continue
+		if f == "sync_timestamp": continue
+		fn = os.path.join(dirname, f)
+		if os.path.isdir(fn):
+			# Can be a directory itself, or a symbolic link to a directory
+			if os.path.islink(fn):
+				# This is a symbolic link
+				mynode[f] = {
+					't': 'l',
+					'd': os.readlink(fn),
+				}
+			else:
+				# This is a subdirectory, recurse into it
+				parse_directory(fn, rootlen)
+				mynode[f] = {
+					't': 'd',
+				}
+		else:
+			# This is a file
+			stat = os.stat(fn)
+			mynode[f] = {
+				't': 'f',
+				's': stat.st_size,
+				'd': datetime.fromtimestamp(stat.st_mtime),
+			}
+			if f == "README" or f == "CURRENT_MAINTAINER" or f == ".message":
+				mynode[f]['c'] = read_file(fn)
+	allnodes[dirname[rootlen:].strip("/")] = mynode
+
+def Usage():
+	print "Usage: spider_ftp.py <ftp_root> <pickle_file>"
+	sys.exit(1)
+
+if len(sys.argv) != 3: Usage()
+
+parse_directory(sys.argv[1], len(sys.argv[1]))
+
+f = open(sys.argv[2] + ".tmp", "wb")
+pickle.dump(allnodes, f)
+f.close()
+os.rename(sys.argv[2] + ".tmp", sys.argv[2])
+
+#pprint(allnodes)
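
Once the spider has run on the ftp server and the pickle has been shipped over
to the web server (the transfer itself is not part of this commit), the file
can be sanity-checked from a Python 2 shell. A quick sketch, using the
FTP_PICKLE default path from the settings change above:

    import cPickle as pickle
    f = open("/usr/local/pgweb/ftpsite.pickle", "rb")
    allnodes = pickle.load(f)
    f.close()
    # The keys of the root node are what the browser will list for the top level
    print sorted(allnodes[""].keys())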