Add a lot more code documentation to the documentation loader.

This commit is contained in:
Jonathan S. Katz
2019-03-31 17:02:43 -04:00
parent 1f4c74c2c2
commit 3d3339249a

View File

@ -14,14 +14,19 @@ from configparser import ConfigParser
import psycopg2 import psycopg2
# a counter that keeps track of the total number of pages (HTML, SVG) that are loaded
# into the database
pagecount = 0 pagecount = 0
# if set to "True" -- mutes any output from the script. Controlled by an option
quiet = False quiet = False
# regular expression used to search and extract the title on a given piece of
# documentation, for further use in the application
re_titlematch = re.compile('<title\s*>([^<]+)</title\s*>', re.IGNORECASE) re_titlematch = re.compile('<title\s*>([^<]+)</title\s*>', re.IGNORECASE)
# Load a single page # Load a single page
def load_doc_file(filename, f, c): def load_doc_file(filename, f, c):
"""Prepares and loads an HTML file for import into the documentation database"""
tidyopts = dict( tidyopts = dict(
drop_proprietary_attributes=1, drop_proprietary_attributes=1,
alt_text='', alt_text='',
@ -51,18 +56,26 @@ def load_doc_file(filename, f, c):
if float(ver) < 11 and float(ver) > 0: if float(ver) < 11 and float(ver) > 0:
tidyopts['indent'] = 'no' tidyopts['indent'] = 'no'
# convert the raw contents to the appropriate encoding for the content that will
# be stored in the database
contents = str(rawcontents, encoding) contents = str(rawcontents, encoding)
# extract the title of the page, which is rendered in a few places in the documentation
tm = re_titlematch.search(contents) tm = re_titlematch.search(contents)
if tm: if tm:
title = tm.group(1) title = tm.group(1)
else: else:
title = "" title = ""
# if not in quiet mode, output the (filename, title) pair of the docpage that is being processed
if not quiet: if not quiet:
print("--- file: %s (%s) ---" % (filename, title)) print("--- file: %s (%s) ---" % (filename, title))
# run libtidy on the content
(html, errors) = tidylib.tidy_document(contents, options=tidyopts) (html, errors) = tidylib.tidy_document(contents, options=tidyopts)
# add all of the information to the CSV that will be used to load the updated
# documentation pages into the database
c.writerow([filename, ver, title, html]) c.writerow([filename, ver, title, html])
@ -88,13 +101,16 @@ quiet = options.quiet
ver = sys.argv[1] ver = sys.argv[1]
tarfilename = sys.argv[2] tarfilename = sys.argv[2]
# load the configuration that is used to connect to the database
config = ConfigParser() config = ConfigParser()
config.read(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'docload.ini')) config.read(os.path.join(os.path.abspath(os.path.dirname(__file__)), 'docload.ini'))
# determine if the referenced tarball exists; if not, exit
if not os.path.isfile(tarfilename): if not os.path.isfile(tarfilename):
print("File %s not found" % tarfilename) print("File %s not found" % tarfilename)
sys.exit(1) sys.exit(1)
# open up the tarball as well as a connection to the database
tf = tarfile.open(tarfilename) tf = tarfile.open(tarfilename)
connection = psycopg2.connect(config.get('db', 'dsn')) connection = psycopg2.connect(config.get('db', 'dsn'))
@ -109,11 +125,18 @@ if len(r) != 1:
iscurrent = r[0][0] iscurrent = r[0][0]
# begin creating a CSV that will be used to import the documentation into the database
s = io.StringIO() s = io.StringIO()
c = csv.writer(s, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC) c = csv.writer(s, delimiter=';', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
# this regular expression is for "newer" versions of PostgreSQL that keep all of
# the HTML documentation built out
re_htmlfile = re.compile('[^/]*/doc/src/sgml/html/.*') re_htmlfile = re.compile('[^/]*/doc/src/sgml/html/.*')
# this regular expression is for "older" versions of PostgreSQL that keep the
# HTML documentation in a tarball within the tarball
re_tarfile = re.compile('[^/]*/doc/postgres.tar.gz$') re_tarfile = re.compile('[^/]*/doc/postgres.tar.gz$')
# go through each file of the tarball to determine if the file is documentation
# that should be imported
for member in tf: for member in tf:
# newer versions of PostgreSQL will go down this path to find docfiles # newer versions of PostgreSQL will go down this path to find docfiles
if re_htmlfile.match(member.name): if re_htmlfile.match(member.name):
@ -151,12 +174,18 @@ if not quiet:
s.seek(0) s.seek(0)
# Start loading the documentation into the database
# First, load the newly discovered documentation into a temporary table, where we
# can validate that we loaded exactly the number of docs that we thought we would,
# based on the page counter
curs.execute("CREATE TEMP TABLE docsload (file varchar(64) NOT NULL, version numeric(3,1) NOT NULL, title varchar(256) NOT NULL, content text)") curs.execute("CREATE TEMP TABLE docsload (file varchar(64) NOT NULL, version numeric(3,1) NOT NULL, title varchar(256) NOT NULL, content text)")
curs.copy_expert("COPY docsload FROM STDIN WITH CSV DELIMITER AS ';'", s) curs.copy_expert("COPY docsload FROM STDIN WITH CSV DELIMITER AS ';'", s)
if curs.rowcount != pagecount: if curs.rowcount != pagecount:
print("Loaded invalid number of rows! {} rows for {} pages!".format(curs.rowcount, pagecount)) print("Loaded invalid number of rows! {} rows for {} pages!".format(curs.rowcount, pagecount))
sys.exit(1) sys.exit(1)
# If the previous step succeeded, delete all the documentation for the specified version
# and insert into / update the doc table with the content that was loaded into the temporary table
curs.execute("DELETE FROM docs WHERE version=%(version)s AND NOT EXISTS (SELECT 1 FROM docsload WHERE docsload.file=docs.file)", { curs.execute("DELETE FROM docs WHERE version=%(version)s AND NOT EXISTS (SELECT 1 FROM docsload WHERE docsload.file=docs.file)", {
'version': ver, 'version': ver,
}) })
@ -187,6 +216,7 @@ curs.execute("SELECT varnish_purge('^/docs/' || %(v)s || '/')", {'v': ver})
if iscurrent: if iscurrent:
curs.execute("SELECT varnish_purge('^/docs/current/')") curs.execute("SELECT varnish_purge('^/docs/current/')")
# ensure the changes are committed, and close the connection
connection.commit() connection.commit()
connection.close() connection.close()