#!/usr/bin/env python
#
# localhtmlvalidate.py - validate local HTML for pgweb
#
# This is a small tool that you run to validate the HTML output of your
# localhost pgweb installation against the W3C validator. Give it the
# localhost:8000 URL (or other, depending on what port you're running the
# local server on), and it will give you a list of possible issues with
# the page.
#
# In theory it can be used just fine for non-pgweb pages as well, but
# for obvious reasons the functionality to show the source line number
# based on the pgweb templates won't work.
#

import re
import sys
from contextlib import closing

if sys.version_info[0] >= 3:
    # Python 3 module layout.
    from html import unescape as html_unescape
    from http.client import HTTPConnection
    from urllib.request import urlopen
else:
    # Python 2 module layout (what this script was originally written for).
    import HTMLParser
    import httplib
    import urllib

    HTTPConnection = httplib.HTTPConnection
    urlopen = urllib.urlopen
    html_unescape = HTMLParser.HTMLParser().unescape

BOUNDARY = "-=--=foobar-=--="

# NOTE(review): plain HTTP on the default port, as in the original script;
# confirm the validator still accepts non-TLS POSTs.
VALIDATOR_HOST = "validator.w3.org"

# NOTE(review): the marker and the three patterns below were reconstructed
# after the HTML markup inside these literals was lost (only the text
# between the tags survived). Confirm them against the pgweb base template
# and the validator's actual report markup before relying on them.
CONTENT_START_MARKER = b'<div id="pgContentWrap">'
_ERROR_ITEM_RE = re.compile(r'<li class="msg_err">.*?</li>', re.DOTALL)
_ERROR_LOCATION_RE = re.compile(
    r'<em>Line <a href="[^"]+">(\d+)</a>.*<span class="msg">(.*?)</span>',
    re.DOTALL,
)
_ERROR_SOURCE_RE = re.compile(
    r'<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>',
    re.DOTALL,
)


def _to_bytes(value):
    """Return *value* as a byte string, UTF-8 encoding it if it is text."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def encode_multipart_formdata(fields, files):
    """Build a multipart/form-data request body delimited by BOUNDARY.

    fields is an iterable of (name, value) pairs for plain form fields.
    files is an iterable of (name, filename, value) triples; every file
    part is sent with a "text/html" content type.

    Names and filenames are interpolated into the part headers as text.
    Values may be text or byte strings; text is encoded as UTF-8.

    Returns the body as a byte string (a plain str on Python 2), so that
    len() of the result is the correct Content-Length.
    """
    delimiter = '--' + BOUNDARY
    parts = []
    for (key, value) in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % key,
            '',
            value,
        ])
    for (key, filename, value) in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename),
            'Content-Type: text/html',
            '',
            value,
        ])
    parts.append(delimiter + '--')
    # Trailing empty element makes the body end with a CRLF.
    parts.append('')
    return b"\r\n".join(_to_bytes(part) for part in parts)


def _find_first_content_line(contents):
    """Return the index of the line equal to CONTENT_START_MARKER, or 0.

    The index is later subtracted from the validator's line numbers to
    estimate the line within the page's own content.
    """
    try:
        return contents.splitlines().index(CONTENT_START_MARKER)
    except ValueError:
        return 0


def _submit_to_validator(contents):
    """POST *contents* to the validator as an uploaded file.

    Returns a (response, body) pair: the HTTP response object (still usable
    for getheader() after the connection is closed) and the raw response
    body bytes.
    """
    body = encode_multipart_formdata(
        [
            ('charset', 'utf-8'),
            ('doctype', 'inline'),
            ('group', '0'),
            ('verbose', '1'),
        ],
        [('uploaded_file', 'test.html', contents)]
    )
    headers = {
        # Name and value must be separate; the original passed them as a
        # single "User-Agent: ..." argument, producing a malformed header.
        "User-Agent": "localcheck-tester/0.0",
        "content-type": "multipart/form-data; boundary=%s" % BOUNDARY,
        "content-length": str(len(body)),
    }
    conn = HTTPConnection(VALIDATOR_HOST)
    try:
        conn.request("POST", "/check", body, headers)
        response = conn.getresponse()
        return response, response.read()
    finally:
        conn.close()


def _describe_errors(report, firstline):
    """Yield printable text lines for every error item found in *report*.

    report is the validator's HTML report as text; firstline is the offset
    subtracted from each reported line number. Items whose location cannot
    be parsed are skipped instead of aborting the whole run.
    """
    for item in _ERROR_ITEM_RE.findall(report):
        located = _ERROR_LOCATION_RE.search(item)
        if located is None:
            continue
        line_no = located.group(1)
        yield "Line %s (should be around %s): %s" % (
            line_no,
            int(line_no) - firstline,
            html_unescape(located.group(2)),
        )
        source = _ERROR_SOURCE_RE.search(item)
        if source:
            yield "Source: %s" % html_unescape("%s%s%s" % source.groups())
        yield ""


def _emit(text):
    """Print *text*; on Python 2, unicode is encoded to UTF-8 first."""
    if not isinstance(text, str):
        text = text.encode('utf-8')
    print(text)


def main(argv):
    """Validate the page at argv[1] and print the result.

    Returns the process exit status: 1 for a usage error or an
    unrecognized validator status, 0 otherwise. An invalid page also
    yields 0, matching the script's historical behavior.
    """
    if len(argv) != 2:
        print("Usage: localhtmlvalidate.py <local url>")
        return 1

    with closing(urlopen(argv[1])) as page:
        contents = page.read()

    # Try to figure out where the actual contents start :)
    firstline = _find_first_content_line(contents)

    # Now submit it to the w3c validator
    response, rbody = _submit_to_validator(contents)

    # getheader() returns None for a missing header, which routes
    # unexpected responses to the "Unknown status" branch below.
    status = response.getheader('x-w3c-validator-status')
    if status == 'Valid':
        print("Page validates!")
        return 0
    if status == 'Invalid':
        print("Invalid!")
        print("Errors: %s" % response.getheader('x-w3c-validator-errors'))
        print("Warnings: %s" % response.getheader('x-w3c-validator-warnings'))
        for line in _describe_errors(rbody.decode('utf-8', 'replace'), firstline):
            _emit(line)
        return 0

    print("Unknown status: %s" % status)
    print(response.msg)
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))