From d8169e0f0526c59370a822148a36fadb08da686c Mon Sep 17 00:00:00 2001
From: Magnus Hagander <magnus@hagander.net>
Date: Mon, 10 Sep 2012 14:02:19 +0200
Subject: [PATCH] Add small tool to run w3c validator on local pages before
 deployment

---
 tools/localhtmlvalidate/localhtmlvalidate.py | 92 ++++++++++++++++++++
 1 file changed, 92 insertions(+)
 create mode 100755 tools/localhtmlvalidate/localhtmlvalidate.py
diff --git a/tools/localhtmlvalidate/localhtmlvalidate.py b/tools/localhtmlvalidate/localhtmlvalidate.py
new file mode 100755
index 00000000..a41ad659
--- /dev/null
+++ b/tools/localhtmlvalidate/localhtmlvalidate.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python
+#
+# localhtmlvalidate.py - validate local HTML for pgweb
+#
+# This is a small tool that you run to validate the HTML output of your
+# localhost pgweb installation against the W3C validator. Give it the
+# localhost:8000 URL (or other, depending on what port you're running the
+# local server on), and it will give you a list of possible issues with
+# the page.
+#
+# In theory it can be used just fine for non-pgweb pages as well, but
+# for obvious reasons the functionality to show the source line number
+# based on the pgweb templates won't work.
+#
+
+import sys
+import urllib
+import httplib
+import re
+import HTMLParser
+
+BOUNDARY="-=--=foobar-=--="  # multipart delimiter: used in the form body and the content-type header
+
+def encode_multipart_formdata(fields, files):
+	"""Return a multipart/form-data body (CRLF-joined) with one part per
+	(name, value) field and one text/html part per (name, filename, data) file."""
+	delimiter = '--' + BOUNDARY
+	parts = []
+	for (name, text) in fields:
+		parts.extend([delimiter,
+			'Content-Disposition: form-data; name="%s"' % name,
+			'', text])
+	for (name, fname, data) in files:
+		parts.extend([delimiter,
+			'Content-Disposition: form-data; name="%s"; filename="%s"' % (name, fname),
+			'Content-Type: text/html',
+			'', data])
+	# The closing delimiter plus an empty entry makes the body end in CRLF.
+	parts.extend([delimiter + '--', ''])
+	return "\r\n".join(parts)
+
+if __name__=="__main__":
+	if len(sys.argv) != 2:
+		print "Usage: localhtmlvalidate.py <local url>"
+		sys.exit(1)
+	contents = urllib.urlopen(sys.argv[1]).read()
+
+	# Try to figure out where the actual contents start :) (absent on non-pgweb pages)
+	try:
+		firstline = contents.splitlines().index('<div id="pgContentWrap">')
+	except ValueError:
+		firstline = None
+
+	# Generate a form body
+	fields = [('charset', 'utf-8'), ('doctype', 'inline'), ('group', '0'), ('verbose', '1')]
+	body = encode_multipart_formdata(fields, [('uploaded_file', 'test.html', contents)])
+
+	# Now submit it to the w3c validator
+	h = httplib.HTTP("validator.w3.org")
+	h.putrequest("POST", "/check")
+	h.putheader("content-type", "multipart/form-data; boundary=%s" % BOUNDARY)
+	h.putheader("content-length", str(len(body)))
+	h.endheaders()
+	h.send(body)
+	errcode, errmsg, headers = h.getreply()
+	rbody = h.getfile().read()
+	# No headers at all, or no status header, ends up in the "Unknown status" branch.
+	status = headers.getheader('x-w3c-validator-status') if headers else None
+	if status == 'Valid':
+		print "Page validates!"
+		sys.exit(0)
+	elif status == 'Invalid':
+		print "Invalid!"
+		print "Errors: %s" % headers['x-w3c-validator-errors']
+		print "Warnings: %s" % headers['x-w3c-validator-warnings']
+		hp = HTMLParser.HTMLParser()
+		for m in re.findall('<li class="msg_err">.*?</li>', rbody, re.DOTALL):
+			r = re.search(r'<em>Line (\d+).*<span class="msg">(.*?)</span>', m, re.DOTALL)
+			if r:  # unexpected markup: skip the summary line instead of crashing
+				hint = "" if firstline is None else " (should be around %s)" % (int(r.group(1)) - firstline)
+				print "Line %s%s: %s" % (r.group(1), hint, hp.unescape(r.group(2)))
+			r2 = re.search('<code class="input">(.*?)<strong title=".*?">(.*?)</strong>(.*?)</code>', unicode(m, 'utf8'), re.DOTALL)
+			if r2:
+				s = u"%s%s%s" % r2.groups()
+				print "Source: %s" % hp.unescape(s).encode('utf-8')
+			print ""
+	else:
+		print "Unknown status: %s" % status
+		print headers
+		sys.exit(1)
+	
+