Don't generate http requests with two Host: headers

This would cause the server to reject the request with an HTTP 400 (Bad Request) response, unsurprisingly.
This commit is contained in:
Magnus Hagander
2012-02-07 13:05:41 +01:00
parent 03794e3157
commit f9486b54f1

View File

@ -164,14 +164,14 @@ class BaseSiteCrawler(object):
try:
# Unfortunatley, persistent connections seem quite unreliable,
# so create a new one for each page.
h = httplib.HTTPConnection(host=self.serverip and self.serverip or self.hostname,
port=80,
strict=True,
timeout=10)
h.putrequest("GET", url)
h.putheader("User-agent","pgsearch/0.2")
if self.serverip:
h = httplib.HTTPConnection(host=self.serverip, port=80, strict=True, timeout=10)
h.putrequest("GET", url, skip_host=1)
h.putheader("Host", self.hostname)
else:
h = httplib.HTTPConnection(host=self.hostname, port=80, strict=True, timeout=10)
h.putrequest("GET", url)
h.putheader("User-agent","pgsearch/0.2")
h.putheader("Connection","close")
if self.scantimes.has_key(url):
h.putheader("If-Modified-Since", formatdate(time.mktime(self.scantimes[url].timetuple())))