[Py2to3] Ensure httpdownloader saves data as UTF-8

Python 3 raised a decoding error on the Google page, which appears to be
encoded as 'latin-1', so extract the charset from the Content-Type header,
decode the body with it and re-encode it as 'utf-8' before saving.
This commit is contained in:
Calum Lind 2018-07-29 07:17:14 +01:00
parent d5133f789a
commit 18d448d4a5
1 changed file with 14 additions and 2 deletions

View File

@ -45,7 +45,7 @@ class CompressionDecoderProtocol(client._GzipProtocol):
class BodyHandler(HTTPClientParser, object):
"""An HTTP parser that saves the response to a file."""
def __init__(self, request, finished, length, agent):
def __init__(self, request, finished, length, agent, encoding=None):
"""BodyHandler init.
Args:
@ -60,6 +60,7 @@ class BodyHandler(HTTPClientParser, object):
self.total_length = length
self.current_length = 0
self.data = b''
self.encoding = encoding
def dataReceived(self, data): # NOQA: N802
self.current_length += len(data)
@ -69,6 +70,8 @@ class BodyHandler(HTTPClientParser, object):
data, self.current_length, self.total_length)
def connectionLost(self, reason): # NOQA: N802
if self.encoding:
self.data = self.data.decode(self.encoding).encode('utf8')
with open(self.agent.filename, 'wb') as _file:
_file.write(self.data)
self.finished.callback(self.agent.filename)
@ -148,8 +151,17 @@ class HTTPDownloaderAgent(object):
self.filename = new_file_name
cont_type = headers.getRawHeaders(b'content-type')[0].decode()
params = cgi.parse_header(cont_type)[1]
encoding = params.get('charset', None)
response.deliverBody(
BodyHandler(response.request, finished, body_length, self))
BodyHandler(
response.request,
finished,
body_length,
self,
encoding,
))
return finished