[#3440] Fix httpdownloader reencoding torrent file downloads

Torrent downloads from rutracker respond with the header:

    Content-Type: application/x-bittorrent; charset=Windows-1251

The problem is that httpdownloader was using the charset to re-encode
the downloaded file, corrupting the binary torrent file download.

Fixed by only re-encoding text content types, since it is very rare
that non-text content types would actually have a non-UTF-8 charset.
If such a requirement does arise, the handling would need to be
determined on a type-by-type basis.
This commit is contained in:
Calum Lind 2021-02-20 19:39:07 +00:00
parent f331b6c754
commit 4d970754a4
2 changed files with 55 additions and 9 deletions

View File

@ -151,9 +151,12 @@ class HTTPDownloaderAgent(object):
self.filename = new_file_name self.filename = new_file_name
cont_type = headers.getRawHeaders(b'content-type')[0].decode() cont_type_header = headers.getRawHeaders(b'content-type')[0].decode()
params = cgi.parse_header(cont_type)[1] cont_type, params = cgi.parse_header(cont_type_header)
encoding = params.get('charset', None) # Only re-encode text content types.
encoding = None
if cont_type.startswith('text/'):
encoding = params.get('charset', None)
response.deliverBody( response.deliverBody(
BodyHandler(response.request, finished, body_length, self, encoding) BodyHandler(response.request, finished, body_length, self, encoding)
) )

View File

@ -9,6 +9,7 @@ from __future__ import unicode_literals
import tempfile import tempfile
from email.utils import formatdate from email.utils import formatdate
from io import open
from twisted.internet import reactor from twisted.internet import reactor
from twisted.internet.error import CannotListenError from twisted.internet.error import CannotListenError
@ -47,9 +48,30 @@ class RenameResource(Resource):
class AttachmentResource(Resource): class AttachmentResource(Resource):
def render(self, request): def render(self, request):
request.setHeader(b'Content-Type', b'text/plain') content_type = b'text/plain'
charset = request.getHeader(b'content-charset')
if charset:
content_type += b'; charset=' + charset
request.setHeader(b'Content-Type', content_type)
request.setHeader(b'Content-Disposition', b'attachment') request.setHeader(b'Content-Disposition', b'attachment')
return b'Attachement with no filename set' append = request.getHeader(b'content-append') or b''
content = 'Attachment with no filename set{}'.format(append.decode('utf8'))
return (
content.encode(charset.decode('utf8'))
if charset
else content.encode('utf8')
)
class TorrentResource(Resource):
    """Test resource that serves a fixed payload as a torrent attachment."""

    def render(self, request):
        """Return the fixed UTF-8 payload with a bittorrent Content-Type.

        If the request carries a ``content-charset`` header, its value is
        appended to the response Content-Type as a ``charset`` parameter.
        The body itself is always encoded as UTF-8, whatever charset is
        advertised.
        """
        requested_charset = request.getHeader(b'content-charset')
        header_value = b'application/x-bittorrent'
        if requested_charset:
            header_value = header_value + b'; charset=' + requested_charset

        request.setHeader(b'Content-Type', header_value)
        request.setHeader(b'Content-Disposition', b'attachment; filename=test.torrent')

        payload = 'Binary attachment ignore charset 世丕且\n'
        return payload.encode('utf8')
class CookieResource(Resource): class CookieResource(Resource):
@ -101,6 +123,7 @@ class TopLevelResource(Resource):
self.putChild(b'redirect', self.redirect_rsrc) self.putChild(b'redirect', self.redirect_rsrc)
self.putChild(b'rename', RenameResource()) self.putChild(b'rename', RenameResource())
self.putChild(b'attachment', AttachmentResource()) self.putChild(b'attachment', AttachmentResource())
self.putChild(b'torrent', TorrentResource())
self.putChild(b'partial', PartialDownloadResource()) self.putChild(b'partial', PartialDownloadResource())
def getChild(self, path, request): # NOQA: N802 def getChild(self, path, request): # NOQA: N802
@ -110,7 +133,7 @@ class TopLevelResource(Resource):
return Resource.getChild(self, path, request) return Resource.getChild(self, path, request)
def render(self, request): def render(self, request):
if request.getHeader('If-Modified-Since'): if request.getHeader(b'If-Modified-Since'):
request.setResponseCode(NOT_MODIFIED) request.setResponseCode(NOT_MODIFIED)
return b'<h1>Deluge HTTP Downloader tests webserver here</h1>' return b'<h1>Deluge HTTP Downloader tests webserver here</h1>'
@ -139,7 +162,7 @@ class DownloadFileTestCase(unittest.TestCase):
return self.webserver.stopListening() return self.webserver.stopListening()
def assertContains(self, filename, contents): # NOQA def assertContains(self, filename, contents): # NOQA
with open(filename) as _file: with open(filename, 'r', encoding='utf8') as _file:
try: try:
self.assertEqual(_file.read(), contents) self.assertEqual(_file.read(), contents)
except Exception as ex: except Exception as ex:
@ -147,7 +170,7 @@ class DownloadFileTestCase(unittest.TestCase):
return filename return filename
def assertNotContains(self, filename, contents, file_mode=''): # NOQA def assertNotContains(self, filename, contents, file_mode=''): # NOQA
with open(filename, file_mode) as _file: with open(filename, 'r', encoding='utf8') as _file:
try: try:
self.assertNotEqual(_file.read(), contents) self.assertNotEqual(_file.read(), contents)
except Exception as ex: except Exception as ex:
@ -212,7 +235,7 @@ class DownloadFileTestCase(unittest.TestCase):
url = self.get_url('attachment') url = self.get_url('attachment')
d = download_file(url, fname('original')) d = download_file(url, fname('original'))
d.addCallback(self.assertEqual, fname('original')) d.addCallback(self.assertEqual, fname('original'))
d.addCallback(self.assertContains, 'Attachement with no filename set') d.addCallback(self.assertContains, 'Attachment with no filename set')
return d return d
def test_download_with_rename_prevented(self): def test_download_with_rename_prevented(self):
@ -264,3 +287,23 @@ class DownloadFileTestCase(unittest.TestCase):
d.addCallback(self.fail) d.addCallback(self.fail)
d.addErrback(self.assertIsInstance, Failure) d.addErrback(self.assertIsInstance, Failure)
return d return d
def test_download_text_reencode_charset(self):
    """A text content type with a declared charset is saved as UTF-8."""
    # The attachment resource reads these request headers: it advertises
    # the charset on its Content-Type and encodes the appended text with it.
    request_headers = {
        'content-charset': 'Windows-1251',
        'content-append': 'бвгде',
    }
    target = fname('test.txt')
    expected = 'Attachment with no filename setбвгде'

    d = download_file(self.get_url('attachment'), target, headers=request_headers)
    d.addCallback(self.assertEqual, target)
    # assertContains reads the file back as UTF-8 text.
    d.addCallback(self.assertContains, expected)
    return d
def test_download_binary_ignore_charset(self):
    """Ignore charset for binary content-type header e.g. torrent files"""
    url = self.get_url('torrent')
    # The torrent resource advertises this charset on its
    # application/x-bittorrent Content-Type but always sends UTF-8 bytes.
    headers = {'content-charset': 'Windows-1251'}
    filepath = fname('test.torrent')
    # Reuse filepath for the download target so the path under test and
    # the path asserted on cannot drift apart.
    d = download_file(url, filepath, headers=headers)
    d.addCallback(self.assertEqual, filepath)
    # The non-ASCII payload only reads back equal as UTF-8 if the
    # downloader left the bytes untouched instead of re-encoding them.
    d.addCallback(self.assertContains, 'Binary attachment ignore charset 世丕且\n')
    return d