deluge/deluge/httpdownloader.py

# -*- coding: utf-8 -*-
#
# Copyright (C) 2009 Andrew Resch <andrewresch@gmail.com>
#
# This file is part of Deluge and is licensed under GNU General Public License 3.0, or later, with
# the additional special exception to link portions of this program with the OpenSSL library.
# See LICENSE for more details.
#

from __future__ import unicode_literals

import cgi
import logging
import os.path
import zlib

from twisted.internet import reactor
from twisted.internet.defer import Deferred
from twisted.python.failure import Failure
from twisted.web import client, http
from twisted.web._newclient import HTTPClientParser
from twisted.web.error import Error, PageRedirect
from twisted.web.http_headers import Headers
from twisted.web.iweb import IAgent
from zope.interface import implementer

from deluge.common import get_version

log = logging.getLogger(__name__)


class CompressionDecoder(client.GzipDecoder):
    """A compression decoder for gzip, x-gzip and deflate."""
    def deliverBody(self, protocol):  # NOQA: N802
        """Deliver the body of the wrapped response through a decoder.

        Args:
            protocol: The protocol that should receive the body; it is wrapped
                in a CompressionDecoderProtocol before being handed to the
                original response.
        """
        response = self.original
        decoding_protocol = CompressionDecoderProtocol(protocol, response)
        response.deliverBody(decoding_protocol)


class CompressionDecoderProtocol(client._GzipProtocol):
    """A compression decoder protocol for CompressionDecoder."""
    def __init__(self, protocol, response):
        """Set up the protocol with its own zlib decompressor.

        Args:
            protocol: The protocol being wrapped.
            response: The response whose body is being delivered.
        """
        super(CompressionDecoderProtocol, self).__init__(protocol, response)
        # Adding 32 to the window size makes zlib detect a gzip or zlib
        # header automatically.
        auto_detect_wbits = zlib.MAX_WBITS + 32
        self._zlibDecompress = zlib.decompressobj(auto_detect_wbits)


class BodyHandler(HTTPClientParser, object):
    """An HTTP parser that saves the response to a file.

    Received data is buffered in memory and written to the agent's filename
    once the connection is lost.
    """
    def __init__(self, request, finished, length, agent):
        """BodyHandler init.

        Args:
            request (t.w.i.IClientRequest): The parser request.
            finished (Deferred): A Deferred to handle the finished response.
            length (int): The length of the response.
            agent (t.w.i.IAgent): The agent from which the request was sent.
        """
        super(BodyHandler, self).__init__(request, finished)
        self.agent = agent
        self.finished = finished
        self.total_length = length
        self.current_length = 0
        # A bytearray grows in place; repeatedly concatenating immutable
        # bytes would copy the whole buffer for every chunk received.
        self.data = bytearray()

    def dataReceived(self, data):  # NOQA: N802
        """Buffer a chunk of the body and report progress.

        Args:
            data (bytes): The chunk of body data that was received.
        """
        self.current_length += len(data)
        self.data += data
        if self.agent.part_callback:
            self.agent.part_callback(
                data, self.current_length, self.total_length)

    def connectionLost(self, reason):  # NOQA: N802
        """Write the buffered body to disk and fire the finished Deferred.

        The Deferred fires with the filename on success, or with a Failure
        if the file could not be written.

        Args:
            reason: The reason passed on to HTTPClientParser.connectionLost.
        """
        filename = self.agent.filename
        try:
            with open(filename, 'wb') as _file:
                _file.write(self.data)
        except EnvironmentError:
            # Report the write error through the Deferred; letting it escape
            # here would leave `finished` unfired and the caller waiting.
            self.finished.errback(Failure())
        else:
            self.finished.callback(filename)
        self.state = u'DONE'
        HTTPClientParser.connectionLost(self, reason)


@implementer(IAgent)
class HTTPDownloaderAgent(object):
    """A File Downloader Agent.

    Wraps another agent and saves the body of each response to a file.
    """
    def __init__(
        self,
        agent,
        filename,
        part_callback=None,
        force_filename=False,
        allow_compression=True,
        handle_redirect=True,
    ):
        """HTTPDownloaderAgent init.

        Args:
            agent (t.w.c.Agent): The agent which will send the requests.
            filename (str): The filename to save the file as.
            part_callback (func): A function to be called when a part of data
                is received, its signature should be:
                    func(data, current_length, total_length)
            force_filename (bool): Forces use of the supplied filename,
                regardless of header content.
            allow_compression (bool): Stored on the instance; this class does
                not otherwise use it.
            handle_redirect (bool): When False, a redirect response fails the
                download with PageRedirect instead of being saved.
        """

        self.handle_redirect = handle_redirect
        self.agent = agent
        self.filename = filename
        self.part_callback = part_callback
        self.force_filename = force_filename
        self.allow_compression = allow_compression
        # Not used anywhere else in this class.
        self.decoder = None

    def request_callback(self, response):
        """Inspect the response and start saving its body to a file.

        Args:
            response (t.w.i.IResponse): The response to the request.

        Returns:
            Deferred: Fires with the filename the body was saved to. Fails
                with PageRedirect for a redirect when redirects are not
                handled, or with Error for a status code of 400 or above.
        """
        finished = Deferred()

        if not self.handle_redirect and response.code in (
            http.MOVED_PERMANENTLY,
            http.FOUND,
            http.SEE_OTHER,
            http.TEMPORARY_REDIRECT,
        ):
            # A redirect without a Location header still has to be reported
            # as PageRedirect, so fall back to a location of None.
            location = response.headers.getRawHeaders(
                b'location', [None])[0]
            error = PageRedirect(response.code, location=location)
            finished.errback(Failure(error))
        elif response.code >= 400:
            # Do not save the body of an error page as if it were the
            # requested file.
            finished.errback(Failure(Error(response.code)))
        else:
            headers = response.headers
            body_length = int(
                headers.getRawHeaders(b'content-length', default=[0])[0])

            if (
                headers.hasHeader(b'content-disposition')
                and not self.force_filename
            ):
                content_disp = headers.getRawHeaders(
                    b'content-disposition')[0].decode('utf-8')
                content_disp_params = cgi.parse_header(content_disp)[1]
                if 'filename' in content_disp_params:
                    # Keep the directory of the requested filename but use
                    # the (sanitised) name suggested by the server.
                    new_file_name = content_disp_params['filename']
                    new_file_name = sanitise_filename(new_file_name)
                    new_file_name = os.path.join(
                        os.path.split(self.filename)[0], new_file_name)

                    count = 1
                    fileroot, fileext = os.path.splitext(new_file_name)
                    while os.path.isfile(new_file_name):
                        # Increment filename if already exists
                        new_file_name = '%s-%s%s' % (fileroot, count, fileext)
                        count += 1

                    self.filename = new_file_name

            response.deliverBody(
                BodyHandler(response.request, finished, body_length, self))

        return finished

    def request(self, method, uri, headers=None, body_producer=None):
        """Issue a new request to the wrapped agent.

        A Deluge User-Agent header is added when the headers do not already
        contain one.

        Args:
            method (bytes): The HTTP method to use.
            uri (bytes): The url to download from.
            headers (t.w.h.Headers, optional): Any extra headers to send.
            body_producer (t.w.i.IBodyProducer, optional): Request body data.

        Returns:
            Deferred: The filename of the downloaded file.
        """
        if headers is None:
            headers = Headers()

        if not headers.hasHeader(b'User-Agent'):
            version = get_version()
            user_agent = 'Deluge/%s (https://deluge-torrent.org)' % version
            headers.addRawHeader('User-Agent', user_agent)

        d = self.agent.request(
            method=method,
            uri=uri,
            headers=headers,
            bodyProducer=body_producer,
        )
        d.addCallback(self.request_callback)
        return d


def sanitise_filename(filename):
    """Sanitises a filename to use as a download destination file.

    Surrounding quotes are removed, any directory part is dropped and
    surrounding whitespace is stripped. Filenames that could be considered
    malicious are logged.

    Args:
        filename (str): The filename to sanitise.

    Returns:
        str: The sanitised filename.
    """
    warning = 'Potentially malicious server: trying to write to file: %s'

    # Remove any quotes
    unquoted = filename.strip('\'"')

    # Only ever use the basename; a name carrying a path means a dodgy
    # server, so log it.
    base = os.path.basename(unquoted)
    if base != unquoted:
        log.warning(warning, unquoted)

    cleaned = base.strip()
    looks_dodgy = (
        cleaned.startswith('.')
        or any(char in cleaned for char in ';|')
    )
    if looks_dodgy:
        # Dodgy server, log it
        log.warning(warning, cleaned)

    return cleaned


def _download_file(
    url, filename,
    callback=None,
    headers=None,
    force_filename=False,
    allow_compression=True,
    handle_redirects=True,
):
    """Downloads a file from a specific URL and returns a Deferred.

    A callback function can be specified to be called as parts are received.

    Args:
        url (str): The url to download from.
        filename (str): The filename to save the file as.
        callback (func): A function to be called when partial data is received,
            its signature should be: func(data, current_length, total_length)
        headers (dict): Any optional headers to send. The dict itself is not
            modified.
        force_filename (bool): Force using the filename specified rather than
            one the server may suggest.
        allow_compression (bool): Allows gzip & deflate decoding.
        handle_redirects (bool): HTTP redirects handled automatically or not.

    Returns:
        Deferred: The filename of the downloaded file.

    Raises:
        t.w.e.PageRedirect
        t.w.e.Error: for all other HTTP response errors
    """

    agent = client.Agent(reactor)

    # Wrap the base agent with the optional decoding and redirect layers.
    if allow_compression:
        enc_accepted = ['gzip', 'x-gzip', 'deflate']
        decoders = [(enc.encode(), CompressionDecoder) for enc in enc_accepted]
        agent = client.ContentDecoderAgent(agent, decoders)
    if handle_redirects:
        agent = client.RedirectAgent(agent)

    agent = HTTPDownloaderAgent(
        agent,
        filename,
        callback,
        force_filename,
        allow_compression,
        handle_redirects,
    )

    # The Headers init expects dict values to be a list. Build a new dict
    # rather than rewriting the caller's values in place.
    if headers:
        headers = {
            name: value if isinstance(value, list) else [value]
            for name, value in headers.items()
        }

    return agent.request(b'GET', url.encode(), Headers(headers))


def download_file(
    url,
    filename,
    callback=None,
    headers=None,
    force_filename=False,
    allow_compression=True,
    handle_redirects=True,
):
    """Downloads a file from a specific URL and returns a Deferred.

    The outcome of the download is logged and then passed on unchanged.
    A callback function can be specified to be called as parts are received.

    Args:
        url (str): The url to download from.
        filename (str): The filename to save the file as.
        callback (func): A function to be called when partial data is received,
            its signature should be: func(data, current_length, total_length).
        headers (dict): Any optional headers to send.
        force_filename (bool): Force the filename specified rather than one the
            server may suggest.
        allow_compression (bool): Allows gzip & deflate decoding.
        handle_redirects (bool): HTTP redirects handled automatically or not.

    Returns:
        Deferred: The filename of the downloaded file.

    Raises:
        t.w.e.PageRedirect: If handle_redirects is False.
        t.w.e.Error: For all other HTTP response errors.
    """
    def log_success(saved_filename):
        # Pass the result through so later callbacks still receive it.
        log.debug('Download success!')
        return saved_filename

    def log_failure(error):
        log.warning(
            'Error occurred downloading file from "%s": %s',
            url, error.getErrorMessage(),
        )
        # Returning the failure keeps the Deferred on its errback chain.
        return error

    download = _download_file(
        url,
        filename,
        callback=callback,
        headers=headers,
        force_filename=force_filename,
        allow_compression=allow_compression,
        handle_redirects=handle_redirects,
    )
    download.addCallbacks(log_success, log_failure)
    return download