Merge branch 'jukebox' of https://github.com/remitamine/youtube-dl into remitamine...

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 88f9f90707fbd04966638125e94b264af95fedcb..0ed6c45c8090e7d328ec4e09d12875adc768c0f9 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3,6 +3,7 @@
  
  from __future__ import unicode_literals
  
+import base64
  import calendar
  import codecs
  import contextlib
@@ -35,6 +36,7 @@ import zlib
  from .compat import (
      compat_basestring,
      compat_chr,
+    compat_etree_fromstring,
      compat_html_entities,
      compat_http_client,
      compat_kwargs,
@@ -139,21 +141,24 @@ def write_json_file(obj, fn):
  
  
  if sys.version_info >= (2, 7):
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
          """ Find the xpath xpath[@key=val] """
-        assert re.match(r'^[a-zA-Z-]+$', key)
-        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
-        expr = xpath + "[@%s='%s']" % (key, val)
+        assert re.match(r'^[a-zA-Z_-]+$', key)
+        if val:
+            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
+        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
          return node.find(expr)
  else:
-    def find_xpath_attr(node, xpath, key, val):
+    def find_xpath_attr(node, xpath, key, val=None):
          # Here comes the crazy part: In 2.6, if the xpath is a unicode,
          # .//node does not match if a node is a direct child of . !
          if isinstance(xpath, compat_str):
              xpath = xpath.encode('ascii')
  
          for f in node.findall(xpath):
-            if f.attrib.get(key) == val:
+            if key not in f.attrib:
+                continue
+            if val is None or f.attrib.get(key) == val:
                  return f
          return None
  
@@ -173,12 +178,21 @@ def xpath_with_ns(path, ns_map):
      return '/'.join(replaced)
  
  
-def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
-    if sys.version_info < (2, 7):  # Crazy 2.6
-        xpath = xpath.encode('ascii')
+def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+    def _find_xpath(xpath):
+        if sys.version_info < (2, 7):  # Crazy 2.6
+            xpath = xpath.encode('ascii')
+        return node.find(xpath)
  
-    n = node.find(xpath)
-    if n is None or n.text is None:
+    if isinstance(xpath, (str, compat_str)):
+        n = _find_xpath(xpath)
+    else:
+        for xp in xpath:
+            n = _find_xpath(xp)
+            if n is not None:
+                break
+
+    if n is None:
          if default is not NO_DEFAULT:
              return default
          elif fatal:
@@ -186,9 +200,37 @@ def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
              raise ExtractorError('Could not find XML element %s' % name)
          else:
              return None
+    return n
+
+
+def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
+    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
+    if n is None or n == default:
+        return n
+    if n.text is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = xpath if name is None else name
+            raise ExtractorError('Could not find XML element\'s text %s' % name)
+        else:
+            return None
      return n.text
  
  
+def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
+    n = find_xpath_attr(node, xpath, key)
+    if n is None:
+        if default is not NO_DEFAULT:
+            return default
+        elif fatal:
+            name = '%s[@%s]' % (xpath, key) if name is None else name
+            raise ExtractorError('Could not find XML attribute %s' % name)
+        else:
+            return None
+    return n.attrib[key]
+
+
  def get_element_by_id(id, html):
      """Return the content of the tag with the specified ID in the passed HTML document"""
      return get_element_by_attribute("id", id, html)
@@ -324,13 +366,20 @@ def sanitize_path(s):
      if drive_or_unc:
          norm_path.pop(0)
      sanitized_path = [
-        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
+        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
          for path_part in norm_path]
      if drive_or_unc:
          sanitized_path.insert(0, drive_or_unc + os.path.sep)
      return os.path.join(*sanitized_path)
  
  
+# Prepend the `http:` scheme to protocol-less URLs (those starting with `//`) in
+# order to reduce the number of unwanted failures caused by a missing protocol
+def sanitized_Request(url, *args, **kwargs):
+    return compat_urllib_request.Request(
+        'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+
+
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -354,10 +403,14 @@ def _htmlentity_transform(entity):
              numstr = '0%s' % numstr
          else:
              base = 10
-        return compat_chr(int(numstr, base))
+        # See https://github.com/rg3/youtube-dl/issues/7518
+        try:
+            return compat_chr(int(numstr, base))
+        except ValueError:
+            pass
  
      # Unknown entity in name, return its literal representation
-    return ('&%s;' % entity)
+    return '&%s;' % entity
  
  
  def unescapeHTML(s):
@@ -584,6 +637,11 @@ class ContentTooShortError(Exception):
  
  
  def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
+    # Work around a Python 2 bug (see http://bugs.python.org/issue17849) by limiting
+    # expected HTTP responses to those that meet HTTP/1.0 or later (see also
+    # https://github.com/rg3/youtube-dl/issues/6727)
+    if sys.version_info < (3, 0):
+        kwargs[b'strict'] = True
      hc = http_class(*args, **kwargs)
      source_address = ydl_handler._params.get('source_address')
      if source_address is not None:
@@ -605,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
      return hc
  
  
+def handle_youtubedl_headers(headers):
+    filtered_headers = headers
+
+    if 'Youtubedl-no-compression' in filtered_headers:
+        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
+        del filtered_headers['Youtubedl-no-compression']
+
+    return filtered_headers
+
+
  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      """Handler for HTTP requests and responses.
  
@@ -612,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      the standard headers to every HTTP request and handles gzipped and
      deflated responses from web servers. If compression is to be avoided in
      a particular request, the original request in the program code only has
-    to include the HTTP header "Youtubedl-No-Compression", which will be
+    to include the HTTP header "Youtubedl-no-compression", which will be
      removed before making the real request.
  
      Part of this code was copied from:
@@ -648,15 +716,33 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
          return ret
  
      def http_request(self, req):
+        # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
+        # always respected by websites: some give out URLs with non-ASCII characters that are
+        # not percent-encoded (see telemb.py, ard.py [#3412]).
+        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991).
+        # To work around the aforementioned issue we replace the request's original URL with
+        # a percent-encoded one.
+        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09),
+        # the code of this workaround has been moved here from YoutubeDL.urlopen().
+        url = req.get_full_url()
+        url_escaped = escape_url(url)
+
+        # Substitute URL if any change after escaping
+        if url != url_escaped:
+            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+            new_req = req_type(
+                url_escaped, data=req.data, headers=req.headers,
+                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+            new_req.timeout = req.timeout
+            req = new_req
+
          for h, v in std_headers.items():
              # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
              # The dict keys are capitalized because of this bug by urllib
              if h.capitalize() not in req.headers:
                  req.add_header(h, v)
-        if 'Youtubedl-no-compression' in req.headers:
-            if 'Accept-encoding' in req.headers:
-                del req.headers['Accept-encoding']
-            del req.headers['Youtubedl-no-compression']
+
+        req.headers = handle_youtubedl_headers(req.headers)
  
          if sys.version_info < (2, 7) and '#' in req.get_full_url():
              # Python 2.6 is brain-dead when it comes to fragments
@@ -687,11 +773,25 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                      raise original_ioerror
              resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
+            del resp.headers['Content-encoding']
          # deflate
          if resp.headers.get('Content-encoding', '') == 'deflate':
              gz = io.BytesIO(self.deflate(resp.read()))
              resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
+            del resp.headers['Content-encoding']
+        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
+        # https://github.com/rg3/youtube-dl/issues/6457).
+        if 300 <= resp.code < 400:
+            location = resp.headers.get('Location')
+            if location:
+                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
+                if sys.version_info >= (3, 0):
+                    location = location.encode('iso-8859-1').decode('utf-8')
+                location_escaped = escape_url(location)
+                if location != location_escaped:
+                    del resp.headers['Location']
+                    resp.headers['Location'] = location_escaped
          return resp
  
      https_request = http_request
@@ -715,15 +815,41 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
              req, **kwargs)
  
  
+class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
+    def __init__(self, cookiejar=None):
+        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
+
+    def http_response(self, request, response):
+        # Python 2 chokes on the next HTTP request if the Set-Cookie HTTP header
+        # of the last response contains non-ASCII characters (see
+        # https://github.com/rg3/youtube-dl/issues/6769).
+        # Percent-encoding the Set-Cookie header before HTTPCookieProcessor processes
+        # it would at least prevent the crash; that workaround is commented out below.
+        # if sys.version_info < (3, 0) and response.headers:
+        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
+        #         set_cookie = response.headers.get(set_cookie_header)
+        #         if set_cookie:
+        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
+        #             if set_cookie != set_cookie_escaped:
+        #                 del response.headers[set_cookie_header]
+        #                 response.headers[set_cookie_header] = set_cookie_escaped
+        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
+
+    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
+    https_response = http_response
+
+
  def parse_iso8601(date_str, delimiter='T', timezone=None):
      """ Return a UNIX timestamp from the given date """
  
      if date_str is None:
          return None
  
+    date_str = re.sub(r'\.[0-9]+', '', date_str)
+
      if timezone is None:
          m = re.search(
-            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
              date_str)
          if not m:
              timezone = datetime.timedelta()
@@ -736,9 +862,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
                  timezone = datetime.timedelta(
                      hours=sign * int(m.group('hours')),
                      minutes=sign * int(m.group('minutes')))
-    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
-    dt = datetime.datetime.strptime(date_str, date_format) - timezone
-    return calendar.timegm(dt.timetuple())
+    try:
+        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+        dt = datetime.datetime.strptime(date_str, date_format) - timezone
+        return calendar.timegm(dt.timetuple())
+    except ValueError:
+        pass
  
  
  def unified_strdate(date_str, day_first=True):
@@ -803,7 +932,8 @@ def unified_strdate(date_str, day_first=True):
          timetuple = email.utils.parsedate_tz(date_str)
          if timetuple:
              upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
-    return upload_date
+    if upload_date is not None:
+        return compat_str(upload_date)
  
  
  def determine_ext(url, default_ext='unknown_video'):
@@ -812,6 +942,21 @@ def determine_ext(url, default_ext='unknown_video'):
      guess = url.partition('?')[0].rpartition('.')[2]
      if re.match(r'^[A-Za-z0-9]+$', guess):
          return guess
+    elif guess.rstrip('/') in (
+            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+            'flv', 'f4v', 'f4a', 'f4b',
+            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+            'mkv', 'mka', 'mk3d',
+            'avi', 'divx',
+            'mov',
+            'asf', 'wmv', 'wma',
+            '3gp', '3g2',
+            'mp3',
+            'flac',
+            'ape',
+            'wav',
+            'f4f', 'f4m', 'm3u8', 'smil'):
+        return guess.rstrip('/')
      else:
          return default_ext
  
@@ -1263,6 +1408,15 @@ def remove_end(s, end):
      return s
  
  
+def remove_quotes(s):
+    if s is None or len(s) < 2:
+        return s
+    for quote in ('"', "'", ):
+        if s[0] == quote and s[-1] == quote:
+            return s[1:-1]
+    return s
+
+
  def url_basename(url):
      path = compat_urlparse.urlparse(url).path
      return path.strip('/').split('/')[-1]
@@ -1279,7 +1433,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
              v = getattr(v, get_attr, None)
      if v == '':
          v = None
-    return default if v is None else (int(v) * invscale // scale)
+    if v is None:
+        return default
+    try:
+        return int(v) * invscale // scale
+    except ValueError:
+        return default
  
  
  def str_or_none(v, default=None):
@@ -1295,7 +1454,12 @@ def str_to_int(int_str):
  
  
  def float_or_none(v, scale=1, invscale=1, default=None):
-    return default if v is None else (float(v) * invscale / scale)
+    if v is None:
+        return default
+    try:
+        return float(v) * invscale / scale
+    except ValueError:
+        return default
  
  
  def parse_duration(s):
@@ -1544,27 +1708,14 @@ def urlencode_postdata(*args, **kargs):
      return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
  
  
-try:
-    etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError:  # Python <=2.6
-    etree_iter = lambda n: n.findall('.//*')
+def encode_dict(d, encoding='utf-8'):
+    def encode(v):
+        return v.encode(encoding) if isinstance(v, compat_basestring) else v
+    return dict((encode(k), encode(v)) for k, v in d.items())
  
  
-def parse_xml(s):
-    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
-        def doctype(self, name, pubid, system):
-            pass  # Ignore doctypes
-
-    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
-    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
-    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
-    # Fix up XML parser in Python 2.x
-    if sys.version_info < (3, 0):
-        for n in etree_iter(tree):
-            if n.text is not None:
-                if not isinstance(n.text, compat_str):
-                    n.text = n.text.decode('utf-8')
-    return tree
+def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
+    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
  
  
  US_RATINGS = {
@@ -1594,8 +1745,8 @@ def js_to_json(code):
          if v in ('true', 'false', 'null'):
              return v
          if v.startswith('"'):
-            return v
-        if v.startswith("'"):
+            v = re.sub(r"\\'", "'", v[1:-1])
+        elif v.startswith("'"):
              v = v[1:-1]
              v = re.sub(r"\\\\|\\'|\"", lambda m: {
                  '\\\\': '\\\\',
@@ -1661,6 +1812,15 @@ def args_to_str(args):
      return ' '.join(shlex_quote(a) for a in args)
  
  
+def error_to_compat_str(err):
+    err_str = str(err)
+    # On Python 2 the error byte string must be decoded with the proper
+    # encoding rather than ASCII
+    if sys.version_info[0] < 3:
+        err_str = err_str.decode(preferredencoding())
+    return err_str
+
+
  def mimetype2ext(mt):
      _, _, res = mt.rpartition('/')
  
@@ -1689,6 +1849,10 @@ def urlhandle_detect_ext(url_handle):
      return mimetype2ext(getheader('Content-Type'))
  
  
+def encode_data_uri(data, mime_type):
+    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
  def age_restricted(content_limit, age_limit):
      """ Returns True iff the content should be blocked """
  
@@ -1827,15 +1991,15 @@ def match_filter_func(filter_str):
  
  def parse_dfxp_time_expr(time_expr):
      if not time_expr:
-        return 0.0
+        return
  
      mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
      if mobj:
          return float(mobj.group('time_offset'))
  
-    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
      if mobj:
-        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  
  
  def srt_subtitles_timecode(seconds):
@@ -1863,7 +2027,7 @@ def dfxp2srt(dfxp_data):
  
          return out
  
-    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
      out = []
      paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
  
@@ -1871,10 +2035,15 @@ def dfxp2srt(dfxp_data):
          raise ValueError('Invalid dfxp/TTML subtitle')
  
      for para, index in zip(paras, itertools.count(1)):
-        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
          end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
+        if begin_time is None:
+            continue
          if not end_time:
-            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
+            if not dur:
+                continue
+            end_time = begin_time + dur
          out.append('%d\n%s --> %s\n%s\n\n' % (
              index,
              srt_subtitles_timecode(begin_time),
@@ -1884,6 +2053,32 @@ def dfxp2srt(dfxp_data):
      return ''.join(out)
  
  
+def cli_option(params, command_option, param):
+    param = params.get(param)
+    return [command_option, param] if param is not None else []
+
+
+def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
+    param = params.get(param)
+    assert isinstance(param, bool)
+    if separator:
+        return [command_option + separator + (true_value if param else false_value)]
+    return [command_option, true_value if param else false_value]
+
+
+def cli_valueless_option(params, command_option, param, expected_value=True):
+    param = params.get(param)
+    return [command_option] if param == expected_value else []
+
+
+def cli_configuration_args(params, param, default=[]):
+    ex_args = params.get(param)
+    if ex_args is None:
+        return default
+    assert isinstance(ex_args, list)
+    return ex_args
+
+
  class ISO639Utils(object):
      # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
      _lang_map = {