Merge pull request #9110 from remitamine/parse_duration

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index bad1c4ea87dd086a34d93f6872d9ce5d9f2209f9..f333e471275a69cbd158828c90f0ed1b5522582f 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -47,6 +47,7 @@ from .compat import (
      compat_str,
      compat_urllib_error,
      compat_urllib_parse,
+    compat_urllib_parse_urlencode,
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urlparse,
@@ -416,9 +417,12 @@ def sanitize_path(s):
  
  # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
  # unwanted failures due to missing protocol
+def sanitize_url(url):
+    return 'http:%s' % url if url.startswith('//') else url
+
+
  def sanitized_Request(url, *args, **kwargs):
-    return compat_urllib_request.Request(
-        'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
  
  
  def orderedSet(iterable):
@@ -774,12 +778,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
  
          # Substitute URL if any change after escaping
          if url != url_escaped:
-            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
-            new_req = req_type(
-                url_escaped, data=req.data, headers=req.headers,
-                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
-            new_req.timeout = req.timeout
-            req = new_req
+            req = update_Request(req, url=url_escaped)
  
          for h, v in std_headers.items():
              # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
@@ -1315,7 +1314,7 @@ def shell_quote(args):
  def smuggle_url(url, data):
      """ Pass additional data in a URL for internal use. """
  
-    sdata = compat_urllib_parse.urlencode(
+    sdata = compat_urllib_parse_urlencode(
          {'__youtubedl_smuggle': json.dumps(data)})
      return url + '#' + sdata
  
@@ -1346,7 +1345,7 @@ def format_bytes(bytes):
  def lookup_unit_table(unit_table, s):
      units_re = '|'.join(re.escape(u) for u in unit_table)
      m = re.match(
-        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)$' % units_re, s)
+        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
      if not m:
          return None
      num_str = m.group('num').replace(',', '.')
@@ -1541,44 +1540,46 @@ def parse_duration(s):
  
      s = s.strip()
  
-    m = re.match(
-        r'''(?ix)(?:P?T)?
-        (?:
-            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
-            (?P<only_hours>[0-9.]+)\s*(?:hours?)|
-
-            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
-            (?:
+    days, hours, mins, secs, ms = [None] * 5
+    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
+    if m:
+        days, hours, mins, secs, ms = m.groups()
+    else:
+        m = re.match(
+            r'''(?ix)(?:P?T)?
                  (?:
-                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
-                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
+                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                  )?
-                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
-            )?
-            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
-        )$''', s)
-    if not m:
-        return None
-    res = 0
-    if m.group('only_mins'):
-        return float_or_none(m.group('only_mins'), invscale=60)
-    if m.group('only_hours'):
-        return float_or_none(m.group('only_hours'), invscale=60 * 60)
-    if m.group('secs'):
-        res += int(m.group('secs'))
-    if m.group('mins_reversed'):
-        res += int(m.group('mins_reversed')) * 60
-    if m.group('mins'):
-        res += int(m.group('mins')) * 60
-    if m.group('hours'):
-        res += int(m.group('hours')) * 60 * 60
-    if m.group('hours_reversed'):
-        res += int(m.group('hours_reversed')) * 60 * 60
-    if m.group('days'):
-        res += int(m.group('days')) * 24 * 60 * 60
-    if m.group('ms'):
-        res += float(m.group('ms'))
-    return res
+                (?:
+                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
+                )?
+                (?:
+                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
+                )?
+                (?:
+                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
+                )?$''', s)
+        if m:
+            days, hours, mins, secs, ms = m.groups()
+        else:
+            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
+            if m:
+                hours, mins = m.groups()
+            else:
+                return None
+
+    duration = 0
+    if secs:
+        duration += float(secs)
+    if mins:
+        duration += float(mins) * 60
+    if hours:
+        duration += float(hours) * 60 * 60
+    if days:
+        duration += float(days) * 24 * 60 * 60
+    if ms:
+        duration += float(ms)
+    return duration
  
  
  def prepend_extension(filename, ext, expected_real_ext=None):
@@ -1746,6 +1747,7 @@ def escape_url(url):
      """Escape URL as suggested by RFC 3986"""
      url_parsed = compat_urllib_parse_urlparse(url)
      return url_parsed._replace(
+        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
          path=escape_rfc3986(url_parsed.path),
          params=escape_rfc3986(url_parsed.params),
          query=escape_rfc3986(url_parsed.query),
@@ -1755,7 +1757,8 @@ def escape_url(url):
  try:
      struct.pack('!I', 0)
  except TypeError:
-    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
+    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
+    # See https://bugs.python.org/issue19099
      def struct_pack(spec, *args):
          if isinstance(spec, compat_str):
              spec = spec.encode('ascii')
@@ -1787,22 +1790,31 @@ def read_batch_urls(batch_fd):
  
  
  def urlencode_postdata(*args, **kargs):
-    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
+    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
  
  
  def update_url_query(url, query):
+    if not query:
+        return url
      parsed_url = compat_urlparse.urlparse(url)
      qs = compat_parse_qs(parsed_url.query)
      qs.update(query)
-    qs = encode_dict(qs)
      return compat_urlparse.urlunparse(parsed_url._replace(
-        query=compat_urllib_parse.urlencode(qs, True)))
+        query=compat_urllib_parse_urlencode(qs, True)))
  
  
-def encode_dict(d, encoding='utf-8'):
-    def encode(v):
-        return v.encode(encoding) if isinstance(v, compat_basestring) else v
-    return dict((encode(k), encode(v)) for k, v in d.items())
+def update_Request(req, url=None, data=None, headers={}, query={}):
+    req_headers = req.headers.copy()
+    req_headers.update(headers)
+    req_data = data or req.data
+    req_url = update_url_query(url or req.get_full_url(), query)
+    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+    new_req = req_type(
+        req_url, data=req_data, headers=req_headers,
+        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
+    if hasattr(req, 'timeout'):
+        new_req.timeout = req.timeout
+    return new_req
  
  
  def dict_get(d, key_or_keys, default=None, skip_false_values=True):
@@ -2123,6 +2135,7 @@ def dfxp2srt(dfxp_data):
      _x = functools.partial(xpath_with_ns, ns_map={
          'ttml': 'http://www.w3.org/ns/ttml',
          'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
+        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
      })
  
      class TTMLPElementParser(object):
@@ -2149,7 +2162,7 @@ def dfxp2srt(dfxp_data):
  
      dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
      out = []
-    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
  
      if not paras:
          raise ValueError('Invalid dfxp/TTML subtitle')