Merge branch 'ping-viki-shows'

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 5e1c4525db400941c8b312fa825debfe9a1d11e6..52d198fa3c2eb36a1a3d41620cd645b90d52f854 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -37,6 +37,7 @@ from .compat import (
      compat_chr,
      compat_html_entities,
      compat_http_client,
+    compat_kwargs,
      compat_parse_qs,
      compat_socket_create_connection,
      compat_str,
@@ -114,7 +115,7 @@ def write_json_file(obj, fn):
              'encoding': 'utf-8',
          })
  
-    tf = tempfile.NamedTemporaryFile(**args)
+    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
  
      try:
          with tf:
@@ -326,13 +327,6 @@ def sanitize_path(s):
      return os.path.join(*sanitized_path)
  
  
-def sanitize_url_path_consecutive_slashes(url):
-    """Collapses consecutive slashes in URLs' path"""
-    parsed_url = list(compat_urlparse.urlparse(url))
-    parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
-    return compat_urlparse.urlunparse(parsed_url)
-
-
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -371,6 +365,18 @@ def unescapeHTML(s):
          r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
  
  
+def get_subprocess_encoding():
+    """Return the name of the encoding to use for subprocess calls.
+
+    On win32 with a major Windows version of 5 or above this is the value of
+    preferredencoding(); everywhere else it is sys.getfilesystemencoding().
+    'utf-8' is returned when the chosen source reports None.
+    """
+    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
+    # For subprocess calls, encode with locale encoding
+    # Refer to http://stackoverflow.com/a/9951851/35070
+    encoding = preferredencoding() if on_modern_windows else sys.getfilesystemencoding()
+    return 'utf-8' if encoding is None else encoding
+
+
  def encodeFilename(s, for_subprocess=False):
      """
      @param s The name of the file
@@ -382,21 +388,24 @@ def encodeFilename(s, for_subprocess=False):
      if sys.version_info >= (3, 0):
          return s
  
-    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
-        # Pass '' directly to use Unicode APIs on Windows 2000 and up
-        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
-        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
-        if not for_subprocess:
-            return s
-        else:
-            # For subprocess calls, encode with locale encoding
-            # Refer to http://stackoverflow.com/a/9951851/35070
-            encoding = preferredencoding()
-    else:
-        encoding = sys.getfilesystemencoding()
-    if encoding is None:
-        encoding = 'utf-8'
-    return s.encode(encoding, 'ignore')
+    # Pass the string through unencoded to use Unicode APIs on Windows 2000 and up
+    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
+    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
+    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
+        return s
+
+    return s.encode(get_subprocess_encoding(), 'ignore')
+
+
+def decodeFilename(b, for_subprocess=False):
+    """Decode a byte-string filename into text on Python 2.
+
+    On Python 3, and for any value that is not a bytes instance, the
+    argument is returned unchanged. Otherwise it is decoded with the
+    encoding from get_subprocess_encoding(), ignoring undecodable bytes.
+    for_subprocess is accepted but not consulted here.
+    """
+    needs_decoding = sys.version_info < (3, 0) and isinstance(b, bytes)
+    if not needs_decoding:
+        return b
+
+    return b.decode(get_subprocess_encoding(), 'ignore')
  
  
  def encodeArgument(s):
@@ -408,6 +417,10 @@ def encodeArgument(s):
      return encodeFilename(s, True)
  
  
+def decodeArgument(b):
+    """Decode b via decodeFilename with for_subprocess enabled."""
+    return decodeFilename(b, for_subprocess=True)
+
+
  def decodeOption(optval):
      if optval is None:
          return optval
@@ -1109,15 +1122,6 @@ def shell_quote(args):
      return ' '.join(quoted_args)
  
  
-def takewhile_inclusive(pred, seq):
-    """ Like itertools.takewhile, but include the latest evaluated element
-        (the first element so that Not pred(e)) """
-    for e in seq:
-        yield e
-        if not pred(e):
-            return
-
-
  def smuggle_url(url, data):
      """ Pass additional data in a URL for internal use. """
  
@@ -1338,9 +1342,19 @@ def parse_duration(s):
      return res
  
  
-def prepend_extension(filename, ext):
+def prepend_extension(filename, ext, expected_real_ext=None):
+    """Insert ext in front of filename's existing extension.
+
+    When expected_real_ext is given and the existing extension (without its
+    leading dot) differs from it, ext is appended to the whole filename
+    instead.
+    """
      name, real_ext = os.path.splitext(filename)
-    return '{0}.{1}{2}'.format(name, ext, real_ext)
+    return (
+        '{0}.{1}{2}'.format(name, ext, real_ext)
+        if not expected_real_ext or real_ext[1:] == expected_real_ext
+        else '{0}.{1}'.format(filename, ext))
+
+
+def replace_extension(filename, ext, expected_real_ext=None):
+    """Swap filename's existing extension for ext.
+
+    When expected_real_ext is given and the existing extension (without its
+    leading dot) differs from it, nothing is stripped and ext is appended
+    to the whole filename instead.
+    """
+    stem, current_ext = os.path.splitext(filename)
+    if expected_real_ext and current_ext[1:] != expected_real_ext:
+        # Unexpected extension: keep the full name as the base.
+        stem = filename
+    return '{0}.{1}'.format(stem, ext)
  
  
  def check_executable(exe, args=[]):
@@ -1359,7 +1373,7 @@ def get_exe_version(exe, args=['--version'],
      or False if the executable is not present """
      try:
          out, _ = subprocess.Popen(
-            [exe] + args,
+            [encodeArgument(exe)] + args,
              stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
      except OSError:
          return False
@@ -1465,6 +1479,14 @@ def uppercase_escape(s):
          s)
  
  
+def lowercase_escape(s):
+    """Expand every backslash-u escape with four hex digits in s into its character."""
+    decode_escape = codecs.getdecoder('unicode_escape')
+
+    def expand(match):
+        # The decoder returns (text, consumed_length); only the text is needed.
+        return decode_escape(match.group(0))[0]
+
+    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
+
+
  def escape_rfc3986(s):
      """Escape non-ASCII characters as suggested by RFC 3986"""
      if sys.version_info < (3, 0) and isinstance(s, compat_str):
@@ -1643,6 +1665,7 @@ def mimetype2ext(mt):
      return {
          'x-ms-wmv': 'wmv',
          'x-mp4-fragmented': 'mp4',
+        'ttml+xml': 'ttml',
      }.get(res, res)
  
  
@@ -1813,12 +1836,8 @@ def parse_dfxp_time_expr(time_expr):
          return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
  
  
-def format_srt_time(seconds):
-    (mins, secs) = divmod(seconds, 60)
-    (hours, mins) = divmod(mins, 60)
-    millisecs = (secs - int(secs)) * 1000
-    secs = int(secs)
-    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+def srt_subtitles_timecode(seconds):
+    """Format a time offset in seconds as an SRT timecode 'HH:MM:SS,mmm'.
+
+    The value is rounded to whole milliseconds before being split into
+    fields, so binary float error (e.g. 1.001 stored as 1.000999...) cannot
+    drop a millisecond the way truncating each field separately does.
+    """
+    total_ms = int(round(seconds * 1000))
+    secs, millis = divmod(total_ms, 1000)
+    mins, secs = divmod(secs, 60)
+    hours, mins = divmod(mins, 60)
+    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millis)
  
  
  def dfxp2srt(dfxp_data):
@@ -1830,9 +1849,9 @@ def dfxp2srt(dfxp_data):
          out = str_or_empty(node.text)
  
          for child in node:
-            if child.tag == _x('ttml:br'):
+            if child.tag in (_x('ttml:br'), 'br'):
                  out += '\n' + str_or_empty(child.tail)
-            elif child.tag == _x('ttml:span'):
+            elif child.tag in (_x('ttml:span'), 'span'):
                  out += str_or_empty(parse_node(child))
              else:
                  out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1841,13 +1860,20 @@ def dfxp2srt(dfxp_data):
  
      dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
      out = []
-    paras = dfxp.findall(_x('.//ttml:p'))
+    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+
+    if not paras:
+        raise ValueError('Invalid dfxp/TTML subtitle')
  
      for para, index in zip(paras, itertools.count(1)):
+        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        if not end_time:
+            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
          out.append('%d\n%s --> %s\n%s\n\n' % (
              index,
-            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
-            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
+            srt_subtitles_timecode(begin_time),
+            srt_subtitles_timecode(end_time),
              parse_node(para)))
  
      return ''.join(out)