Merge pull request #7045 from remitamine/ign

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index b7013a6aaef400633d45ac33c4a7fcb949802be7..1737ac5f6d72a3d53d6454f8eac98bda5032e285 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -373,6 +373,13 @@ def sanitize_path(s):
      return os.path.join(*sanitized_path)
  
  
      return os.path.join(*sanitized_path)
  
  
+# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
+# unwanted failures due to missing protocol
+def sanitized_Request(url, *args, **kwargs):
+    return compat_urllib_request.Request(
+        'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+
+
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -403,7 +410,7 @@ def _htmlentity_transform(entity):
              pass
  
      # Unknown entity in name, return its literal representation
              pass
  
      # Unknown entity in name, return its literal representation
-    return ('&%s;' % entity)
+    return '&%s;' % entity
  
  
  def unescapeHTML(s):
  
  
  def unescapeHTML(s):
@@ -656,6 +663,16 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
      return hc
  
  
      return hc
  
  
+def handle_youtubedl_headers(headers):
+    filtered_headers = headers
+
+    if 'Youtubedl-no-compression' in filtered_headers:
+        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
+        del filtered_headers['Youtubedl-no-compression']
+
+    return filtered_headers
+
+
  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      """Handler for HTTP requests and responses.
  
  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      """Handler for HTTP requests and responses.
  
@@ -663,7 +680,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      the standard headers to every HTTP request and handles gzipped and
      deflated responses from web servers. If compression is to be avoided in
      a particular request, the original request in the program code only has
      the standard headers to every HTTP request and handles gzipped and
      deflated responses from web servers. If compression is to be avoided in
      a particular request, the original request in the program code only has
-    to include the HTTP header "Youtubedl-No-Compression", which will be
+    to include the HTTP header "Youtubedl-no-compression", which will be
      removed before making the real request.
  
      Part of this code was copied from:
      removed before making the real request.
  
      Part of this code was copied from:
@@ -724,10 +741,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
              # The dict keys are capitalized because of this bug by urllib
              if h.capitalize() not in req.headers:
                  req.add_header(h, v)
              # The dict keys are capitalized because of this bug by urllib
              if h.capitalize() not in req.headers:
                  req.add_header(h, v)
-        if 'Youtubedl-no-compression' in req.headers:
-            if 'Accept-encoding' in req.headers:
-                del req.headers['Accept-encoding']
-            del req.headers['Youtubedl-no-compression']
+
+        req.headers = handle_youtubedl_headers(req.headers)
  
          if sys.version_info < (2, 7) and '#' in req.get_full_url():
              # Python 2.6 is brain-dead when it comes to fragments
  
          if sys.version_info < (2, 7) and '#' in req.get_full_url():
              # Python 2.6 is brain-dead when it comes to fragments
@@ -925,6 +940,21 @@ def determine_ext(url, default_ext='unknown_video'):
      guess = url.partition('?')[0].rpartition('.')[2]
      if re.match(r'^[A-Za-z0-9]+$', guess):
          return guess
      guess = url.partition('?')[0].rpartition('.')[2]
      if re.match(r'^[A-Za-z0-9]+$', guess):
          return guess
+    elif guess.rstrip('/') in (
+            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+            'flv', 'f4v', 'f4a', 'f4b',
+            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+            'mkv', 'mka', 'mk3d',
+            'avi', 'divx',
+            'mov',
+            'asf', 'wmv', 'wma',
+            '3gp', '3g2',
+            'mp3',
+            'flac',
+            'ape',
+            'wav',
+            'f4f', 'f4m', 'm3u8', 'smil'):
+        return guess.rstrip('/')
      else:
          return default_ext
  
      else:
          return default_ext
  
@@ -1376,6 +1406,15 @@ def remove_end(s, end):
      return s
  
  
      return s
  
  
+def remove_quotes(s):
+    if s is None or len(s) < 2:
+        return s
+    for quote in ('"', "'", ):
+        if s[0] == quote and s[-1] == quote:
+            return s[1:-1]
+    return s
+
+
  def url_basename(url):
      path = compat_urlparse.urlparse(url).path
      return path.strip('/').split('/')[-1]
  def url_basename(url):
      path = compat_urlparse.urlparse(url).path
      return path.strip('/').split('/')[-1]
@@ -1668,7 +1707,13 @@ def urlencode_postdata(*args, **kargs):
  
  
  def encode_dict(d, encoding='utf-8'):
  
  
  def encode_dict(d, encoding='utf-8'):
-    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
+    def encode(v):
+        return v.encode(encoding) if isinstance(v, compat_basestring) else v
+    return dict((encode(k), encode(v)) for k, v in d.items())
+
+
+def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
+    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
  
  
  US_RATINGS = {
  
  
  US_RATINGS = {
@@ -1765,6 +1810,15 @@ def args_to_str(args):
      return ' '.join(shlex_quote(a) for a in args)
  
  
      return ' '.join(shlex_quote(a) for a in args)
  
  
+def error_to_compat_str(err):
+    err_str = str(err)
+    # On python 2 error byte string must be decoded with proper
+    # encoding rather than ascii
+    if sys.version_info[0] < 3:
+        err_str = err_str.decode(preferredencoding())
+    return err_str
+
+
  def mimetype2ext(mt):
      _, _, res = mt.rpartition('/')
  
  def mimetype2ext(mt):
      _, _, res = mt.rpartition('/')
  
@@ -1935,15 +1989,15 @@ def match_filter_func(filter_str):
  
  def parse_dfxp_time_expr(time_expr):
      if not time_expr:
  
  def parse_dfxp_time_expr(time_expr):
      if not time_expr:
-        return 0.0
+        return
  
      mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
      if mobj:
          return float(mobj.group('time_offset'))
  
  
      mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
      if mobj:
          return float(mobj.group('time_offset'))
  
-    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
      if mobj:
      if mobj:
-        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
  
  
  def srt_subtitles_timecode(seconds):
  
  
  def srt_subtitles_timecode(seconds):
@@ -1979,10 +2033,15 @@ def dfxp2srt(dfxp_data):
          raise ValueError('Invalid dfxp/TTML subtitle')
  
      for para, index in zip(paras, itertools.count(1)):
          raise ValueError('Invalid dfxp/TTML subtitle')
  
      for para, index in zip(paras, itertools.count(1)):
-        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
+        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
          end_time = parse_dfxp_time_expr(para.attrib.get('end'))
          end_time = parse_dfxp_time_expr(para.attrib.get('end'))
+        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
+        if begin_time is None:
+            continue
          if not end_time:
          if not end_time:
-            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
+            if not dur:
+                continue
+            end_time = begin_time + dur
          out.append('%d\n%s --> %s\n%s\n\n' % (
              index,
              srt_subtitles_timecode(begin_time),
          out.append('%d\n%s --> %s\n%s\n\n' % (
              index,
              srt_subtitles_timecode(begin_time),