Switch codebase to use sanitized_Request instead of compat_urllib_request.Request

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 65556d056a7edfe12b94e82d0f55baa93360aa70..d7b737e21639679fe665d22e6ca4089f5c791618 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -259,15 +259,6 @@ def get_element_by_attribute(attribute, value, html):
      return unescapeHTML(res)
  
  
-def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
-    attributes = re.findall(attributes_regex, attributes_str)
-    attributes_dict = {}
-    if attributes:
-        for (attribute_name, attribute_value) in attributes:
-            attributes_dict[attribute_name] = attribute_value
-    return attributes_dict
-
-
  def clean_html(html):
      """Clean an HTML snippet into a readable string"""
  
@@ -382,6 +373,13 @@ def sanitize_path(s):
      return os.path.join(*sanitized_path)
  
  
+# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
+# unwanted failures due to missing protocol
+def sanitized_Request(url, *args, **kwargs):
+    return compat_urllib_request.Request(
+        'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
+
+
  def orderedSet(iterable):
      """ Remove all duplicates from the input iterable """
      res = []
@@ -405,10 +403,14 @@ def _htmlentity_transform(entity):
              numstr = '0%s' % numstr
          else:
              base = 10
-        return compat_chr(int(numstr, base))
+        # See https://github.com/rg3/youtube-dl/issues/7518
+        try:
+            return compat_chr(int(numstr, base))
+        except ValueError:
+            pass
  
      # Unknown entity in name, return its literal representation
-    return ('&%s;' % entity)
+    return '&%s;' % entity
  
  
  def unescapeHTML(s):
@@ -930,6 +932,21 @@ def determine_ext(url, default_ext='unknown_video'):
      guess = url.partition('?')[0].rpartition('.')[2]
      if re.match(r'^[A-Za-z0-9]+$', guess):
          return guess
+    elif guess.rstrip('/') in (
+            'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
+            'flv', 'f4v', 'f4a', 'f4b',
+            'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
+            'mkv', 'mka', 'mk3d',
+            'avi', 'divx',
+            'mov',
+            'asf', 'wmv', 'wma',
+            '3gp', '3g2',
+            'mp3',
+            'flac',
+            'ape',
+            'wav',
+            'f4f', 'f4m', 'm3u8', 'smil'):
+        return guess.rstrip('/')
      else:
          return default_ext
  
@@ -1673,7 +1690,9 @@ def urlencode_postdata(*args, **kargs):
  
  
  def encode_dict(d, encoding='utf-8'):
-    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
+    def encode(v):
+        return v.encode(encoding) if isinstance(v, compat_basestring) else v
+    return dict((encode(k), encode(v)) for k, v in d.items())
  
  
  US_RATINGS = {