[utils] Don't transform numbers not starting with a zero

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 26f21602c0a1162bdcef711d87c87fd35481fb3d..82f67f6cdca2345d8dff1a10dfe2444a898362b9 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
      compat_chr,
      compat_etree_fromstring,
      compat_html_entities,
+    compat_html_entities_html5,
      compat_http_client,
      compat_kwargs,
      compat_parse_qs,
@@ -75,7 +76,7 @@ def register_socks_protocols():
  compiled_regex_type = type(re.compile(''))
  
  std_headers = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate',
@@ -105,9 +106,9 @@ KNOWN_EXTENSIONS = (
      'f4f', 'f4m', 'm3u8', 'smil')
  
  # needed for sanitizing filenames in restricted mode
-ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
-                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
-                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
  
  
  def preferredencoding():
@@ -456,12 +457,19 @@ def orderedSet(iterable):
      return res
  
  
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
      """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
      # Known non-numeric HTML entity
      if entity in compat_html_entities.name2codepoint:
          return compat_chr(compat_html_entities.name2codepoint[entity])
  
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Éric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
      mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
      if mobj is not None:
          numstr = mobj.group(1)
@@ -486,7 +494,7 @@ def unescapeHTML(s):
      assert type(s) == compat_str
  
      return re.sub(
-        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  
  
  def get_subprocess_encoding():
@@ -1893,6 +1901,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
      return d.get(key_or_keys, default)
  
  
+def try_get(src, getter, expected_type=None):
+    try:
+        v = getter(src)
+    except (AttributeError, KeyError, TypeError, IndexError):
+        pass
+    else:
+        if expected_type is None or isinstance(v, expected_type):
+            return v
+
+
  def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
      return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
  
@@ -1952,7 +1970,7 @@ def js_to_json(code):
          '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
          /\*.*?\*/|,(?=\s*[\]}])|
          [a-zA-Z_][.a-zA-Z_0-9]*|
-        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
          [0-9]+(?=\s*:)
          ''', fix_kv, code)
  
@@ -2020,6 +2038,9 @@ def mimetype2ext(mt):
  
      ext = {
          'audio/mp4': 'm4a',
+        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
+        # it's the most popular one
+        'audio/mpeg': 'mp3',
      }.get(mt)
      if ext is not None:
          return ext