X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=f77ab865093604f217ab6b2da824f340327df652;hb=daa0df9e8beac1325e5fb55d828e7a3a38e74bf6;hp=52a20632fab160d93edef8f1ac1e0e0a77b12c0a;hpb=bd1e484448c84904ce0d99fe05c3721053aa3c00;p=youtube-dl

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 52a20632f..f77ab8650 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
     compat_chr,
     compat_etree_fromstring,
     compat_html_entities,
+    compat_html_entities_html5,
     compat_http_client,
     compat_kwargs,
     compat_parse_qs,
@@ -105,9 +106,9 @@ KNOWN_EXTENSIONS = (
     'f4f', 'f4m', 'm3u8', 'smil')
 
 # needed for sanitizing filenames in restricted mode
-ACCENT_CHARS = dict(zip('ÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÅÃÃÃÃÃÃÃÃ Ã¡Ã¢Ã£Ã¤Ã¥Ã¦Ã§Ã¨Ã©ÃªÃ«Ã¬Ã­Ã®Ã¯Ã°Ã±Ã²Ã³Ã´ÃµÃ¶Ã¸ÅÃ¹ÃºÃ»Ã¼Ã½Ã¾Ã¿',
-                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
-                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
+ACCENT_CHARS = dict(zip('ÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÃÅÃÅÃÃÃÃÅ°ÃÃÃÃ Ã¡Ã¢Ã£Ã¤Ã¥Ã¦Ã§Ã¨Ã©ÃªÃ«Ã¬Ã­Ã®Ã¯Ã°Ã±Ã²Ã³Ã´ÃµÃ¶ÅÃ¸ÅÃ¹ÃºÃ»Ã¼Å±Ã½Ã¾Ã¿',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
 
 
 def preferredencoding():
@@ -456,12 +457,19 @@ def orderedSet(iterable):
     return res
 
 
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
     """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
     # Known non-numeric HTML entity
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Ãric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
@@ -486,7 +494,7 @@ def unescapeHTML(s):
     assert type(s) == compat_str
 
     return re.sub(
-        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
 
 
 def get_subprocess_encoding():
@@ -861,9 +869,13 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                 if sys.version_info >= (3, 0):
                     location = location.encode('iso-8859-1').decode('utf-8')
+                else:
+                    location = location.decode('utf-8')
                 location_escaped = escape_url(location)
                 if location != location_escaped:
                     del resp.headers['Location']
+                    if sys.version_info < (3, 0):
+                        location_escaped = location_escaped.encode('utf-8')
                     resp.headers['Location'] = location_escaped
         return resp
 
@@ -883,12 +895,17 @@ def make_socks_conn_class(base_class, socks_proxy):
     elif url_components.scheme.lower() == 'socks4a':
         socks_type = ProxyType.SOCKS4A
 
+    def unquote_if_non_empty(s):
+        if not s:
+            return s
+        return compat_urllib_parse_unquote_plus(s)
+
     proxy_args = (
         socks_type,
         url_components.hostname, url_components.port or 1080,
         True,  # Remote DNS
-        compat_urllib_parse_unquote_plus(url_components.username),
-        compat_urllib_parse_unquote_plus(url_components.password),
+        unquote_if_non_empty(url_components.username),
+        unquote_if_non_empty(url_components.password),
     )
 
     class SocksConnection(base_class):
@@ -1030,6 +1047,7 @@ def unified_strdate(date_str, day_first=True):
         format_expressions.extend([
             '%d-%m-%Y',
             '%d.%m.%Y',
+            '%d.%m.%y',
             '%d/%m/%Y',
             '%d/%m/%y',
             '%d/%m/%Y %H:%M:%S',
@@ -1050,7 +1068,10 @@ def unified_strdate(date_str, day_first=True):
     if upload_date is None:
         timetuple = email.utils.parsedate_tz(date_str)
         if timetuple:
-            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+            try:
+                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
+            except ValueError:
+                pass
     if upload_date is not None:
         return compat_str(upload_date)
 
@@ -1544,15 +1565,11 @@ def setproctitle(title):
 
 
 def remove_start(s, start):
-    if s.startswith(start):
-        return s[len(start):]
-    return s
+    return s[len(start):] if s is not None and s.startswith(start) else s
 
 
 def remove_end(s, end):
-    if s.endswith(end):
-        return s[:-len(end)]
-    return s
+    return s[:-len(end)] if s is not None and s.endswith(end) else s
 
 
 def remove_quotes(s):
@@ -1906,7 +1923,7 @@ def parse_age_limit(s):
 
 def strip_jsonp(code):
     return re.sub(
-        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
+        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
 
 
 def js_to_json(code):
@@ -1925,6 +1942,17 @@ def js_to_json(code):
                 '\\x': '\\u00',
             }.get(m.group(0), m.group(0)), v[1:-1])
 
+        INTEGER_TABLE = (
+            (r'^0[xX][0-9a-fA-F]+', 16),
+            (r'^0+[0-7]+', 8),
+        )
+
+        for regex, base in INTEGER_TABLE:
+            im = re.match(regex, v)
+            if im:
+                i = int(im.group(0), base)
+                return '"%d":' % i if v.endswith(':') else '%d' % i
+
         return '"%s"' % v
 
     return re.sub(r'''(?sx)
@@ -1932,6 +1960,7 @@ def js_to_json(code):
         '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
         /\*.*?\*/|,(?=\s*[\]}])|
         [a-zA-Z_][.a-zA-Z_0-9]*|
+        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
         [0-9]+(?=\s*:)
         ''', fix_kv, code)
 
@@ -1999,6 +2028,9 @@ def mimetype2ext(mt):
 
     ext = {
         'audio/mp4': 'm4a',
+        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
+        # it's the most popular one
+        'audio/mpeg': 'mp3',
     }.get(mt)
     if ext is not None:
         return ext
@@ -2019,11 +2051,7 @@ def mimetype2ext(mt):
 
 
 def urlhandle_detect_ext(url_handle):
-    try:
-        url_handle.headers
-        getheader = lambda h: url_handle.headers[h]
-    except AttributeError:  # Python < 3
-        getheader = url_handle.info().getheader
+    getheader = url_handle.headers.get
 
     cd = getheader('Content-Disposition')
     if cd: