Merge branch 'brightcove_in_page_embed' of https://github.com/remitamine/youtube...
[youtube-dl] / youtube_dl / utils.py
index 9bc292a39a5a30f5bee6d16c9ff8557eccf7dff0..65556d056a7edfe12b94e82d0f55baa93360aa70 100644 (file)
@@ -3,6 +3,7 @@
 
 from __future__ import unicode_literals
 
+import base64
 import calendar
 import codecs
 import contextlib
@@ -35,6 +36,7 @@ import zlib
 from .compat import (
     compat_basestring,
     compat_chr,
+    compat_etree_fromstring,
     compat_html_entities,
     compat_http_client,
     compat_kwargs,
@@ -177,10 +179,19 @@ def xpath_with_ns(path, ns_map):
 
 
 def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
-    if sys.version_info < (2, 7):  # Crazy 2.6
-        xpath = xpath.encode('ascii')
+    def _find_xpath(xpath):
+        if sys.version_info < (2, 7):  # Crazy 2.6
+            xpath = xpath.encode('ascii')
+        return node.find(xpath)
+
+    if isinstance(xpath, (str, compat_str)):
+        n = _find_xpath(xpath)
+    else:
+        for xp in xpath:
+            n = _find_xpath(xp)
+            if n is not None:
+                break
 
-    n = node.find(xpath)
     if n is None:
         if default is not NO_DEFAULT:
             return default
@@ -248,6 +259,15 @@ def get_element_by_attribute(attribute, value, html):
     return unescapeHTML(res)
 
 
+def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
+    attributes = re.findall(attributes_regex, attributes_str)
+    attributes_dict = {}
+    if attributes:
+        for (attribute_name, attribute_value) in attributes:
+            attributes_dict[attribute_name] = attribute_value
+    return attributes_dict
+
+
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""
 
@@ -355,7 +375,7 @@ def sanitize_path(s):
     if drive_or_unc:
         norm_path.pop(0)
     sanitized_path = [
-        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
+        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
         for path_part in norm_path]
     if drive_or_unc:
         sanitized_path.insert(0, drive_or_unc + os.path.sep)
@@ -619,7 +639,7 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
     # expected HTTP responses to meet HTTP/1.0 or later (see also
     # https://github.com/rg3/youtube-dl/issues/6727)
     if sys.version_info < (3, 0):
-        kwargs['strict'] = True
+        kwargs[b'strict'] = True
     hc = http_class(*args, **kwargs)
     source_address = ydl_handler._params.get('source_address')
     if source_address is not None:
@@ -748,7 +768,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
             gz = io.BytesIO(self.deflate(resp.read()))
             resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
             resp.msg = old_resp.msg
-        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986
+        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
+        # https://github.com/rg3/youtube-dl/issues/6457).
         if 300 <= resp.code < 400:
             location = resp.headers.get('Location')
             if location:
@@ -792,16 +813,14 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
         # https://github.com/rg3/youtube-dl/issues/6769).
         # In order to at least prevent crashing we will percent encode Set-Cookie
         # header before HTTPCookieProcessor starts processing it.
-        if sys.version_info < (3, 0) and response.headers:
-            for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
-                set_cookie = response.headers.get(set_cookie_header)
-                if set_cookie:
-                    set_cookie_escaped = '; '.join([
-                        escape_rfc3986(cookie_attr.strip())
-                        for cookie_attr in set_cookie.decode('iso-8859-1').split(';')]).encode('iso-8859-1')
-                    if set_cookie != set_cookie_escaped:
-                        del response.headers[set_cookie_header]
-                        response.headers[set_cookie_header] = set_cookie_escaped
+        # if sys.version_info < (3, 0) and response.headers:
+        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
+        #         set_cookie = response.headers.get(set_cookie_header)
+        #         if set_cookie:
+        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
+        #             if set_cookie != set_cookie_escaped:
+        #                 del response.headers[set_cookie_header]
+        #                 response.headers[set_cookie_header] = set_cookie_escaped
         return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
 
     https_request = compat_urllib_request.HTTPCookieProcessor.http_request
@@ -814,9 +833,11 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
     if date_str is None:
         return None
 
+    date_str = re.sub(r'\.[0-9]+', '', date_str)
+
     if timezone is None:
         m = re.search(
-            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
             date_str)
         if not m:
             timezone = datetime.timedelta()
@@ -829,9 +850,12 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
                 timezone = datetime.timedelta(
                     hours=sign * int(m.group('hours')),
                     minutes=sign * int(m.group('minutes')))
-    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
-    dt = datetime.datetime.strptime(date_str, date_format) - timezone
-    return calendar.timegm(dt.timetuple())
+    try:
+        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
+        dt = datetime.datetime.strptime(date_str, date_format) - timezone
+        return calendar.timegm(dt.timetuple())
+    except ValueError:
+        pass
 
 
 def unified_strdate(date_str, day_first=True):
@@ -896,7 +920,8 @@ def unified_strdate(date_str, day_first=True):
         timetuple = email.utils.parsedate_tz(date_str)
         if timetuple:
             upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
-    return upload_date
+    if upload_date is not None:
+        return compat_str(upload_date)
 
 
 def determine_ext(url, default_ext='unknown_video'):
@@ -1372,7 +1397,12 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
             v = getattr(v, get_attr, None)
     if v == '':
         v = None
-    return default if v is None else (int(v) * invscale // scale)
+    if v is None:
+        return default
+    try:
+        return int(v) * invscale // scale
+    except ValueError:
+        return default
 
 
 def str_or_none(v, default=None):
@@ -1388,7 +1418,12 @@ def str_to_int(int_str):
 
 
 def float_or_none(v, scale=1, invscale=1, default=None):
-    return default if v is None else (float(v) * invscale / scale)
+    if v is None:
+        return default
+    try:
+        return float(v) * invscale / scale
+    except ValueError:
+        return default
 
 
 def parse_duration(s):
@@ -1637,27 +1672,8 @@ def urlencode_postdata(*args, **kargs):
     return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
 
 
-try:
-    etree_iter = xml.etree.ElementTree.Element.iter
-except AttributeError:  # Python <=2.6
-    etree_iter = lambda n: n.findall('.//*')
-
-
-def parse_xml(s):
-    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
-        def doctype(self, name, pubid, system):
-            pass  # Ignore doctypes
-
-    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
-    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
-    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
-    # Fix up XML parser in Python 2.x
-    if sys.version_info < (3, 0):
-        for n in etree_iter(tree):
-            if n.text is not None:
-                if not isinstance(n.text, compat_str):
-                    n.text = n.text.decode('utf-8')
-    return tree
+def encode_dict(d, encoding='utf-8'):
+    return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
 
 
 US_RATINGS = {
@@ -1687,8 +1703,8 @@ def js_to_json(code):
         if v in ('true', 'false', 'null'):
             return v
         if v.startswith('"'):
-            return v
-        if v.startswith("'"):
+            v = re.sub(r"\\'", "'", v[1:-1])
+        elif v.startswith("'"):
             v = v[1:-1]
             v = re.sub(r"\\\\|\\'|\"", lambda m: {
                 '\\\\': '\\\\',
@@ -1782,6 +1798,10 @@ def urlhandle_detect_ext(url_handle):
     return mimetype2ext(getheader('Content-Type'))
 
 
+def encode_data_uri(data, mime_type):
+    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
+
+
 def age_restricted(content_limit, age_limit):
     """ Returns True iff the content should be blocked """
 
@@ -1956,7 +1976,7 @@ def dfxp2srt(dfxp_data):
 
         return out
 
-    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
     out = []
     paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')