[metacafe] fix info extraction(closes #8539)(closes #3253)

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 6ab1747b3cc058f1e3e22a3b5c5b8967488d5090..d302f39e471f2273e8aadfff93069560c7c8adf8 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -39,6 +39,7 @@ from .compat import (
      compat_chr,
      compat_etree_fromstring,
      compat_html_entities,
+    compat_html_entities_html5,
      compat_http_client,
      compat_kwargs,
      compat_parse_qs,
@@ -75,7 +76,7 @@ def register_socks_protocols():
  compiled_regex_type = type(re.compile(''))
  
  std_headers = {
-    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
+    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
      'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
      'Accept-Encoding': 'gzip, deflate',
@@ -105,9 +106,52 @@ KNOWN_EXTENSIONS = (
      'f4f', 'f4m', 'm3u8', 'smil')
  
  # needed for sanitizing filenames in restricted mode
-ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØŒÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøœùúûüýþÿ',
-                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOO', ['OE'], 'UUUUYP', ['ss'],
-                                        'aaaaaa', ['ae'], 'ceeeeiiiionoooooo', ['oe'], 'uuuuypy')))
+ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
+                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
+                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
+
+# strptime() patterns shared by both day-first and month-first parsing.
+# Callers try the patterns in order, so the order of entries matters.
+DATE_FORMATS = (
+    '%d %B %Y',
+    '%d %b %Y',
+    '%B %d %Y',
+    '%b %d %Y',
+    '%b %dst %Y %I:%M',
+    '%b %dnd %Y %I:%M',
+    '%b %dth %Y %I:%M',
+    '%Y %m %d',
+    '%Y-%m-%d',
+    '%Y/%m/%d',
+    '%Y/%m/%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S',
+    '%Y-%m-%d %H:%M:%S.%f',
+    '%d.%m.%Y %H:%M',
+    '%d.%m.%Y %H.%M',
+    '%Y-%m-%dT%H:%M:%SZ',
+    '%Y-%m-%dT%H:%M:%S.%fZ',
+    '%Y-%m-%dT%H:%M:%S.%f0Z',
+    '%Y-%m-%dT%H:%M:%S',
+    '%Y-%m-%dT%H:%M:%S.%f',
+    '%Y-%m-%dT%H:%M',
+)
+
+# Shared patterns followed by numeric patterns that read the first field as
+# the day of the month.
+DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_DAY_FIRST.extend([
+    '%d-%m-%Y',
+    '%d.%m.%Y',
+    '%d.%m.%y',
+    '%d/%m/%Y',
+    '%d/%m/%y',
+    '%d/%m/%Y %H:%M:%S',
+])
+
+# Shared patterns followed by numeric patterns that read the first field as
+# the month.
+DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
+DATE_FORMATS_MONTH_FIRST.extend([
+    '%m-%d-%Y',
+    '%m.%d.%Y',
+    '%m/%d/%Y',
+    '%m/%d/%y',
+    '%m/%d/%Y %H:%M:%S',
+])
  
  
  def preferredencoding():
@@ -456,12 +500,19 @@ def orderedSet(iterable):
      return res
  
  
-def _htmlentity_transform(entity):
+def _htmlentity_transform(entity_with_semicolon):
      """Transforms an HTML entity to a character."""
+    entity = entity_with_semicolon[:-1]
+
      # Known non-numeric HTML entity
      if entity in compat_html_entities.name2codepoint:
          return compat_chr(compat_html_entities.name2codepoint[entity])
  
+    # TODO: HTML5 allows entities without a semicolon. For example,
+    # '&Eacuteric' should be decoded as 'Éric'.
+    if entity_with_semicolon in compat_html_entities_html5:
+        return compat_html_entities_html5[entity_with_semicolon]
+
      mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
      if mobj is not None:
          numstr = mobj.group(1)
@@ -486,7 +537,7 @@ def unescapeHTML(s):
      assert type(s) == compat_str
  
      return re.sub(
-        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
+        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
  
  
  def get_subprocess_encoding():
@@ -866,6 +917,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
                  location_escaped = escape_url(location)
                  if location != location_escaped:
                      del resp.headers['Location']
+                    if sys.version_info < (3, 0):
+                        location_escaped = location_escaped.encode('utf-8')
                      resp.headers['Location'] = location_escaped
          return resp
  
@@ -965,6 +1018,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
      https_response = http_response
  
  
+def extract_timezone(date_str):
+    """Split a trailing 'Z' or numeric UTC offset off a date string.
+
+    Returns a (timezone, date_str) tuple: the offset as a datetime.timedelta
+    (zero when there is no suffix or the suffix is 'Z') and the date string
+    with the matched suffix removed. The suffix is only recognised when at
+    least eight characters precede it.
+    """
+    match = re.search(
+        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
+        date_str)
+    if match is None:
+        return datetime.timedelta(), date_str
+
+    stripped = date_str[:-len(match.group('tz'))]
+    if not match.group('sign'):
+        # Plain 'Z' designator: UTC, nothing to shift.
+        return datetime.timedelta(), stripped
+
+    direction = -1 if match.group('sign') == '-' else 1
+    offset = datetime.timedelta(
+        hours=direction * int(match.group('hours')),
+        minutes=direction * int(match.group('minutes')))
+    return offset, stripped
+
+
  def parse_iso8601(date_str, delimiter='T', timezone=None):
      """ Return a UNIX timestamp from the given date """
  
@@ -974,20 +1045,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
      date_str = re.sub(r'\.[0-9]+', '', date_str)
  
      if timezone is None:
-        m = re.search(
-            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
-            date_str)
-        if not m:
-            timezone = datetime.timedelta()
-        else:
-            date_str = date_str[:-len(m.group(0))]
-            if not m.group('sign'):
-                timezone = datetime.timedelta()
-            else:
-                sign = 1 if m.group('sign') == '+' else -1
-                timezone = datetime.timedelta(
-                    hours=sign * int(m.group('hours')),
-                    minutes=sign * int(m.group('minutes')))
+        timezone, date_str = extract_timezone(date_str)
+
      try:
          date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
          dt = datetime.datetime.strptime(date_str, date_format) - timezone
@@ -996,6 +1055,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None):
          pass
  
  
+def date_formats(day_first=True):
+    """Return the strptime() pattern list matching the requested field order."""
+    if day_first:
+        return DATE_FORMATS_DAY_FIRST
+    return DATE_FORMATS_MONTH_FIRST
+
+
  def unified_strdate(date_str, day_first=True):
      """Return a string with the date in the format YYYYMMDD"""
  
@@ -1004,53 +1067,11 @@ def unified_strdate(date_str, day_first=True):
      upload_date = None
      # Replace commas
      date_str = date_str.replace(',', ' ')
-    # %z (UTC offset) is only supported in python>=3.2
-    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
-        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
      # Remove AM/PM + timezone
      date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+    _, date_str = extract_timezone(date_str)
  
-    format_expressions = [
-        '%d %B %Y',
-        '%d %b %Y',
-        '%B %d %Y',
-        '%b %d %Y',
-        '%b %dst %Y %I:%M',
-        '%b %dnd %Y %I:%M',
-        '%b %dth %Y %I:%M',
-        '%Y %m %d',
-        '%Y-%m-%d',
-        '%Y/%m/%d',
-        '%Y/%m/%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S',
-        '%Y-%m-%d %H:%M:%S.%f',
-        '%d.%m.%Y %H:%M',
-        '%d.%m.%Y %H.%M',
-        '%Y-%m-%dT%H:%M:%SZ',
-        '%Y-%m-%dT%H:%M:%S.%fZ',
-        '%Y-%m-%dT%H:%M:%S.%f0Z',
-        '%Y-%m-%dT%H:%M:%S',
-        '%Y-%m-%dT%H:%M:%S.%f',
-        '%Y-%m-%dT%H:%M',
-    ]
-    if day_first:
-        format_expressions.extend([
-            '%d-%m-%Y',
-            '%d.%m.%Y',
-            '%d.%m.%y',
-            '%d/%m/%Y',
-            '%d/%m/%y',
-            '%d/%m/%Y %H:%M:%S',
-        ])
-    else:
-        format_expressions.extend([
-            '%m-%d-%Y',
-            '%m.%d.%Y',
-            '%m/%d/%Y',
-            '%m/%d/%y',
-            '%m/%d/%Y %H:%M:%S',
-        ])
-    for expression in format_expressions:
+    for expression in date_formats(day_first):
          try:
              upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
          except ValueError:
@@ -1066,6 +1087,29 @@ def unified_strdate(date_str, day_first=True):
          return compat_str(upload_date)
  
  
+def unified_timestamp(date_str, day_first=True):
+    """Return a UNIX timestamp parsed from a free-form date string.
+
+    A trailing 'Z' or numeric UTC offset is applied, and a PM marker shifts
+    the parsed time into the afternoon. The string is tried against every
+    pattern from date_formats(day_first) and finally against
+    email.utils.parsedate_tz(). Returns None when date_str is None or when
+    nothing can parse it.
+    """
+    if date_str is None:
+        return None
+
+    date_str = date_str.replace(',', ' ')
+
+    is_pm = re.search(r'(?i)PM', date_str) is not None
+    timezone, date_str = extract_timezone(date_str)
+
+    # Remove AM/PM + timezone
+    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+
+    def to_timestamp(local_dt, utc_offset):
+        # Only hours 0-11 are shifted: '12:30 PM' and 24-hour times that
+        # carry a PM marker must not roll over into the next day.
+        if is_pm and local_dt.hour < 12:
+            local_dt += datetime.timedelta(hours=12)
+        return calendar.timegm((local_dt - utc_offset).timetuple())
+
+    for expression in date_formats(day_first):
+        try:
+            parsed = datetime.datetime.strptime(date_str, expression)
+        except ValueError:
+            continue
+        return to_timestamp(parsed, timezone)
+
+    # parsedate_tz() returns a plain 10-tuple (or None), not a datetime, so
+    # it has no timetuple() method. Index 9 holds the UTC offset in seconds
+    # when the string itself named a zone.
+    timetuple = email.utils.parsedate_tz(date_str)
+    if timetuple:
+        if timetuple[9]:
+            timezone += datetime.timedelta(seconds=timetuple[9])
+        try:
+            parsed = datetime.datetime(*timetuple[:6])
+        except ValueError:
+            # parsedate_tz() does not range-check the fields it extracts.
+            return None
+        return to_timestamp(parsed, timezone)
+    return None
+
+
  def determine_ext(url, default_ext='unknown_video'):
      if url is None:
          return default_ext
@@ -1400,6 +1444,8 @@ def shell_quote(args):
  def smuggle_url(url, data):
      """ Pass additional data in a URL for internal use. """
  
+    url, idata = unsmuggle_url(url, {})
+    data.update(idata)
      sdata = compat_urllib_parse_urlencode(
          {'__youtubedl_smuggle': json.dumps(data)})
      return url + '#' + sdata
@@ -1581,6 +1627,11 @@ class HEADRequest(compat_urllib_request.Request):
          return 'HEAD'
  
  
+class PUTRequest(compat_urllib_request.Request):
+    """Request subclass whose get_method() always reports 'PUT'."""
+    def get_method(self):
+        return 'PUT'
+
+
  def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
      if get_attr:
          if v is not None:
@@ -1616,6 +1667,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
          return default
  
  
+def strip_or_none(v):
+    """Return v.strip(), passing None through unchanged."""
+    if v is None:
+        return None
+    return v.strip()
+
+
  def parse_duration(s):
      if not isinstance(s, compat_basestring):
          return None
@@ -1872,7 +1927,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
      req_headers.update(headers)
      req_data = data or req.data
      req_url = update_url_query(url or req.get_full_url(), query)
-    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
+    req_get_method = req.get_method()
+    if req_get_method == 'HEAD':
+        req_type = HEADRequest
+    elif req_get_method == 'PUT':
+        req_type = PUTRequest
+    else:
+        req_type = compat_urllib_request.Request
      new_req = req_type(
          req_url, data=req_data, headers=req_headers,
          origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
@@ -1891,6 +1952,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
      return d.get(key_or_keys, default)
  
  
+def try_get(src, getter, expected_type=None):
+    """Return getter(src), or None if the lookup fails.
+
+    None is returned when the getter raises AttributeError, KeyError,
+    TypeError or IndexError, or when expected_type is given and the value is
+    not an instance of it.
+    """
+    try:
+        value = getter(src)
+    except (AttributeError, KeyError, TypeError, IndexError):
+        return None
+    if expected_type is not None and not isinstance(value, expected_type):
+        return None
+    return value
+
+
  def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
      return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
  
@@ -1950,7 +2021,7 @@ def js_to_json(code):
          '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
          /\*.*?\*/|,(?=\s*[\]}])|
          [a-zA-Z_][.a-zA-Z_0-9]*|
-        (?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
+        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
          [0-9]+(?=\s*:)
          ''', fix_kv, code)
  
@@ -2018,6 +2089,9 @@ def mimetype2ext(mt):
  
      ext = {
          'audio/mp4': 'm4a',
+        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
+        # it's the most popular one
+        'audio/mpeg': 'mp3',
      }.get(mt)
      if ext is not None:
          return ext
@@ -2829,3 +2903,16 @@ def decode_packed_codes(code):
      return re.sub(
          r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
          obfucasted_code)
+
+
+def parse_m3u8_attributes(attrib):
+    """Parse a comma-separated KEY=value attribute list into a dict.
+
+    Values may be bare or double-quoted; surrounding quotes are removed.
+    When a key occurs more than once, the last occurrence wins.
+    """
+    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
+    return dict(
+        (name, raw[1:-1] if raw.startswith('"') else raw)
+        for name, raw in pairs)
+
+
+def urshift(val, n):
+    """Shift val right by n bits, adding 2**32 to negative values first."""
+    if val < 0:
+        val += 0x100000000
+    return val >> n