Merge branch 'extract_info_rewrite'

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 8361382774c372c714d1de4cc1c839bdbbce9bf4..017f06c42e9a019e18e25480c5e5d8d3aaaef335 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -3,10 +3,12 @@
  
  import gzip
  import io
+import json
  import locale
  import os
  import re
  import sys
+import traceback
  import zlib
  import email.utils
  import json
@@ -26,6 +28,11 @@ try:
  except ImportError: # Python 2
      import urllib as compat_urllib_parse
  
+try:
+    from urllib.parse import urlparse as compat_urllib_parse_urlparse
+except ImportError: # Python 2
+    from urlparse import urlparse as compat_urllib_parse_urlparse
+
  try:
      import http.cookiejar as compat_cookiejar
  except ImportError: # Python 2
@@ -46,6 +53,12 @@ try:
  except ImportError: # Python 2
      import httplib as compat_http_client
  
+try:
+    from subprocess import DEVNULL  # use subprocess's own null-device marker when it can be imported
+    compat_subprocess_get_DEVNULL = lambda: DEVNULL
+except ImportError:
+    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')  # fallback: every call opens os.devnull anew
+
  try:
      from urllib.parse import parse_qs as compat_parse_qs
  except ImportError: # Python 2
@@ -142,6 +155,7 @@ std_headers = {
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'en-us,en;q=0.5',
  }
+
  def preferredencoding():
      """Get preferred encoding.
  
@@ -164,6 +178,17 @@ else:
          assert type(s) == type(u'')
          print(s)
  
+# In Python 2.x, json.dump expects a byte stream, so the file is opened in binary mode ('wb').
+# In Python 3.x, it writes to a character stream, so the file is opened as UTF-8 text.
+if sys.version_info < (3,0):
+    def write_json_file(obj, fn):
+        with open(fn, 'wb') as f:
+            json.dump(obj, f)
+else:
+    def write_json_file(obj, fn):
+        with open(fn, 'w', encoding='utf-8') as f:
+            json.dump(obj, f)
+
  def htmlentity_transform(matchobj):
      """Transforms an HTML entity to a character.
  
@@ -190,10 +215,11 @@ def htmlentity_transform(matchobj):
      return (u'&%s;' % entity)
  
  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class IDParser(compat_html_parser.HTMLParser):
-    """Modified HTMLParser that isolates a tag with the specified id"""
-    def __init__(self, id):
-        self.id = id
+class AttrParser(compat_html_parser.HTMLParser):
+    """Modified HTMLParser that isolates a tag with the specified attribute"""
+    def __init__(self, attribute, value):
+        self.attribute = attribute
+        self.value = value
          self.result = None
          self.started = False
          self.depth = {}
@@ -218,7 +244,7 @@ class IDParser(compat_html_parser.HTMLParser):
          attrs = dict(attrs)
          if self.started:
              self.find_startpos(None)
-        if 'id' in attrs and attrs['id'] == self.id:
+        if self.attribute in attrs and attrs[self.attribute] == self.value:
              self.result = [tag]
              self.started = True
              self.watch_startpos = True
@@ -254,10 +280,20 @@ class IDParser(compat_html_parser.HTMLParser):
              lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
          lines[-1] = lines[-1][:self.result[2][1]]
          return '\n'.join(lines).strip()
+# Hack for https://github.com/rg3/youtube-dl/issues/662: on Python < 2.7.3, skip over a literal "</scr'+'ipt>" rather than parsing it as an end tag
+if sys.version_info < (2, 7, 3):
+    AttrParser.parse_endtag = (lambda self, i:
+        i + len("</scr'+'ipt>")
+        if self.rawdata[i:].startswith("</scr'+'ipt>")
+        else compat_html_parser.HTMLParser.parse_endtag(self, i))
  
  def get_element_by_id(id, html):
-    """Return the content of the tag with the specified id in the passed HTML document"""
-    parser = IDParser(id)
+    """Return the content of the tag with the specified ID in the passed HTML document"""
+    return get_element_by_attribute("id", id, html)  # an ID lookup is an attribute lookup on "id"
+
+def get_element_by_attribute(attribute, value, html):
+    """Return the content of the tag with the specified attribute in the passed HTML document"""
+    parser = AttrParser(attribute, value)
      try:
          parser.loads(html)
      except compat_html_parser.HTMLParseError:
@@ -269,12 +305,13 @@ def clean_html(html):
      """Clean an HTML snippet into a readable string"""
      # Newline vs <br />
      html = html.replace('\n', ' ')
-    html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
      # Strip html tags
      html = re.sub('<.*?>', '', html)
      # Replace html entities
      html = unescapeHTML(html)
-    return html
+    return html.strip()
  
  
  def sanitize_open(filename, open_mode):
@@ -292,7 +329,7 @@ def sanitize_open(filename, open_mode):
              if sys.platform == 'win32':
                  import msvcrt
                  msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
-            return (sys.stdout, filename)
+            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
          stream = open(encodeFilename(filename), open_mode)
          return (stream, filename)
      except (IOError, OSError) as err:
@@ -312,9 +349,10 @@ def timeconvert(timestr):
          timestamp = email.utils.mktime_tz(timetuple)
      return timestamp
  
-def sanitize_filename(s, restricted=False):
+def sanitize_filename(s, restricted=False, is_id=False):
      """Sanitizes a string so it could be used as part of a filename.
      If restricted is set, use a stricter subset of allowed characters.
+    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
      """
      def replace_insane(char):
          if char == '?' or ord(char) < 32 or ord(char) == 127:
@@ -325,21 +363,22 @@ def sanitize_filename(s, restricted=False):
              return '_-' if restricted else ' -'
          elif char in '\\/|*<>':
              return '_'
-        if restricted and (char in '!&\'' or char.isspace()):
+        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
              return '_'
          if restricted and ord(char) > 127:
              return '_'
          return char
  
      result = u''.join(map(replace_insane, s))
-    while '__' in result:
-        result = result.replace('__', '_')
-    result = result.strip('_')
-    # Common case of "Foreign band name - English song title"
-    if restricted and result.startswith('-_'):
-        result = result[2:]
-    if not result:
-        result = '_'
+    if not is_id:
+        while '__' in result:
+            result = result.replace('__', '_')
+        result = result.strip('_')
+        # Common case of "Foreign band name - English song title"
+        if restricted and result.startswith('-_'):
+            result = result[2:]
+        if not result:
+            result = '_'
      return result
  
  def orderedSet(iterable):
@@ -376,7 +415,33 @@ def encodeFilename(s):
          # match Windows 9x series as well. Besides, NT 4 is obsolete.)
          return s
      else:
-        return s.encode(sys.getfilesystemencoding(), 'ignore')
+        encoding = sys.getfilesystemencoding()
+        if encoding is None:
+            encoding = 'utf-8'
+        return s.encode(encoding, 'ignore')
+
+def decodeOption(optval):
+    if optval is None:
+        return optval  # no value given: pass None through unchanged
+    if isinstance(optval, bytes):
+        optval = optval.decode(preferredencoding())  # byte strings become text
+
+    assert isinstance(optval, compat_str)  # anything else must already be text
+    return optval
+
+class ExtractorError(Exception):
+    """Error during info extraction."""
+    def __init__(self, msg, tb=None):
+        """ tb, if given, is the original traceback (so that it can be printed out). """
+        super(ExtractorError, self).__init__(msg)
+        self.traceback = tb
+        self.exc_info = sys.exc_info()  # whatever exception is being handled at construction time (preserves the original)
+
+    def format_traceback(self):
+        if self.traceback is None:
+            return None  # no traceback was supplied
+        return u''.join(traceback.format_tb(self.traceback))  # stack entries joined into one string
+
  
  class DownloadError(Exception):
      """Download Error exception.
@@ -385,7 +450,10 @@ class DownloadError(Exception):
      configured to continue on errors. They will contain the appropriate
      error message.
      """
-    pass
+    def __init__(self, msg, exc_info=None):
+        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
+        super(DownloadError, self).__init__(msg)
+        self.exc_info = exc_info
  
  
  class SameFileError(Exception):
@@ -403,7 +471,9 @@ class PostProcessingError(Exception):
      This exception may be raised by PostProcessor's .run() method to
      indicate an error in the postprocessing task.
      """
-    pass
+    def __init__(self, msg):
+        super(PostProcessingError, self).__init__(msg)  # so str(err) and err.args carry msg on Python 2 as well
+        self.msg = msg
  
  class MaxDownloadsReached(Exception):
      """ --max-downloads limit has been reached. """
@@ -434,14 +503,6 @@ class ContentTooShortError(Exception):
          self.downloaded = downloaded
          self.expected = expected
  
-
-class Trouble(Exception):
-    """Trouble helper exception
-
-    This is an exception to be handled with
-    FileDownloader.trouble
-    """
-
  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      """Handler for HTTP requests and responses.
  
@@ -476,14 +537,19 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
          return ret
  
      def http_request(self, req):
-        for h in std_headers:
+        for h,v in std_headers.items():
              if h in req.headers:
                  del req.headers[h]
-            req.add_header(h, std_headers[h])
+            req.add_header(h, v)  # std_headers override whatever the request already set
          if 'Youtubedl-no-compression' in req.headers:
              if 'Accept-encoding' in req.headers:
                  del req.headers['Accept-encoding']
              del req.headers['Youtubedl-no-compression']
+        if 'Youtubedl-user-agent' in req.headers:  # pseudo-header: its value replaces User-agent
+            if 'User-agent' in req.headers:
+                del req.headers['User-agent']
+            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
+            del req.headers['Youtubedl-user-agent']  # drop the pseudo-header itself
          return req
  
      def http_response(self, req, resp):
@@ -499,3 +565,6 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
              resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
              resp.msg = old_resp.msg
          return resp
+
+    https_request = http_request  # HTTPS requests get the same treatment as HTTP ones
+    https_response = http_response  # likewise for responses