Merge branch 'master' of https://github.com/rg3/youtube-dl

[youtube-dl] / youtube_dl / utils.py
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 4e64f327a67338a7cc9bb53ed347945df3f04b34..8f856ee8c073dd9095f2b62b1567eabdb321c117 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -8,6 +8,7 @@ import locale
  import os
  import re
  import sys
+import traceback
  import zlib
  import email.utils
  import json
@@ -154,6 +155,7 @@ std_headers = {
      'Accept-Encoding': 'gzip, deflate',
      'Accept-Language': 'en-us,en;q=0.5',
  }
+
  def preferredencoding():
      """Get preferred encoding.
  
@@ -187,7 +189,6 @@ else:
          with open(fn, 'w', encoding='utf-8') as f:
              json.dump(obj, f)
  
-
  def htmlentity_transform(matchobj):
      """Transforms an HTML entity to a character.
  
@@ -214,10 +215,11 @@ def htmlentity_transform(matchobj):
      return (u'&%s;' % entity)
  
  compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
-class IDParser(compat_html_parser.HTMLParser):
-    """Modified HTMLParser that isolates a tag with the specified id"""
-    def __init__(self, id):
-        self.id = id
+class AttrParser(compat_html_parser.HTMLParser):
+    """Modified HTMLParser that isolates a tag with the specified attribute"""
+    def __init__(self, attribute, value):
+        self.attribute = attribute
+        self.value = value
          self.result = None
          self.started = False
          self.depth = {}
@@ -242,7 +244,7 @@ class IDParser(compat_html_parser.HTMLParser):
          attrs = dict(attrs)
          if self.started:
              self.find_startpos(None)
-        if 'id' in attrs and attrs['id'] == self.id:
+        if self.attribute in attrs and attrs[self.attribute] == self.value:
              self.result = [tag]
              self.started = True
              self.watch_startpos = True
@@ -280,8 +282,12 @@ class IDParser(compat_html_parser.HTMLParser):
          return '\n'.join(lines).strip()
  
  def get_element_by_id(id, html):
-    """Return the content of the tag with the specified id in the passed HTML document"""
-    parser = IDParser(id)
+    """Return the content of the tag with the specified ID in the passed HTML document"""
+    return get_element_by_attribute("id", id, html)
+
+def get_element_by_attribute(attribute, value, html):
+    """Return the content of the tag with the specified attribute in the passed HTML document"""
+    parser = AttrParser(attribute, value)
      try:
          parser.loads(html)
      except compat_html_parser.HTMLParseError:
@@ -293,7 +299,8 @@ def clean_html(html):
      """Clean an HTML snippet into a readable string"""
      # Newline vs <br />
      html = html.replace('\n', ' ')
-    html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
      # Strip html tags
      html = re.sub('<.*?>', '', html)
      # Replace html entities
@@ -404,6 +411,20 @@ def encodeFilename(s):
      else:
          return s.encode(sys.getfilesystemencoding(), 'ignore')
  
+
+class ExtractorError(Exception):
+    """Error during info extraction."""
+    def __init__(self, msg, tb=None):
+        """ tb, if given, is the original traceback (so that it can be printed out). """
+        super(ExtractorError, self).__init__(msg)  # msg becomes the exception's message
+        self.traceback = tb  # kept so format_traceback() can render it later
+
+    def format_traceback(self):
+        if self.traceback is None:  # no traceback was supplied to the constructor
+            return None
+        return u''.join(traceback.format_tb(self.traceback))  # format_tb gives a list of strings; join into one
+
+
  class DownloadError(Exception):
      """Download Error exception.
  
@@ -460,14 +481,6 @@ class ContentTooShortError(Exception):
          self.downloaded = downloaded
          self.expected = expected
  
-
-class Trouble(Exception):
-    """Trouble helper exception
-
-    This is an exception to be handled with
-    FileDownloader.trouble
-    """
-
  class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
      """Handler for HTTP requests and responses.