Use a dictionary for storing the subtitles

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 0f6049cb43306537be089d27404ccb05d69d10a9..e2e192beff593b4768c626e6df94e61a20dbf04e 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -14,6 +14,7 @@ from ..utils import (
      clean_html,
      compiled_regex_type,
      ExtractorError,
+    unescapeHTML,
  )
  
  class InfoExtractor(object):
@@ -37,6 +38,8 @@ class InfoExtractor(object):
      The following fields are optional:
  
      format:         The video format, defaults to ext (used for --get-format)
+    thumbnails:     A list of dictionaries (with the entries "resolution" and
+                    "url") for the varying thumbnails
      thumbnail:      Full URL to a video thumbnail image.
      description:    One-line video description.
      uploader:       Full name of the video uploader.
@@ -44,7 +47,8 @@ class InfoExtractor(object):
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      player_url:     SWF Player URL (used for rtmpdump).
-    subtitles:      The subtitle file contents.
+    subtitles:      The subtitle file contents as a dictionary in the format
+                    {language: subtitles}.
      view_count:     How many users have watched the video on the platform.
      urlhandle:      [internal] The urlHandle to be used to download the file,
                      like returned by urllib.request.urlopen
@@ -123,6 +127,11 @@ class InfoExtractor(object):
  
      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
          """ Returns a tuple (page content as string, URL handle) """
+
+        # Strip hashes from the URL (#1038)
+        if isinstance(url_or_request, (compat_str, str)):
+            url_or_request = url_or_request.partition('#')[0]
+
          urlh = self._request_webpage(url_or_request, video_id, note, errnote)
          content_type = urlh.headers.get('Content-Type', '')
          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -167,11 +176,6 @@ class InfoExtractor(object):
          self.to_screen(u'Logging in')
  
      #Methods for following #608
-    #They set the correct value of the '_type' key
-    def video_result(self, video_info):
-        """Returns a video"""
-        video_info['_type'] = 'video'
-        return video_info
      def url_result(self, url, ie=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
@@ -260,6 +264,31 @@ class InfoExtractor(object):
          
          return (username, password)
  
+    # Helper functions for extracting OpenGraph info
+    @staticmethod
+    def _og_regex(prop):
+        """Build the regex source used to find the OpenGraph property `prop`.
+
+        The pattern matches '<meta', then a property attribute equal to
+        "og:<prop>" and a later content attribute. The content value is
+        captured in group 1 when double-quoted and in group 2 when
+        single-quoted.
+        """
+        # Escape the name so that characters like ':' are matched literally
+        escaped_prop = re.escape(prop)
+        pattern = r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')'
+        return pattern % escaped_prop
+
+    def _og_search_property(self, prop, html, name=None, **kargs):
+        """Search `html` for the OpenGraph property `prop` and return its
+        content with HTML entities unescaped.
+
+        prop:   property name without the "og:" prefix
+        html:   the text to search in
+        name:   label handed to _search_regex; defaults to
+                'OpenGraph <prop>'
+        kargs:  extra keyword arguments forwarded to _search_regex
+
+        Returns None when _search_regex does not yield a value.
+        """
+        if name is None:
+            name = 'OpenGraph %s' % prop
+        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+        # Callers in this class pass fatal=False; presumably _search_regex then
+        # returns None on a miss (confirm), and None must not reach unescapeHTML.
+        if escaped is None:
+            return None
+        return unescapeHTML(escaped)
+
+    def _og_search_thumbnail(self, html, **kargs):
+        """Look up the OpenGraph 'image' property in `html`.
+
+        The lookup is done with fatal=False; extra keyword arguments are
+        forwarded to _og_search_property.
+        """
+        thumbnail_url = self._og_search_property(
+            'image', html, u'thumbnail url', fatal=False, **kargs)
+        return thumbnail_url
+
+    def _og_search_description(self, html, **kargs):
+        """Look up the OpenGraph 'description' property in `html`.
+
+        The lookup is done with fatal=False; extra keyword arguments are
+        forwarded to _og_search_property.
+        """
+        description = self._og_search_property(
+            'description', html, fatal=False, **kargs)
+        return description
+
+    def _og_search_title(self, html, **kargs):
+        """Look up the OpenGraph 'title' property in `html`.
+
+        Extra keyword arguments are forwarded to _og_search_property.
+        """
+        title = self._og_search_property('title', html, **kargs)
+        return title
+
+    def _og_search_video_url(self, html, name='video url', **kargs):
+        """Look up the OpenGraph video URL in `html` via _html_search_regex.
+
+        Two patterns are supplied: 'video:secure_url' first, then 'video'.
+        Extra keyword arguments are forwarded to _html_search_regex.
+        """
+        # Keep the secure URL pattern ahead of the plain one
+        og_props = ('video:secure_url', 'video')
+        patterns = [self._og_regex(og_prop) for og_prop in og_props]
+        return self._html_search_regex(patterns, html, name, **kargs)
+
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.