Use a dictionary for storing the subtitles
[youtube-dl] / youtube_dl / extractor / common.py
index 0f6049cb43306537be089d27404ccb05d69d10a9..e2e192beff593b4768c626e6df94e61a20dbf04e 100644 (file)
@@ -14,6 +14,7 @@ from ..utils import (
     clean_html,
     compiled_regex_type,
     ExtractorError,
+    unescapeHTML,
 )
 
 class InfoExtractor(object):
@@ -37,6 +38,8 @@ class InfoExtractor(object):
     The following fields are optional:
 
     format:         The video format, defaults to ext (used for --get-format)
+    thumbnails:     A list of dictionaries (with the entries "resolution" and
+                    "url") for the varying thumbnails
     thumbnail:      Full URL to a video thumbnail image.
     description:    One-line video description.
     uploader:       Full name of the video uploader.
@@ -44,7 +47,8 @@ class InfoExtractor(object):
     uploader_id:    Nickname or id of the video uploader.
     location:       Physical location of the video.
     player_url:     SWF Player URL (used for rtmpdump).
-    subtitles:      The subtitle file contents.
+    subtitles:      The subtitle file contents as a dictionary in the format
+                    {language: subtitles}.
     view_count:     How many users have watched the video on the platform.
     urlhandle:      [internal] The urlHandle to be used to download the file,
                     like returned by urllib.request.urlopen
@@ -123,6 +127,11 @@ class InfoExtractor(object):
 
     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
         """ Returns a tuple (page content as string, URL handle) """
+
+        # Strip hashes from the URL (#1038)
+        if isinstance(url_or_request, (compat_str, str)):
+            url_or_request = url_or_request.partition('#')[0]
+
         urlh = self._request_webpage(url_or_request, video_id, note, errnote)
         content_type = urlh.headers.get('Content-Type', '')
         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@ -167,11 +176,6 @@ class InfoExtractor(object):
         self.to_screen(u'Logging in')
 
     #Methods for following #608
-    #They set the correct value of the '_type' key
-    def video_result(self, video_info):
-        """Returns a video"""
-        video_info['_type'] = 'video'
-        return video_info
     def url_result(self, url, ie=None):
         """Returns a url that points to a page that should be processed"""
         #TODO: ie should be the class used for getting the info
@@ -260,6 +264,31 @@ class InfoExtractor(object):
         
         return (username, password)
 
+    # Helper functions for extracting OpenGraph info
+    @staticmethod
+    def _og_regex(prop):
+        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+
+    def _og_search_property(self, prop, html, name=None, **kargs):
+        if name is None:
+            name = 'OpenGraph %s' % prop
+        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+        return unescapeHTML(escaped)
+
+    def _og_search_thumbnail(self, html, **kargs):
+        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+
+    def _og_search_description(self, html, **kargs):
+        return self._og_search_property('description', html, fatal=False, **kargs)
+
+    def _og_search_title(self, html, **kargs):
+        return self._og_search_property('title', html, **kargs)
+
+    def _og_search_video_url(self, html, name='video url', **kargs):
+        return self._html_search_regex([self._og_regex('video:secure_url'),
+                                        self._og_regex('video')],
+                                       html, name, **kargs)
+
 class SearchInfoExtractor(InfoExtractor):
     """
     Base class for paged search queries extractors.