[extractor/common] Extract more metadata for VideoObject in _json_ld
[youtube-dl] / youtube_dl / extractor / common.py
index 4eda4e2ea6e378eaeee997c5586931bb649cb7dd..9ffe64d05533ae5c5ac2a8d21bb1223b388e9789 100644 (file)
@@ -44,6 +44,7 @@ from ..utils import (
     sanitized_Request,
     unescapeHTML,
     unified_strdate,
+    unified_timestamp,
     url_basename,
     xpath_element,
     xpath_text,
@@ -161,6 +162,7 @@ class InfoExtractor(object):
                         * "height" (optional, int)
                         * "resolution" (optional, string "{width}x{height"},
                                         deprecated)
+                        * "filesize" (optional, int)
     thumbnail:      Full URL to a video thumbnail image.
     description:    Full video description.
     uploader:       Full name of the video uploader.
@@ -837,6 +839,19 @@ class InfoExtractor(object):
                     'title': unescapeHTML(json_ld.get('headline')),
                     'description': unescapeHTML(json_ld.get('articleBody')),
                 })
+            elif item_type == 'VideoObject':
+                info.update({
+                    'url': json_ld.get('contentUrl'),
+                    'title': unescapeHTML(json_ld.get('name')),
+                    'description': unescapeHTML(json_ld.get('description')),
+                    'thumbnail': json_ld.get('thumbnailUrl'),
+                    'duration': parse_duration(json_ld.get('duration')),
+                    'timestamp': unified_timestamp(json_ld.get('uploadDate')),
+                    'filesize': float_or_none(json_ld.get('contentSize')),
+                    'tbr': int_or_none(json_ld.get('bitrate')),
+                    'width': int_or_none(json_ld.get('width')),
+                    'height': int_or_none(json_ld.get('height')),
+                })
         return dict((k, v) for k, v in info.items() if v is not None)
 
     @staticmethod
@@ -878,7 +893,11 @@ class InfoExtractor(object):
                 f['ext'] = determine_ext(f['url'])
 
             if isinstance(field_preference, (list, tuple)):
-                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)
+                return tuple(
+                    f.get(field)
+                    if f.get(field) is not None
+                    else ('' if field == 'format_id' else -1)
+                    for field in field_preference)
 
             preference = f.get('preference')
             if preference is None:
@@ -1725,6 +1744,13 @@ class InfoExtractor(object):
     def _mark_watched(self, *args, **kwargs):
         raise NotImplementedError('This method must be implemented by subclasses')
 
+    def geo_verification_headers(self):
+        headers = {}
+        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
+        if geo_verification_proxy:
+            headers['Ytdl-request-proxy'] = geo_verification_proxy
+        return headers
+
 
 class SearchInfoExtractor(InfoExtractor):
     """