[extractor/common] Add support for AMP tags in _parse_html5_media_entries
[youtube-dl] / youtube_dl / extractor / generic.py
index 2792ea3cfae43ad4a9c5d39c4fa01166f57c9692..919f4f987eb6f5af3963656329ca9fc1c540d9a8 100644 (file)
@@ -1185,7 +1185,7 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': ['Kaltura'],
         },
-        # Eagle.Platform embed (generic URL)
+        # EaglePlatform embed (generic URL)
         {
             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
             # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1199,8 +1199,26 @@ class GenericIE(InfoExtractor):
                 'view_count': int,
                 'age_limit': 0,
             },
+            'params': {
+                'skip_download': True,
+            },
         },
-        # ClipYou (Eagle.Platform) embed (custom URL)
+        # referrer protected EaglePlatform embed
+        {
+            'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+            'info_dict': {
+                'id': '582306',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+                'thumbnail': r're:^https?://.*\.jpg$',
+                'duration': 3382,
+                'view_count': int,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        # ClipYou (EaglePlatform) embed (custom URL)
         {
             'url': 'http://muz-tv.ru/play/7129/',
             # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1212,6 +1230,9 @@ class GenericIE(InfoExtractor):
                 'duration': 216,
                 'view_count': int,
             },
+            'params': {
+                'skip_download': True,
+            },
         },
         # Pladform embed
         {
@@ -1749,6 +1770,16 @@ class GenericIE(InfoExtractor):
             },
             'add_ie': [MediasetIE.ie_key()],
         },
+        {
+            # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
+            'url': 'https://tvrain.ru/amp/418921/',
+            'md5': 'cc00413936695987e8de148b67d14f1d',
+            'info_dict': {
+                'id': '418921',
+                'ext': 'mp4',
+                'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+            },
+        },
         # {
         #     # TODO: find another test
         #     # http://schema.org/VideoObject
@@ -2048,6 +2079,13 @@ class GenericIE(InfoExtractor):
         video_description = self._og_search_description(webpage, default=None)
         video_thumbnail = self._og_search_thumbnail(webpage, default=None)
 
+        info_dict.update({
+            'title': video_title,
+            'description': video_description,
+            'thumbnail': video_thumbnail,
+            'age_limit': age_limit,
+        })
+
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
         if bc_urls:
@@ -2436,12 +2474,12 @@ class GenericIE(InfoExtractor):
         if kaltura_url:
             return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
 
-        # Look for Eagle.Platform embeds
+        # Look for EaglePlatform embeds
         eagleplatform_url = EaglePlatformIE._extract_url(webpage)
         if eagleplatform_url:
-            return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
+            return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
 
-        # Look for ClipYou (uses Eagle.Platform) embeds
+        # Look for ClipYou (uses EaglePlatform) embeds
         mobj = re.search(
             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
         if mobj is not None:
@@ -2684,18 +2722,26 @@ class GenericIE(InfoExtractor):
             return self.playlist_from_matches(
                 mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
 
+        def merge_dicts(dict1, dict2):
+            merged = {}
+            for k, v in dict1.items():
+                if v is not None:
+                    merged[k] = v
+            for k, v in dict2.items():
+                if v is None:
+                    continue
+                if (k not in merged or
+                        (isinstance(v, compat_str) and v and
+                            isinstance(merged[k], compat_str) and
+                            not merged[k])):
+                    merged[k] = v
+            return merged
+
         # Looking for http://schema.org/VideoObject
         json_ld = self._search_json_ld(
             webpage, video_id, default={}, expected_type='VideoObject')
         if json_ld.get('url'):
-            info_dict.update({
-                'title': video_title or info_dict['title'],
-                'description': video_description,
-                'thumbnail': video_thumbnail,
-                'age_limit': age_limit
-            })
-            info_dict.update(json_ld)
-            return info_dict
+            return merge_dicts(json_ld, info_dict)
 
         # Look for HTML5 media
         entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
@@ -2713,9 +2759,7 @@ class GenericIE(InfoExtractor):
         if jwplayer_data:
             info = self._parse_jwplayer_data(
                 jwplayer_data, video_id, require_title=False, base_url=url)
-            if not info.get('title'):
-                info['title'] = video_title
-            return info
+            return merge_dicts(info, info_dict)
 
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):