X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=b66e623f84c803130d371397a44b9b40d3e6b5cc;hb=ea69624992c711b40781fa573a71fabbbf488ae1;hp=8c2ff39d58200671bb7b374722823b5adf790f87;hpb=2583c0b54e56f6dbce85a079d91a05e9b13c2dce;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8c2ff39d5..b66e623f8 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -22,6 +22,8 @@ from ..utils import ( HEADRequest, is_html, js_to_json, + KNOWN_EXTENSIONS, + mimetype2ext, orderedSet, sanitized_Request, smuggle_url, @@ -36,6 +38,10 @@ from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, ) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -53,10 +59,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .vimeo import VimeoIE -from .dailymotion import ( - DailymotionIE, - DailymotionCloudIE, -) +from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE @@ -93,6 +96,12 @@ from .washingtonpost import WashingtonPostIE from .wistia import WistiaIE from .mediaset import MediasetIE from .joj import JojIE +from .megaphone import MegaphoneIE +from .vzaar import VzaarIE +from .channel9 import Channel9IE +from .vshare import VShareIE +from .mediasite import MediasiteIE +from .springboardplatform import SpringboardPlatformIE class GenericIE(InfoExtractor): @@ -570,6 +579,19 @@ class GenericIE(InfoExtractor): }, 'skip': 'movie expired', }, + # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js + { + 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/', + 'info_dict': { + 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2', + 'ext': 'mp4', + 'title': 'Steampunk Fest Comes to Honesdale', + 'duration': 43.276, + }, + 'params': { + 'skip_download': True, + } + }, # embed.ly video { 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/', @@ -1069,23 +1091,24 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20150212', 'uploader': 'The National Archives UK', - 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6', + 'description': 'md5:8078af856dca76edc42910b61273dbbf', 'uploader_id': 'NationalArchives08', 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', }, }, # jwplayer rtmp { - 'url': 'http://www.suffolk.edu/sjc/', + 'url': 'http://www.suffolk.edu/sjc/live.php', 'info_dict': { - 'id': 'sjclive', + 'id': 'live', 'ext': 'flv', 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', 'uploader': 'www.suffolk.edu', }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', }, # Complex jwplayer { @@ -1094,6 +1117,7 @@ class GenericIE(InfoExtractor): 'id': 'videos', 'ext': 'mp4', 'title': 'king machine trailer 1', + 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', 'thumbnail': r're:^https?://.*\.jpg$', }, }, @@ -1111,13 +1135,55 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + { + # JWPlatform iframe + 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', + 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', + 'info_dict': { + 'id': 'O0c5JcKT', + 'ext': 'mp4', + 'upload_date': '20171122', + 'timestamp': 1511366290, + 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, + { + # Video.js embed, multiple formats + 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', + 'info_dict': { + 'id': 'yygqldloqIk', + 'ext': 'mp4', + 'title': 'SolidWorks. Урок 6 Настройка чертежа', + 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', + 'upload_date': '20130314', + 'uploader': 'PROстое3D', + 'uploader_id': 'PROstoe3D', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Video.js embed, single format + 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', + 'info_dict': { + 'id': 'watch', + 'ext': 'mp4', + 'title': 'Step 1 - Good Foundation', + 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', + }, + 'params': { + 'skip_download': True, + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', 'playlist_mincount': 5, 'info_dict': { 'id': 'aanslagen-kopenhagen', - 'title': 'Aanslagen Kopenhagen | RTL Nieuws', + 'title': 'Aanslagen Kopenhagen', } }, # Zapiks embed @@ -1249,6 +1315,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is unavailable.', }, # Pladform embed { @@ -1262,6 +1329,7 @@ class GenericIE(InfoExtractor): 'duration': 694, 'age_limit': 0, }, + 'skip': 'HTTP Error 404: Not Found', }, # Playwire embed { @@ -1282,6 +1350,14 @@ class GenericIE(InfoExtractor): 'id': '518726732', 'ext': 'mp4', 'title': 'Facebook Creates "On This Day" | Crunch Report', + 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', + 'timestamp': 1427237531, + 'uploader': 'Crunch Report', + 'upload_date': '20150324', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, # SVT embed @@ -1333,16 +1409,20 @@ class GenericIE(InfoExtractor): 'upload_date': '20140107', 'timestamp': 1389118457, }, + 'skip': 'Invalid Page URL', }, # NBC News embed { 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', 'md5': '1aa589c675898ae6d37a17913cf68d66', 'info_dict': { - 'id': '701714499682', + 'id': 'x_dtl_oa_LettermanliftPR_160608', 'ext': 'mp4', - 'title': 'PREVIEW: On Assignment: David Letterman', + 'title': 'David Letterman: A Preview', 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.', + 'upload_date': '20160609', + 'timestamp': 1465431544, + 'uploader': 'NBCU-NEWS', }, }, # UDN embed @@ -1359,6 +1439,7 @@ class GenericIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Failed to parse JSON Expecting value'], }, # Ooyala embed { @@ -1366,7 +1447,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', + 'description': 'Index/Match versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', 'duration': 191.933, }, @@ -1390,22 +1471,6 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # Dailymotion Cloud video - { - 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', - 'info_dict': { - 'id': 'x2uy8t3', - 'ext': 'mp4', - 'title': 'Sauvons les abeilles ! - Le débat', - 'description': 'md5:d9082128b1c5277987825d684939ca26', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1434970506, - 'upload_date': '20150622', - 'uploader': 'Public Sénat', - 'uploader_id': 'xa9gza', - } - }, # OnionStudios embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -1500,14 +1565,27 @@ class GenericIE(InfoExtractor): # LiveLeak embed { 'url': 'http://www.wykop.pl/link/3088787/', - 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d', + 'md5': '7619da8c820e835bef21a1efa2a0fc71', 'info_dict': { 'id': '874_1459135191', 'ext': 'mp4', 'title': 'Man shows poor quality of new apartment building', 'description': 'The wall is like a sand pile.', 'uploader': 'Lake8737', - } + }, + 'add_ie': [LiveLeakIE.ie_key()], + }, + # Another LiveLeak embed pattern (#13336) + { + 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/', + 'info_dict': { + 'id': '2eb_1496309988', + 'ext': 'mp4', + 'title': 'Thief robs place where everyone was armed', + 'description': 'md5:694d73ee79e535953cf2488562288eee', + 'uploader': 'brazilwtf', + }, + 'add_ie': [LiveLeakIE.ie_key()], }, # Duplicated embedded video URLs { @@ -1750,6 +1828,21 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 5, }, + { + # Limelight embed (LimelightPlayerUtil.embed) + 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri', + 'info_dict': { + 'id': '95d035dc5c8a401588e9c0e6bd1e9c92', + 'ext': 'mp4', + 'title': '07448641', + 'timestamp': 1499890639, + 'upload_date': '20170712', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['LimelightMedia'], + }, { 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', 'info_dict': { @@ -1806,6 +1899,78 @@ class GenericIE(InfoExtractor): 'title': 'Стас Намин: «Мы нарушили девственность Кремля»', }, }, + { + # vzaar embed + 'url': 'http://help.vzaar.com/article/165-embedding-video', + 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4', + 'info_dict': { + 'id': '8707641', + 'ext': 'mp4', + 'title': 'Building A Business Online: Principal Chairs Q & A', + }, + }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + }, + { + # vshare embed + 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', + 'md5': '17b39f55b5497ae8b59f5fbce8e35886', + 'info_dict': { + 'id': '0f64ce6', + 'title': 'vl14062007715967', + 'ext': 'mp4', + } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', + 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } + }, + { + 'url': 'http://www.kidzworld.com/article/30935-trolls-the-beat-goes-on-interview-skylar-astin-and-amanda-leighton', + 'info_dict': { + 'id': '1731611', + 'ext': 'mp4', + 'title': 'Official Trailer | TROLLS: THE BEAT GOES ON!', + 'description': 'md5:eb5f23826a027ba95277d105f248b825', + 'timestamp': 1516100691, + 'upload_date': '20180116', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [SpringboardPlatformIE.ie_key()], + }, + { + 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', + 'info_dict': { + 'id': 'uPDB5I9wfp8', + 'ext': 'webm', + 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', + 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', + 'upload_date': '20160219', + 'uploader': 'Pocoyo - Português (BR)', + 'uploader_id': 'PocoyoBrazil', + }, + 'add_ie': [YoutubeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1955,7 +2120,7 @@ class GenericIE(InfoExtractor): if head_response is not False: # Check for redirect - new_url = head_response.geturl() + new_url = compat_str(head_response.geturl()) if url != new_url: self.report_following_redirect(new_url) if force_videoid: @@ -2055,8 +2220,8 @@ class GenericIE(InfoExtractor): return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( - doc, video_id, - mpd_base_url=full_response.geturl().rpartition('/')[0], + doc, + mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) return info_dict @@ -2093,7 +2258,7 @@ class GenericIE(InfoExtractor): # And then there are the jokers who advertise that they use RTA, # but actually don't. AGE_LIMIT_MARKERS = [ - r'Proudly Labeled RTA', + r'Proudly Labeled RTA', ] if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): age_limit = 18 @@ -2131,7 +2296,20 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(self, webpage) if bc_urls: - return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') + return self.playlist_from_matches( + bc_urls, video_id, video_title, + getter=lambda x: smuggle_url(x, {'referrer': url}), + ie='BrightcoveNew') + + # Look for Nexx embeds + nexx_urls = NexxIE._extract_urls(webpage) + if nexx_urls: + return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) + + # Look for Nexx iFrame embeds + nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) + if nexx_embed_urls: + return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) @@ -2145,7 +2323,7 @@ class GenericIE(InfoExtractor): # Look for embedded rtl.nl player matches = re.findall( - r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', + r']+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', webpage) if matches: return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') @@ -2160,36 +2338,11 @@ class GenericIE(InfoExtractor): if vid_me_embed_url is not None: return self.url_result(vid_me_embed_url, 'Vidme') - # Look for embedded YouTube player - matches = re.findall(r'''(?x) - (?: - ]+?src=| - data-video-url=| - ]+?src=| - embedSWF\(?:\s*| - ]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) - \1''', webpage) - if matches: + # Look for YouTube embeds + youtube_urls = YoutubeIE._extract_urls(webpage) + if youtube_urls: return self.playlist_from_matches( - matches, video_id, video_title, lambda m: unescapeHTML(m[1])) - - # Look for lazyYT YouTube embed - matches = re.findall( - r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) - - # Look for Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)]+ - class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) + youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) matches = DailymotionIE._extract_urls(webpage) if matches: @@ -2262,6 +2415,7 @@ class GenericIE(InfoExtractor): # Look for Ooyala videos mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage) or + re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P.{32})[\'"]', webpage) or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: @@ -2568,7 +2722,7 @@ class GenericIE(InfoExtractor): # Look for UDN embeds mobj = re.search( - r']+src="(?P%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) + r']+src="(?:https?:)?(?P%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) if mobj is not None: return self.url_result( compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') @@ -2578,11 +2732,6 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') - # Look for Dailymotion Cloud videos - dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) - if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') - # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: @@ -2594,9 +2743,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) @@ -2686,9 +2835,9 @@ class GenericIE(InfoExtractor): self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds - liveleak_url = LiveLeakIE._extract_url(webpage) - if liveleak_url: - return self.url_result(liveleak_url, 'LiveLeak') + liveleak_urls = LiveLeakIE._extract_urls(webpage) + if liveleak_urls: + return self.playlist_from_matches(liveleak_urls, video_id, video_title) # Look for 3Q SDN embeds threeqsdn_url = ThreeQSDNIE._extract_url(webpage) @@ -2760,6 +2909,44 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( joj_urls, video_id, video_title, ie=JojIE.ie_key()) + # Look for megaphone.fm embeds + mpfn_urls = MegaphoneIE._extract_urls(webpage) + if mpfn_urls: + return self.playlist_from_matches( + mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) + + # Look for vzaar embeds + vzaar_urls = VzaarIE._extract_urls(webpage) + if vzaar_urls: + return self.playlist_from_matches( + vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + + channel9_urls = Channel9IE._extract_urls(webpage) + if channel9_urls: + return self.playlist_from_matches( + channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) + + vshare_urls = VShareIE._extract_urls(webpage) + if vshare_urls: + return self.playlist_from_matches( + vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) + + # Look for Mediasite embeds + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) + + springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) + if springboardplatform_urls: + return self.playlist_from_matches( + springboardplatform_urls, video_id, video_title, + ie=SpringboardPlatformIE.ie_key()) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): @@ -2775,22 +2962,23 @@ class GenericIE(InfoExtractor): merged[k] = v return merged - # Looking for http://schema.org/VideoObject - json_ld = self._search_json_ld( - webpage, video_id, default={}, expected_type='VideoObject') - if json_ld.get('url'): - return merge_dicts(json_ld, info_dict) - # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: - for entry in entries: - entry.update({ + if len(entries) == 1: + entries[0].update({ 'id': video_id, 'title': video_title, }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: self._sort_formats(entry['formats']) - return self.playlist_result(entries) + return self.playlist_result(entries, video_id, video_title) jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) @@ -2799,6 +2987,52 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False, base_url=url) return merge_dicts(info, info_dict) + # Video.js embed + mobj = re.search( + r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', + webpage) + if mobj is not None: + sources = self._parse_json( + mobj.group(1), video_id, transform_source=js_to_json, + fatal=False) or [] + if not isinstance(sources, list): + sources = [sources] + formats = [] + for source in sources: + src = source.get('src') + if not src or not isinstance(src, compat_str): + continue + src = compat_urlparse.urljoin(url, src) + src_type = source.get('type') + if isinstance(src_type, compat_str): + src_type = src_type.lower() + ext = determine_ext(src).lower() + if src_type == 'video/youtube': + return self.url_result(src, YoutubeIE.ie_key()) + if src_type == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + elif src_type == 'application/x-mpegurl' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': (mimetype2ext(src_type) or + ext if ext in KNOWN_EXTENSIONS else 'mp4'), + }) + if formats: + self._sort_formats(formats) + info_dict['formats'] = formats + return info_dict + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + return merge_dicts(json_ld, info_dict) + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2886,7 +3120,7 @@ class GenericIE(InfoExtractor): # be supported by youtube-dl thus this is checked the very last (see # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url: + if embed_url and embed_url != url: return self.url_result(embed_url) if not found: