X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fgeneric.py;h=b4138381d115a78f3203f6ef0017fd83a8df41d1;hb=c93b4eacebe62e3993da790cbea8fbf62e161f07;hp=5b22b6b5e4568fbe60845491845c468f3606d262;hpb=b8f67449ecafa76d4d925c7ad72f9c8f8338aba7;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5b22b6b5e..b4138381d 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -51,7 +51,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE from .onionstudios import OnionStudiosIE -from .snagfilms import SnagFilmsEmbedIE +from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE from .pladform import PladformIE @@ -61,6 +61,8 @@ from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE +from .threeqsdn import ThreeQSDNIE +from .theplatform import ThePlatformIE class GenericIE(InfoExtractor): @@ -105,7 +107,8 @@ class GenericIE(InfoExtractor): 'skip_download': True, # infinite live stream }, 'expected_warnings': [ - r'501.*Not Implemented' + r'501.*Not Implemented', + r'400.*Bad Request', ], }, # Direct link with incorrect MIME type @@ -236,6 +239,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', + 'upload_date': '20130904', }, 'params': { 'format': 'bestvideo', @@ -595,7 +599,11 @@ class GenericIE(InfoExtractor): 'id': 'k2mm4bCdJ6CQ2i7c8o2', 'ext': 'mp4', 'title': 'Le Zap de Spi0n n°216 - Zapping du Web', + 'description': 'md5:faf028e48a461b8b7fad38f1e104b119', 'uploader': 'Spi0n', + 'uploader_id': 'xgditw', + 'upload_date': '20140425', + 'timestamp': 1398441542, }, 'add_ie': ['Dailymotion'], }, @@ -710,15 +718,18 @@ class GenericIE(InfoExtractor): }, # Wistia embed { - 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', 'info_dict': { - 'id': '1cfaf6b7ea', + 'id': '6e2wtrbdaf', 'ext': 'mov', - 'title': 'md5:51364a8d3d009997ba99656004b5e20d', - 'duration': 643.0, - 'filesize': 182808282, - 'uploader': 'education-portal.com', + 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', + 'description': 'a Paywall Videos video from Remilon', + 'duration': 644.072, + 'uploader': 'study.com', + 'timestamp': 1459678540, + 'upload_date': '20160403', + 'filesize': 24687186, }, }, { @@ -727,11 +738,30 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': 'uxjb0lwrcz', 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'title': 'Conversation about Hexagonal Rails Part 1', + 'description': 'a Martin Fowler video from ThoughtWorks', 'duration': 1715.0, 'uploader': 'thoughtworks.wistia.com', + 'timestamp': 1401832161, + 'upload_date': '20140603', }, }, + # Wistia standard embed (async) + { + 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', + 'info_dict': { + 'id': '807fafadvk', + 'ext': 'mp4', + 'title': 'Drip Brennan Dunn Workshop', + 'description': 'a JV Webinars video from getdrip-1', + 'duration': 4986.95, + 'timestamp': 1463607249, + 'upload_date': '20160518', + }, + 'params': { + 'skip_download': True, + } + }, # Soundcloud embed { 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', @@ -754,6 +784,19 @@ class GenericIE(InfoExtractor): 'title': 'Rosetta #CometLanding webcast HL 10', } }, + # Another Livestream embed, without 'new.' in URL + { + 'url': 'https://www.freespeech.org/', + 'info_dict': { + 'id': '123537347', + 'ext': 'mp4', + 'title': 're:^FSTV [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # Live stream + 'skip_download': True, + }, + }, # LazyYT { 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', @@ -838,18 +881,6 @@ class GenericIE(InfoExtractor): 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !', } }, - # Kaltura embed - { - 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15', - 'info_dict': { - 'id': '1_eergr3h1', - 'ext': 'mp4', - 'upload_date': '20150226', - 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com', - 'timestamp': int, - 'title': 'John Carlson Postgame 2/25/15', - }, - }, # Kaltura embed (different embed code) { 'url': 'http://www.premierchristianradio.com/Shows/Saturday/Unbelievable/Conference-Videos/Os-Guinness-Is-It-Fools-Talk-Unbelievable-Conference-2014', @@ -875,9 +906,23 @@ class GenericIE(InfoExtractor): 'uploader_id': 'echojecka', }, }, + # Kaltura embed with single quotes + { + 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', + 'info_dict': { + 'id': '0_izeg5utt', + 'ext': 'mp4', + 'title': '35871', + 'timestamp': 1355743100, + 'upload_date': '20121217', + 'uploader_id': 'batchUser', + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '227304', 'ext': 'mp4', @@ -892,6 +937,7 @@ class GenericIE(InfoExtractor): # ClipYou (Eagle.Platform) embed (custom URL) { 'url': 'http://muz-tv.ru/play/7129/', + # Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used 'info_dict': { 'id': '12820', 'ext': 'mp4', @@ -980,18 +1026,25 @@ class GenericIE(InfoExtractor): 'ext': 'flv', 'title': "PFT Live: New leader in the 'new-look' defense", 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e', + 'uploader': 'NBCU-SPORTS', + 'upload_date': '20140107', + 'timestamp': 1389118457, }, }, # UDN embed { - 'url': 'http://www.udn.com/news/story/7314/822787', + 'url': 'https://video.udn.com/news/300346', 'md5': 'fd2060e988c326991037b9aff9df21a6', 'info_dict': { 'id': '300346', 'ext': 'mp4', 'title': '中一中男師變性 全校師生力挺', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, # Ooyala embed { @@ -1032,6 +1085,9 @@ class GenericIE(InfoExtractor): 'title': 'SN Presents: Russell Martin, World Citizen', 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'uploader': 'Rogers Sportsnet', + 'uploader_id': '1704050871', + 'upload_date': '20150525', + 'timestamp': 1432570283, }, }, # Dailymotion Cloud video @@ -1123,6 +1179,9 @@ class GenericIE(InfoExtractor): 'title': 'The Cardinal Pell Interview', 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', 'uploader': 'GlobeCast Australia - GlobeStream', + 'uploader_id': '2733773828001', + 'upload_date': '20160304', + 'timestamp': 1457083087, }, 'params': { # m3u8 downloads @@ -1153,6 +1212,16 @@ class GenericIE(InfoExtractor): 'uploader': 'Lake8737', } }, + # Duplicated embedded video URLs + { + 'url': 'http://www.hudl.com/athlete/2538180/highlights/149298443', + 'info_dict': { + 'id': '149298443_480_16c25b74_2', + 'ext': 'mp4', + 'title': 'vs. Blue Orange Spring Game', + 'uploader': 'www.hudl.com', + }, + }, ] def report_following_redirect(self, new_url): @@ -1407,7 +1476,8 @@ class GenericIE(InfoExtractor): # Site Name | Video Title # Video Title - Tagline | Site Name # and so on and so forth; it's just not practical - video_title = self._html_search_regex( + video_title = self._og_search_title( + webpage, default=None) or self._html_search_regex( r'(?s)(.*?)', webpage, 'video title', default='video') @@ -1425,6 +1495,9 @@ class GenericIE(InfoExtractor): video_uploader = self._search_regex( r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') + video_description = self._og_search_description(webpage, default=None) + video_thumbnail = self._og_search_thumbnail(webpage, default=None) + # Helper method def _playlist_from_matches(matches, getter=None, ie=None): urlrs = orderedSet( @@ -1455,6 +1528,11 @@ class GenericIE(InfoExtractor): if bc_urls: return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + # Look for ThePlatform embeds + tp_urls = ThePlatformIE._extract_urls(webpage) + if tp_urls: + return _playlist_from_matches(tp_urls, ie='ThePlatform') + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1523,21 +1601,26 @@ class GenericIE(InfoExtractor): 'url': embed_url, 'ie_key': 'Wistia', 'uploader': video_uploader, - 'title': video_title, - 'id': video_id, } match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) if match: return { '_type': 'url_transparent', - 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), + 'url': 'wistia:%s' % match.group('id'), 'ie_key': 'Wistia', 'uploader': video_uploader, - 'title': video_title, - 'id': match.group('id') } + match = re.search( + r'''(?sx) + ]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? + ]+class=(["']).*?\bwistia_async_(?P[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return self.url_result(self._proto_relative_url( + 'wistia:%s' % match.group('id')), 'Wistia') + # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: @@ -1813,7 +1896,7 @@ class GenericIE(InfoExtractor): return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') mobj = re.search( - r']+src="(?Phttps?://new\.livestream\.com/[^"]+/player[^"]+)"', + r']+src="(?Phttps?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', webpage) if mobj is not None: return self.url_result(mobj.group('url'), 'Livestream') @@ -1825,7 +1908,7 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P[^']+)',.*?'entry_?[Ii]d'\s*:\s*'(?P[^']+)',", webpage) or + mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P['\"])wid(?P=q1)\s*:\s*(?P['\"])_?(?P[^'\"]+)(?P=q2),.*?(?P['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P['\"])(?P[^'\"]+)(?P=q4),", webpage) or re.search(r'(?s)(?P["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P["\'])(?P.+?)(?P=q2)', webpage)) if mobj is not None: return self.url_result(smuggle_url( @@ -1904,10 +1987,10 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) - # Look for SnagFilms embeds - snagfilms_url = SnagFilmsEmbedIE._extract_url(webpage) - if snagfilms_url: - return self.url_result(snagfilms_url) + # Look for ViewLift embeds + viewlift_url = ViewLiftEmbedIE._extract_url(webpage) + if viewlift_url: + return self.url_result(viewlift_url) # Look for JWPlatform embeds jwplatform_url = JWPlatformIE._extract_url(webpage) @@ -1955,13 +2038,27 @@ class GenericIE(InfoExtractor): # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: - return self.url_result(instagram_embed_url, InstagramIE.ie_key()) + return self.url_result( + self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) # Look for LiveLeak embeds liveleak_url = LiveLeakIE._extract_url(webpage) if liveleak_url: return self.url_result(liveleak_url, 'LiveLeak') + # Look for 3Q SDN embeds + threeqsdn_url = ThreeQSDNIE._extract_url(webpage) + if threeqsdn_url: + return { + '_type': 'url_transparent', + 'ie_key': ThreeQSDNIE.ie_key(), + 'url': self._proto_relative_url(threeqsdn_url), + 'title': video_title, + 'description': video_description, + 'thumbnail': video_thumbnail, + 'uploader': video_uploader, + } + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2042,7 +2139,8 @@ class GenericIE(InfoExtractor): raise UnsupportedError(url) entries = [] - for video_url in found: + for video_url in orderedSet(found): + video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') video_url = compat_urlparse.urljoin(url, video_url) video_id = compat_urllib_parse_unquote(os.path.basename(video_url))