X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=cb6308d29e16989b46c2a2036f610f5e9083eb53;hb=a3ccd6bd11454b9760ef2c5f09f02f3afdb11af5;hp=a3ac7d26bd79d4719edfbd3f273d3297d7d6d2a5;hpb=16e2c8f7710bffb462921dbc93adfa6274bd9334;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a3ac7d26b..cb6308d29 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -20,6 +20,7 @@ from ..utils import ( float_or_none, HEADRequest, is_html, + js_to_json, orderedSet, sanitized_Request, smuggle_url, @@ -29,6 +30,7 @@ from ..utils import ( UnsupportedError, xpath_text, ) +from .commonprotocols import RtmpIE from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, @@ -78,6 +80,11 @@ from .vbox7 import Vbox7IE from .dbtv import DBTVIE from .piksel import PikselIE from .videa import VideaIE +from .twentymin import TwentyMinutenIE +from .ustream import UstreamIE +from .openload import OpenloadIE +from .videopress import VideoPressIE +from .rutube import RutubeIE class GenericIE(InfoExtractor): @@ -442,6 +449,23 @@ class GenericIE(InfoExtractor): }, }], }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -587,17 +611,6 @@ class GenericIE(InfoExtractor): 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, - # Embedded Ustream video - { - 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', - 'md5': '27b99cdb639c9b12a79bca876a073417', - 'info_dict': { - 'id': '45734260', - 'ext': 'flv', - 'uploader': 'AU SPA: The NSA and Privacy', - 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman' - } - }, # nowvideo embed hidden behind percent encoding { 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/', @@ -954,6 +967,29 @@ class GenericIE(InfoExtractor): 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', }, }, + # jwplayer rtmp + { + 'url': 'http://www.suffolk.edu/sjc/', + 'info_dict': { + 'id': 'sjclive', + 'ext': 'flv', + 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', + 'uploader': 'www.suffolk.edu', + }, + 'params': { + 'skip_download': True, + } + }, + # Complex jwplayer + { + 'url': 'http://www.indiedb.com/games/king-machine/videos', + 'info_dict': { + 'id': 'videos', + 'ext': 'mp4', + 'title': 'king machine trailer 1', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', @@ -984,19 +1020,6 @@ class GenericIE(InfoExtractor): 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', }, }, - # Kaltura embed protected with referrer - { - 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', - 'info_dict': { - 'id': '1_g4fbemnq', - 'ext': 'mp4', - 'title': 'Violetta - Achter De Schermen - Ruggero', - 'description': 'Achter de schermen met Ruggero', - 'timestamp': 1435133761, - 'upload_date': '20150624', - 'uploader_id': 'echojecka', - }, - }, # Kaltura embed with single quotes { 'url': 'http://fod.infobase.com/p_ViewPlaylist.aspx?AssignmentID=NUN8ZY', @@ -1468,6 +1491,68 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 2, }, + { + # 20 minuten embed + 'url': 'http://www.20min.ch/schweiz/news/story/So-kommen-Sie-bei-Eis-und-Schnee-sicher-an-27032552', + 'info_dict': { + 'id': '523629', + 'ext': 'mp4', + 'title': 'So kommen Sie bei Eis und Schnee sicher an', + 'description': 'md5:117c212f64b25e3d95747e5276863f7d', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TwentyMinutenIE.ie_key()], + }, + { + # VideoPress embed + 'url': 'https://en.support.wordpress.com/videopress/', + 'info_dict': { + 'id': 'OcobLTqC', + 'ext': 'm4v', + 'title': 'IMG_5786', + 'timestamp': 1435711927, + 'upload_date': '20150701', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [VideoPressIE.ie_key()], + }, + { + # Rutube embed + 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', + 'info_dict': { + 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', + 'ext': 'flv', + 'title': 'Магаззино: Казань 2', + 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', + 'uploader': 'Магаззино', + 'upload_date': '20170228', + 'uploader_id': '996642', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [RutubeIE.ie_key()], + }, + { + # ThePlatform embedded with whitespaces in URLs + 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', + 'only_matching': True, + }, + { + # Senate ISVP iframe https + 'url': 'https://www.hsgac.senate.gov/hearings/canadas-fast-track-refugee-plan-unanswered-questions-and-implications-for-us-national-security', + 'md5': 'fb8c70b0b515e5037981a2492099aab8', + 'info_dict': { + 'id': 'govtaff020316', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + }, + 'add_ie': [SenateISVPIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1767,14 +1852,6 @@ class GenericIE(InfoExtractor): video_description = self._og_search_description(webpage, default=None) video_thumbnail = self._og_search_thumbnail(webpage, default=None) - # Helper method - def _playlist_from_matches(matches, getter=None, ie=None): - urlrs = orderedSet( - self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) - for m in matches) - return self.playlist_result( - urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -1795,28 +1872,28 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(webpage) if bc_urls: - return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') # Look for ThePlatform embeds tp_urls = ThePlatformIE._extract_urls(webpage) if tp_urls: - return _playlist_from_matches(tp_urls, ie='ThePlatform') + return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') # Look for Vessel embeds vessel_urls = VesselIE._extract_urls(webpage) if vessel_urls: - return _playlist_from_matches(vessel_urls, ie=VesselIE.ie_key()) + return self.playlist_from_matches(vessel_urls, video_id, video_title, ie=VesselIE.ie_key()) # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, ie='RtlNl') + return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') vimeo_urls = VimeoIE._extract_urls(url, webpage) if vimeo_urls: - return _playlist_from_matches(vimeo_urls, ie=VimeoIE.ie_key()) + return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) vid_me_embed_url = self._search_regex( r'src=[\'"](https?://vid\.me/[^\'"]+)[\'"]', @@ -1838,25 +1915,25 @@ class GenericIE(InfoExtractor): (?:embed|v|p)/.+?) \1''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return self.playlist_from_matches( + matches, video_id, video_title, lambda m: unescapeHTML(m[1])) # Look for lazyYT YouTube embed matches = re.findall( r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) if matches: - return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m)) # Look for Wordpress "YouTube Video Importer" plugin matches = re.findall(r'''(?x)]+ class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches(matches, lambda m: m[-1]) + return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1]) matches = DailymotionIE._extract_urls(webpage) if matches: - return _playlist_from_matches(matches) + return self.playlist_from_matches(matches, video_id, video_title) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1865,8 +1942,8 @@ class GenericIE(InfoExtractor): playlists = re.findall( r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) if playlists: - return _playlist_from_matches( - playlists, lambda p: '//dailymotion.com/playlist/%s' % p) + return self.playlist_from_matches( + playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player match = re.search( @@ -1973,8 +2050,9 @@ class GenericIE(InfoExtractor): if mobj is not None: embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: - return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') + return self.playlist_from_matches( + embeds, video_id, video_title, + getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'