X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=9b64988943b16bc93eedce5c878ebdc088d95906;hb=0551a02b82b4209422e0734150e0186210cd0723;hp=6961dfec76de7f9733cf5e7508f6870379f64a1e;hpb=aff216edf4aa7536924a7496686a7a6b6d0f15af;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 6961dfec7..9b6498894 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -12,6 +12,7 @@ from ..utils import ( compat_urlparse, compat_xml_parse_error, + determine_ext, ExtractorError, float_or_none, HEADRequest, @@ -27,6 +28,7 @@ from .brightcove import BrightcoveIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .smotri import SmotriIE +from .condenast import CondeNastIE class GenericIE(InfoExtractor): @@ -154,7 +156,6 @@ class GenericIE(InfoExtractor): # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'md5': '7cf780be104d40fea7bae52eed4a470e', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', @@ -179,13 +180,13 @@ class GenericIE(InfoExtractor): # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { - 'id': '981', + 'id': '1969', 'ext': 'mp4', - 'title': 'My web playroom', - 'uploader': 'Ze Frank', - 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embeded Ustream video @@ -225,21 +226,6 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' } }, - # smotri embed - { - 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', - 'md5': 'ec40048448e9284c9a1de77bb188108b', - 'info_dict': { - 'id': 'v27008541fad', - 'ext': 'mp4', - 'title': 'Крым и Севастополь вошли в состав России', - 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', - 'duration': 900, - 'upload_date': '20140318', - 'uploader': 'rbctv_2012_4', - 'uploader_id': 'rbctv_2012_4', - }, - }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -294,13 +280,13 @@ class GenericIE(InfoExtractor): { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { - 'id': 'jpSGZsgga_I', + 'id': '4vAffPZIT44', 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', - 'upload_date': '20130821', - 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, @@ -351,15 +337,62 @@ class GenericIE(InfoExtractor): 'description': 're:' }, 'playlist_mincount': 11, - } + }, + # Multiple brightcove videos + # https://github.com/rg3/youtube-dl/issues/2283 + { + 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', + 'info_dict': { + 'id': 'always-never', + 'title': 'Always / Never - The New Yorker', + }, + 'playlist_count': 3, + 'params': { + 'extract_flat': False, + 'skip_download': True, + } + }, + # MLB embed + { + 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', + 'md5': '96f09a37e44da40dd083e12d9a683327', + 'info_dict': { + 'id': '33322633', + 'ext': 'mp4', + 'title': 'Ump changes call to ball', + 'description': 'md5:71c11215384298a172a6dcb4c2e20685', + 'duration': 48, + 'timestamp': 1401537900, + 'upload_date': '20140531', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, + # Wistia embed + { + 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'info_dict': { + 'id': '1cfaf6b7ea', + 'ext': 'mov', + 'title': 'md5:51364a8d3d009997ba99656004b5e20d', + 'duration': 643.0, + 'filesize': 182808282, + 'uploader': 'education-portal.com', + }, + }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + }, + }, ] - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning('Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) @@ -454,11 +487,13 @@ class GenericIE(InfoExtractor): 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: - assert ':' in default_search + if ':' not in default_search: + default_search += ':' return self.url_result(default_search + url) url, smuggled_data = unsmuggle_url(url) force_videoid = None + is_intentional = smuggled_data and smuggled_data.get('to_generic') if smuggled_data and 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid @@ -501,6 +536,9 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } + if not self._downloader.params.get('test', False) and not is_intentional: + self._downloader.report_warning('Falling back on generic information extractor.') + try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -554,7 +592,9 @@ class GenericIE(InfoExtractor): # Helper method def _playlist_from_matches(matches, getter, ie=None): - urlrs = orderedSet(self.url_result(getter(m), ie) for m in matches) + urlrs = orderedSet( + self.url_result(self._proto_relative_url(getter(m)), ie) + for m in matches) return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) @@ -581,13 +621,13 @@ class GenericIE(InfoExtractor): if mobj: player_url = unescapeHTML(mobj.group('url')) surl = smuggle_url(player_url, {'Referer': url}) - return self.url_result(surl, 'Vimeo') + return self.url_result(surl) # Look for embedded (swf embed) Vimeo player mobj = re.search( - r']+?src="(https?://(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) + r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: - return self.url_result(mobj.group(1), 'Vimeo') + return self.url_result(mobj.group(1)) # Look for embedded YouTube player matches = re.findall(r'''(?x) @@ -598,12 +638,12 @@ class GenericIE(InfoExtractor): embedSWF\(?:\s* ) (["\']) - (?P(?:https?:)?//(?:www\.)?youtube\.com/ - (?:embed|v)/.+?) + (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/.+?) \1''', webpage) if matches: return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1]), ie='Youtube') + matches, lambda m: unescapeHTML(m[1])) # Look for embedded Dailymotion player matches = re.findall( @@ -612,18 +652,41 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) + # Look for embedded Dailymotion playlist player (#3822) + m = re.search( + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) + if m: + playlists = re.findall( + r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) + if playlists: + return _playlist_from_matches( + playlists, lambda p: '//dailymotion.com/playlist/%s' % p) + # Look for embedded Wistia player match = re.search( - r']+?src=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) if match: + embed_url = self._proto_relative_url( + unescapeHTML(match.group('url'))) return { '_type': 'url_transparent', - 'url': unescapeHTML(match.group('url')), + 'url': embed_url, 'ie_key': 'Wistia', 'uploader': video_uploader, 'title': video_title, 'id': video_id, } + + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) + if match: + return { + '_type': 'url_transparent', + 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')), + 'ie_key': 'Wistia', + 'uploader': video_uploader, + 'title': video_title, + 'id': match.group('id') + } # Look for embedded blip.tv player mobj = re.search(r']*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) @@ -794,54 +857,70 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'SBS') + mobj = re.search( + r']+?src=(["\'])(?Phttps?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'MLB') + + mobj = re.search( + r']+?src=(["\'])(?P%s)\1' % CondeNastIE.EMBED_URL, + webpage) + if mobj is not None: + return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') + + def check_video(vurl): + vpath = compat_urlparse.urlparse(vurl).path + vext = determine_ext(vpath) + return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml') + + def filter_video(urls): + return list(filter(check_video, urls)) + # Start with something easy: JW Player in SWFObject - found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) + found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) if not found: # Look for gorilla-vid style embedding - found = re.findall(r'''(?sx) + found = filter_video(re.findall(r'''(?sx) (?: jw_plugins| JWPlayerOptions| jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup ) - .*?file\s*:\s*["\'](.*?)["\']''', webpage) + .*?file\s*:\s*["\'](.*?)["\']''', webpage)) if not found: # Broaden the search a little bit - found = re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage) + found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) if not found: # Broaden the findall a little bit: JWPlayer JS loader - found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + found = filter_video(re.findall( + r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) if not found: # Flow player - found = re.findall(r'''(?xs) + found = filter_video(re.findall(r'''(?xs) flowplayer\("[^"]+",\s* \{[^}]+?\}\s*, \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] - ''', webpage) + ''', webpage)) if not found: # Try to find twitter cards info - found = re.findall(r'.*?]+)? src="([^"]+)"', webpage) if not found: found = re.search( r'(?i)