X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=9b64988943b16bc93eedce5c878ebdc088d95906;hb=0551a02b82b4209422e0734150e0186210cd0723;hp=8b2d1d033ab2f260e49b18c8ecc7d1d3fc695eaf;hpb=1f8b6af77341433603601473d33766005177dacd;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 8b2d1d033..9b6498894 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -8,12 +8,11 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( - compat_urllib_error, compat_urllib_parse, - compat_urllib_request, compat_urlparse, compat_xml_parse_error, + determine_ext, ExtractorError, float_or_none, HEADRequest, @@ -22,12 +21,14 @@ from ..utils import ( smuggle_url, unescapeHTML, unified_strdate, + unsmuggle_url, url_basename, ) from .brightcove import BrightcoveIE from .ooyala import OoyalaIE from .rutv import RUTVIE from .smotri import SmotriIE +from .condenast import CondeNastIE class GenericIE(InfoExtractor): @@ -155,7 +156,6 @@ class GenericIE(InfoExtractor): # funnyordie embed { 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns', - 'md5': '7cf780be104d40fea7bae52eed4a470e', 'info_dict': { 'id': '18e820ec3f', 'ext': 'mp4', @@ -180,13 +180,13 @@ class GenericIE(InfoExtractor): # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', - 'md5': 'deeeabcc1085eb2ba205474e7235a3d5', + 'md5': '65fdff94098e4a607385a60c5177c638', 'info_dict': { - 'id': '981', + 'id': '1969', 'ext': 'mp4', - 'title': 'My web playroom', - 'uploader': 'Ze Frank', - 'description': 'md5:ddb2a40ecd6b6a147e400e535874947b', + 'title': 'Hidden miracles of the natural world', + 'uploader': 'Louie Schwartzberg', + 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, # Embeded Ustream video @@ -226,21 +226,6 @@ class GenericIE(InfoExtractor): 'skip_download': 'Requires rtmpdump' 
} }, - # smotri embed - { - 'url': 'http://rbctv.rbc.ru/archive/news/562949990879132.shtml', - 'md5': 'ec40048448e9284c9a1de77bb188108b', - 'info_dict': { - 'id': 'v27008541fad', - 'ext': 'mp4', - 'title': 'Крым и Севастополь вошли в состав России', - 'description': 'md5:fae01b61f68984c7bd2fa741e11c3175', - 'duration': 900, - 'upload_date': '20140318', - 'uploader': 'rbctv_2012_4', - 'uploader_id': 'rbctv_2012_4', - }, - }, # Condé Nast embed { 'url': 'http://www.wired.com/2014/04/honda-asimo/', @@ -295,13 +280,13 @@ class GenericIE(InfoExtractor): { 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM', 'info_dict': { - 'id': 'jpSGZsgga_I', + 'id': '4vAffPZIT44', 'ext': 'mp4', - 'title': 'Asphalt 8: Airborne - Launch Trailer', + 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!', 'uploader': 'Gameloft', 'uploader_id': 'gameloft', - 'upload_date': '20130821', - 'description': 'md5:87bd95f13d8be3e7da87a5f2c443106a', + 'upload_date': '20140828', + 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4', }, 'params': { 'skip_download': True, @@ -330,71 +315,88 @@ class GenericIE(InfoExtractor): 'info_dict': { 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', } - } + }, + # Flowplayer + { + 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', + 'md5': '9d65602bf31c6e20014319c7d07fba27', + 'info_dict': { + 'id': '5123ea6d5e5a7', + 'ext': 'mp4', + 'age_limit': 18, + 'uploader': 'www.handjobhub.com', + 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', + } + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:' + }, + 'playlist_mincount': 11, + }, + # Multiple brightcove videos + # https://github.com/rg3/youtube-dl/issues/2283 + { + 'url': 
'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', + 'info_dict': { + 'id': 'always-never', + 'title': 'Always / Never - The New Yorker', + }, + 'playlist_count': 3, + 'params': { + 'extract_flat': False, + 'skip_download': True, + } + }, + # MLB embed + { + 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', + 'md5': '96f09a37e44da40dd083e12d9a683327', + 'info_dict': { + 'id': '33322633', + 'ext': 'mp4', + 'title': 'Ump changes call to ball', + 'description': 'md5:71c11215384298a172a6dcb4c2e20685', + 'duration': 48, + 'timestamp': 1401537900, + 'upload_date': '20140531', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, + # Wistia embed + { + 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': '8788b683c777a5cf25621eaf286d0c23', + 'info_dict': { + 'id': '1cfaf6b7ea', + 'ext': 'mov', + 'title': 'md5:51364a8d3d009997ba99656004b5e20d', + 'duration': 643.0, + 'filesize': 182808282, + 'uploader': 'education-portal.com', + }, + }, + { + 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', + 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', + 'info_dict': { + 'id': 'uxjb0lwrcz', + 'ext': 'mp4', + 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks', + 'duration': 1715.0, + 'uploader': 'thoughtworks.wistia.com', + }, + }, ] - def report_download_webpage(self, video_id): - """Report webpage download.""" - if not self._downloader.params.get('test', False): - self._downloader.report_warning('Falling back on generic information extractor.') - super(GenericIE, self).report_download_webpage(video_id) - def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def _send_head(self, url): - """Check if it is a redirect, like url shorteners, in case return the new url.""" - - class 
HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """ - Subclass the HTTPRedirectHandler to make it use our - HEADRequest also on the redirected URL - """ - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - try: - # This function was deprecated in python 3.3 and removed in 3.4 - origin_req_host = req.get_origin_req_host() - except AttributeError: - origin_req_host = req.origin_req_host - return HEADRequest(newurl, - headers=newheaders, - origin_req_host=origin_req_host, - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - - class HTTPMethodFallback(compat_urllib_request.BaseHandler): - """ - Fallback to GET if HEAD is not allowed (405 HTTP error) - """ - def http_error_405(self, req, fp, code, msg, headers): - fp.read() - fp.close() - - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) - - # Build our opener - opener = compat_urllib_request.OpenerDirector() - for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: - opener.add_handler(handler()) - - response = opener.open(HEADRequest(url)) - if response is None: - raise ExtractorError('Invalid URL protocol') - return response - def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -485,19 +487,35 @@ class GenericIE(InfoExtractor): 'Set --default-search 
"ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' ) % (url, url), expected=True) else: - assert ':' in default_search + if ':' not in default_search: + default_search += ':' return self.url_result(default_search + url) - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + + url, smuggled_data = unsmuggle_url(url) + force_videoid = None + is_intentional = smuggled_data and smuggled_data.get('to_generic') + if smuggled_data and 'force_videoid' in smuggled_data: + force_videoid = smuggled_data['force_videoid'] + video_id = force_videoid + else: + video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] self.to_screen('%s: Requesting header' % video_id) - try: - response = self._send_head(url) + head_req = HEADRequest(url) + response = self._request_webpage( + head_req, video_id, + note=False, errnote='Could not send HEAD request to %s' % url, + fatal=False) + if response is not False: # Check for redirect new_url = response.geturl() if url != new_url: self.report_following_redirect(new_url) + if force_videoid: + new_url = smuggle_url( + new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) # Check for direct link to a video @@ -518,9 +536,8 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } - except compat_urllib_error.HTTPError: - # This may be a stupid server that doesn't like HEAD, our UA, or so - pass + if not self._downloader.params.get('test', False) and not is_intentional: + self._downloader.report_warning('Falling back on generic information extractor.') try: webpage = self._download_webpage(url, video_id) @@ -559,13 +576,25 @@ class GenericIE(InfoExtractor): r'(?s)