X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=2bfa20606cd7846b0d15e8c441de3fce2a8982f6;hb=fc96eb4e2180d9a2371d84daa4305a5f34f12321;hp=5bd315051e7cfe2f003c675e0142fdf6234fc566;hpb=fa8deaf38b0d576d693a6565dcdb3b29877a4c94;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5bd315051..2bfa20606 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -8,12 +8,11 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( - compat_urllib_error, compat_urllib_parse, - compat_urllib_request, compat_urlparse, compat_xml_parse_error, + determine_ext, ExtractorError, float_or_none, HEADRequest, @@ -22,6 +21,7 @@ from ..utils import ( smuggle_url, unescapeHTML, unified_strdate, + unsmuggle_url, url_basename, ) from .brightcove import BrightcoveIE @@ -330,7 +330,58 @@ class GenericIE(InfoExtractor): 'info_dict': { 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', } - } + }, + # Flowplayer + { + 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', + 'md5': '9d65602bf31c6e20014319c7d07fba27', + 'info_dict': { + 'id': '5123ea6d5e5a7', + 'ext': 'mp4', + 'age_limit': 18, + 'uploader': 'www.handjobhub.com', + 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', + } + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:' + }, + 'playlist_mincount': 11, + }, + # Multiple brightcove videos + # https://github.com/rg3/youtube-dl/issues/2283 + { + 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', + 'info_dict': { + 'id': 'always-never', + 'title': 'Always / Never - The New Yorker', + }, + 'playlist_count': 3, + 'params': { + 'extract_flat': False, + 'skip_download': True, + } + }, + # MLB embed + { + 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', + 'md5': '96f09a37e44da40dd083e12d9a683327', + 'info_dict': { + 'id': '33322633', + 'ext': 'mp4', + 'title': 'Ump changes call to ball', + 'description': 'md5:71c11215384298a172a6dcb4c2e20685', + 'duration': 48, + 'timestamp': 1401537900, + 'upload_date': '20140531', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, ] def report_download_webpage(self, video_id): @@ -343,58 +394,6 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def _send_head(self, url): - """Check if it is a redirect, like url shorteners, in case return the new url.""" - - class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """ - Subclass the HTTPRedirectHandler to make it use our - HEADRequest also on the redirected URL - """ - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - try: - # This function was deprecated in python 3.3 and removed in 3.4 - origin_req_host = req.get_origin_req_host() - except AttributeError: - origin_req_host = req.origin_req_host - return HEADRequest(newurl, - headers=newheaders, - origin_req_host=origin_req_host, - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - - class HTTPMethodFallback(compat_urllib_request.BaseHandler): - """ - Fallback to GET if HEAD is not allowed (405 HTTP error) - """ - def http_error_405(self, req, fp, code, msg, headers): - fp.read() - fp.close() - - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) - - # Build our opener - opener = compat_urllib_request.OpenerDirector() - for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: - opener.add_handler(handler()) - - response = opener.open(HEADRequest(url)) - if response is None: - raise ExtractorError('Invalid URL protocol') - return response - def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -487,17 +486,31 @@ class GenericIE(InfoExtractor): else: assert ':' in default_search return self.url_result(default_search + url) - video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] + + url, smuggled_data = unsmuggle_url(url) + force_videoid = None + if smuggled_data and 'force_videoid' in smuggled_data: + force_videoid = smuggled_data['force_videoid'] + video_id = force_videoid + else: + video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0] self.to_screen('%s: Requesting header' % video_id) - try: - response = self._send_head(url) + head_req = HEADRequest(url) + response = self._request_webpage( + head_req, video_id, + note=False, errnote='Could not send HEAD request to %s' % url, + fatal=False) + if response is not False: # Check for redirect new_url = response.geturl() if url != new_url: self.report_following_redirect(new_url) + if force_videoid: + new_url = smuggle_url( + new_url, {'force_videoid': force_videoid}) return self.url_result(new_url) # Check for direct link to a video @@ -518,10 +531,6 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } - except compat_urllib_error.HTTPError: - # This may be a stupid server that doesn't like HEAD, our UA, or so - pass - try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -559,6 +568,16 @@ class GenericIE(InfoExtractor): r'(?s)(.*?)', webpage, 'video title', default='video') + # Try to detect age limit automatically + age_limit = self._rta_search(webpage) + # And then there are the jokers who advertise that they use RTA, + # but actually don't. + AGE_LIMIT_MARKERS = [ + r'Proudly Labeled RTA', + ] + if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): + age_limit = 18 + # video uploader is domain name video_uploader = self._search_regex( r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') @@ -609,7 +628,7 @@ class GenericIE(InfoExtractor): embedSWF\(?:\s* ) (["\']) - (?P(?:https?:)?//(?:www\.)?youtube\.com/ + (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ (?:embed|v)/.+?) \1''', webpage) if matches: @@ -640,7 +659,7 @@ class GenericIE(InfoExtractor): mobj = re.search(r']*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) if mobj: return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') - mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9]+)', webpage) + mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) if mobj: return self.url_result(mobj.group(1), 'BlipTV') @@ -805,6 +824,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'SBS') + mobj = re.search( + r']+?src=(["\'])(?Phttps?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'MLB') + # Start with something easy: JW Player in SWFObject found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if not found: @@ -822,6 +847,14 @@ class GenericIE(InfoExtractor): if not found: # Broaden the findall a little bit: JWPlayer JS loader found = re.findall(r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage) + if not found: + # Flow player + found = re.findall(r'''(?xs) + flowplayer\("[^"]+",\s* + \{[^}]+?\}\s*, + \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* + ["']?url["']?\s*:\s*["']([^"']+)["'] + ''', webpage) if not found: # Try to find twitter cards info found = re.findall(r'.*?.*?]+)? src="([^"]+)"', webpage) if not found: found = re.search( r'(?i)