X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=1b7697870bde93fd5bb4218d12eac8c166a306ba;hb=cc7fec5818254f4679896823c7de9d17f50201ca;hp=5a6fe65bcc102b79123d597567f9e17bc8b054cc;hpb=4d805e063c6c4ffd557d7c7cb905a3ed9c926b08;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5a6fe65bc..1b7697870 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -8,12 +8,11 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( - compat_urllib_error, compat_urllib_parse, - compat_urllib_request, compat_urlparse, compat_xml_parse_error, + determine_ext, ExtractorError, float_or_none, HEADRequest, @@ -343,7 +342,46 @@ class GenericIE(InfoExtractor): 'uploader': 'www.handjobhub.com', 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', } - } + }, + # RSS feed + { + 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'info_dict': { + 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'title': 'Zero Punctuation', + 'description': 're:' + }, + 'playlist_mincount': 11, + }, + # Multiple brightcove videos + # https://github.com/rg3/youtube-dl/issues/2283 + { + 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', + 'info_dict': { + 'id': 'always-never', + 'title': 'Always / Never - The New Yorker', + }, + 'playlist_count': 3, + 'params': { + 'extract_flat': False, + 'skip_download': True, + } + }, + # MLB embed + { + 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', + 'md5': '96f09a37e44da40dd083e12d9a683327', + 'info_dict': { + 'id': '33322633', + 'ext': 'mp4', + 'title': 'Ump changes call to ball', + 'description': 'md5:71c11215384298a172a6dcb4c2e20685', + 'duration': 48, + 'timestamp': 1401537900, + 'upload_date': '20140531', + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }, ] def report_download_webpage(self, video_id): @@ -356,58 +394,6 @@ class GenericIE(InfoExtractor): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def _send_head(self, url): - """Check if it is a redirect, like url shorteners, in case return the new url.""" - - class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler): - """ - Subclass the HTTPRedirectHandler to make it use our - HEADRequest also on the redirected URL - """ - def redirect_request(self, req, fp, code, msg, headers, newurl): - if code in (301, 302, 303, 307): - newurl = newurl.replace(' ', '%20') - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - try: - # This function was deprecated in python 3.3 and removed in 3.4 - origin_req_host = req.get_origin_req_host() - except AttributeError: - origin_req_host = req.origin_req_host - return HEADRequest(newurl, - headers=newheaders, - origin_req_host=origin_req_host, - unverifiable=True) - else: - raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp) - - class HTTPMethodFallback(compat_urllib_request.BaseHandler): - """ - Fallback to GET if HEAD is not allowed (405 HTTP error) - """ - def http_error_405(self, req, fp, code, msg, headers): - fp.read() - fp.close() - - newheaders = dict((k,v) for k,v in req.headers.items() - if k.lower() not in ("content-length", "content-type")) - return self.parent.open(compat_urllib_request.Request(req.get_full_url(), - headers=newheaders, - origin_req_host=req.get_origin_req_host(), - unverifiable=True)) - - # Build our opener - opener = compat_urllib_request.OpenerDirector() - for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler, - HTTPMethodFallback, HEADRedirectHandler, - compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]: - opener.add_handler(handler()) - - response = opener.open(HEADRequest(url)) - if response is None: - raise ExtractorError('Invalid URL protocol') - return response - def _extract_rss(self, url, video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -511,9 +497,13 @@ class GenericIE(InfoExtractor): self.to_screen('%s: Requesting header' % video_id) - try: - response = self._send_head(url) + head_req = HEADRequest(url) + response = self._request_webpage( + head_req, video_id, + note=False, errnote='Could not send HEAD request to %s' % url, + fatal=False) + if response is not False: # Check for redirect new_url = response.geturl() if url != new_url: @@ -541,10 +531,6 @@ class GenericIE(InfoExtractor): 'upload_date': upload_date, } - except compat_urllib_error.HTTPError: - # This may be a stupid server that doesn't like HEAD, our UA, or so - pass - try: webpage = self._download_webpage(url, video_id) except ValueError: @@ -838,6 +824,12 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url'), 'SBS') + mobj = re.search( + r']+?src=(["\'])(?Phttps?://m\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', + webpage) + if mobj is not None: + return self.url_result(mobj.group('url'), 'MLB') + # Start with something easy: JW Player in SWFObject found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if not found: @@ -863,7 +855,6 @@ class GenericIE(InfoExtractor): \s*{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage) - assert found if not found: # Try to find twitter cards info found = re.findall(r'.*?.*?]+)? src="([^"]+)"', webpage) if not found: found = re.search( r'(?i)