X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=40b2791c77351a8625894f709187b3ccfb8e1939;hb=416c7fcbce86324587afae11414c71ff603ad296;hp=42284f3d5e7bde5d61dff40fa054b417e48412cf;hpb=37d66e7f1e5d325802781c69042571968afa1da0;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 42284f3d5..40b2791c7 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -7,11 +7,12 @@ import re from .common import InfoExtractor from .youtube import YoutubeIE -from ..utils import ( +from ..compat import ( compat_urllib_parse, compat_urlparse, compat_xml_parse_error, - +) +from ..utils import ( determine_ext, ExtractorError, float_or_none, @@ -22,6 +23,7 @@ from ..utils import ( unescapeHTML, unified_strdate, unsmuggle_url, + UnsupportedError, url_basename, ) from .brightcove import BrightcoveIE @@ -99,6 +101,22 @@ class GenericIE(InfoExtractor): 'uploader': 'Championat', }, }, + { + # https://github.com/rg3/youtube-dl/issues/3541 + 'add_ie': ['Brightcove'], + 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', + 'info_dict': { + 'id': '3866516442001', + 'ext': 'mp4', + 'title': 'Leer mij vrouwen kennen: Aflevering 1', + 'description': 'Leer mij vrouwen kennen: Aflevering 1', + 'uploader': 'SBS Broadcasting', + }, + 'skip': 'Restricted to Netherlands', + 'params': { + 'skip_download': True, # m3u8 download + }, + }, # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', @@ -325,7 +343,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'age_limit': 18, 'uploader': 'www.handjobhub.com', - 'title': 'Busty Blonde Siri Tit Fuck While Wank at Handjob Hub', + 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, # RSS feed @@ -405,7 +423,62 @@ class GenericIE(InfoExtractor): 'expected_warnings': [ r'501.*Not Implemented' ], - } + }, + # Soundcloud embed + { + 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/', + 'info_dict': { + 'id': '174391317', + 'ext': 'mp3', + 'description': 'md5:ff867d6b555488ad3c52572bb33d432c', + 'uploader': 'Sophos Security', + 'title': 'Chet Chat 171 - Oct 29, 2014', + 'upload_date': '20141029', + } + }, + # Livestream embed + { + 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', + 'info_dict': { + 'id': '67864563', + 'ext': 'flv', + 'upload_date': '20141112', + 'title': 'Rosetta #CometLanding webcast HL 10', + } + }, + # LazyYT + { + 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986', + 'info_dict': { + 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', + }, + 'playlist_mincount': 2, + }, + # Direct link with incorrect MIME type + { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'md5': '4ccbebe5f36706d85221f204d7eb5913', + 'info_dict': { + 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', + 'id': '5_Lennart_Poettering_-_Systemd', + 'ext': 'webm', + 'title': '5_Lennart_Poettering_-_Systemd', + 'upload_date': '20141120', + }, + 'expected_warnings': [ + 'URL could be a direct video link, returning it as such.' + ] + }, + # Cinchcast embed + { + 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/', + 'info_dict': { + 'id': '7141703', + 'ext': 'mp3', + 'upload_date': '20141126', + 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing', + } + }, ] def report_following_redirect(self, new_url): @@ -498,9 +571,9 @@ class GenericIE(InfoExtractor): if default_search in ('error', 'fixup_error'): raise ExtractorError( - ('%r is not a valid URL. ' - 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' - ) % (url, url), expected=True) + '%r is not a valid URL. ' + 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube' + % (url, url), expected=True) else: if ':' not in default_search: default_search += ':' @@ -547,6 +620,7 @@ class GenericIE(InfoExtractor): return { 'id': video_id, 'title': os.path.splitext(url_basename(url))[0], + 'direct': True, 'formats': [{ 'format_id': m.group('format_id'), 'url': url, @@ -558,10 +632,28 @@ class GenericIE(InfoExtractor): if not self._downloader.params.get('test', False) and not is_intentional: self._downloader.report_warning('Falling back on generic information extractor.') - if full_response: - webpage = self._webpage_read_content(full_response, url, video_id) - else: - webpage = self._download_webpage(url, video_id) + if not full_response: + full_response = self._request_webpage(url, video_id) + + # Maybe it's a direct link to a video? + # Be careful not to download the whole thing! + first_bytes = full_response.read(512) + if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): + self._downloader.report_warning( + 'URL could be a direct video link, returning it as such.') + upload_date = unified_strdate( + head_response.headers.get('Last-Modified')) + return { + 'id': video_id, + 'title': os.path.splitext(url_basename(url))[0], + 'direct': True, + 'url': url, + 'upload_date': upload_date, + } + + webpage = self._webpage_read_content( + full_response, url, video_id, prefix=first_bytes) + self.report_extraction(video_id) # Is it an RSS feed? @@ -662,6 +754,12 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) + # Look for lazyYT YouTube embed + matches = re.findall( + r'class="lazyYT" data-youtube-id="([^"]+)"', webpage) + if matches: + return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) + # Look for embedded Dailymotion player matches = re.findall( r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) @@ -693,7 +791,7 @@ class GenericIE(InfoExtractor): 'title': video_title, 'id': video_id, } - + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P[^"\']+)', webpage) if match: return { @@ -708,7 +806,7 @@ class GenericIE(InfoExtractor): # Look for embedded blip.tv player mobj = re.search(r']*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage) if mobj: - return self.url_result('http://blip.tv/a/a-'+mobj.group(1), 'BlipTV') + return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV') mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage) if mobj: return self.url_result(mobj.group(1), 'BlipTV') @@ -744,7 +842,7 @@ class GenericIE(InfoExtractor): # Look for Ooyala videos mobj = (re.search(r'player.ooyala.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P[^"&]+)', webpage) or - re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage)) + re.search(r'OO.Player.create\([\'"].*?[\'"],\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: return OoyalaIE._build_url_result(mobj.group('ec')) @@ -838,7 +936,7 @@ class GenericIE(InfoExtractor): # Look for embeded soundcloud player mobj = re.search( - r'