X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgeneric.py;h=45adbb7a3947c9f3f6a3522864ac5fe370cdc254;hb=d800609c62703e4e6edd2891a8432306462e4db3;hp=5f61ab26fe1a550c84081aec82f7f33d7e98dfa7;hpb=1c31a5b0e029ae47415f7df7bea30e2b2b5df64f;p=youtube-dl diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 5f61ab26f..45adbb7a3 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -11,7 +11,6 @@ from .youtube import YoutubeIE from ..compat import ( compat_etree_fromstring, compat_urllib_parse_unquote, - compat_urllib_request, compat_urlparse, compat_xml_parse_error, ) @@ -22,6 +21,7 @@ from ..utils import ( HEADRequest, is_html, orderedSet, + sanitized_Request, smuggle_url, unescapeHTML, unified_strdate, @@ -30,7 +30,10 @@ from ..utils import ( url_basename, xpath_text, ) -from .brightcove import BrightcoveIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) from .nbc import NBCSportsVPlayerIE from .ooyala import OoyalaIE from .rutv import RUTVIE @@ -41,7 +44,6 @@ from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE from .senateisvp import SenateISVPIE -from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -51,6 +53,11 @@ from .onionstudios import OnionStudiosIE from .snagfilms import SnagFilmsEmbedIE from .screenwavemedia import ScreenwaveMediaIE from .mtv import MTVServicesEmbeddedIE +from .pladform import PladformIE +from .videomore import VideomoreIE +from .googledrive import GoogleDriveIE +from .jwplatform import JWPlatformIE +from .digiteka import DigitekaIE class GenericIE(InfoExtractor): @@ -217,6 +224,20 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # MPD from http://dash-mse-test.appspot.com/media.html + { + 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', + 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', + 'info_dict': { + 'id': 'car-20120827-manifest', + 'ext': 'mp4', + 'title': 'car-20120827-manifest', + 'formats': 'mincount:9', + }, + 'params': { + 'format': 'bestvideo', + }, + }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', @@ -275,7 +296,7 @@ class GenericIE(InfoExtractor): # it also tests brightcove videos that need to set the 'Referer' in the # http requests { - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', @@ -299,7 +320,7 @@ class GenericIE(InfoExtractor): 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', @@ -314,7 +335,7 @@ class GenericIE(InfoExtractor): }, { # https://github.com/rg3/youtube-dl/issues/3541 - 'add_ie': ['Brightcove'], + 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', @@ -336,6 +357,7 @@ class GenericIE(InfoExtractor): 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ', 'ext': 'mp4', 'title': '2cc213299525360.mov', # that's what we get + 'duration': 238.231, }, 'add_ie': ['Ooyala'], }, @@ -347,6 +369,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '"Steve Jobs: Man in the Machine" trailer', 'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."', + 'duration': 135.427, }, 'params': { 'skip_download': True, @@ -478,7 +501,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9', } }, - # Embeded Ustream video + # Embedded Ustream video { 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm', 'md5': '27b99cdb639c9b12a79bca876a073417', @@ -820,6 +843,19 @@ class GenericIE(InfoExtractor): 'title': 'Os Guinness // Is It Fools Talk? // Unbelievable? Conference 2014', }, }, + # Kaltura embed protected with referrer + { + 'url': 'http://www.disney.nl/disney-channel/filmpjes/achter-de-schermen#/videoId/violetta-achter-de-schermen-ruggero', + 'info_dict': { + 'id': '1_g4fbemnq', + 'ext': 'mp4', + 'title': 'Violetta - Achter De Schermen - Ruggero', + 'description': 'Achter de schermen met Ruggero', + 'timestamp': 1435133761, + 'upload_date': '20150624', + 'uploader_id': 'echojecka', + }, + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -944,8 +980,9 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: Index/Match versus VLOOKUP.', + 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', + 'duration': 191.933, }, 'params': { # m3u8 downloads @@ -1031,6 +1068,31 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'cinemasnob', }, + }, + # BrightcoveInPageEmbed embed + { + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, + }, + # JWPlayer with M3U8 + { + 'url': 'http://ren.tv/novosti/2015-09-25/sluchaynyy-prohozhiy-poymal-avtougonshchika-v-murmanske-video', + 'info_dict': { + 'id': 'playlist', + 'ext': 'mp4', + 'title': 'Случайный прохожий поймал автоугонщика в Мурманске. ВИДЕО | РЕН ТВ', + 'uploader': 'ren.tv', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + } } ] @@ -1174,26 +1236,31 @@ class GenericIE(InfoExtractor): full_response = None if head_response is False: - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) request.add_header('Accept-Encoding', '*') full_response = self._request_webpage(request, video_id) head_response = full_response # Check for direct link to a video content_type = head_response.headers.get('Content-Type', '') - m = re.match(r'^(?Paudio|video|application(?=/ogg$))/(?P.+)$', content_type) + m = re.match(r'^(?Paudio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P.+)$', content_type) if m: upload_date = unified_strdate( head_response.headers.get('Last-Modified')) + formats = [] + if m.group('format_id').endswith('mpegurl'): + formats = self._extract_m3u8_formats(url, video_id, 'mp4') + else: + formats = [{ + 'format_id': m.group('format_id'), + 'url': url, + 'vcodec': 'none' if m.group('type') == 'audio' else None + }] return { 'id': video_id, 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), 'direct': True, - 'formats': [{ - 'format_id': m.group('format_id'), - 'url': url, - 'vcodec': 'none' if m.group('type') == 'audio' else None - }], + 'formats': formats, 'upload_date': upload_date, } @@ -1203,7 +1270,7 @@ class GenericIE(InfoExtractor): '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) if not full_response: - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) # making it impossible to download only chunk of the file (yet we need only 512kB to # test whether it's HTML or not). According to youtube-dl default Accept-Encoding @@ -1236,7 +1303,7 @@ class GenericIE(InfoExtractor): self.report_extraction(video_id) - # Is it an RSS feed, a SMIL file or a XSPF playlist? + # Is it an RSS feed, a SMIL file, an XSPF playlist or a MPD manifest? try: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': @@ -1245,6 +1312,13 @@ class GenericIE(InfoExtractor): return self._parse_smil(doc, url, video_id) elif doc.tag == '{http://xspf.org/ns/0/}playlist': return self.playlist_result(self._parse_xspf(doc, video_id), video_id) + elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): + return { + 'id': video_id, + 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]), + 'formats': self._parse_mpd_formats( + doc, video_id, mpd_base_url=url.rpartition('/')[0]), + } except compat_xml_parse_error: pass @@ -1290,14 +1364,14 @@ class GenericIE(InfoExtractor): return self.playlist_result( urlrs, playlist_id=video_id, playlist_title=video_title) - # Look for BrightCove: - bc_urls = BrightcoveIE._extract_brightcove_urls(webpage) + # Look for Brightcove Legacy Studio embeds + bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: self.to_screen('Brightcove video detected.') entries = [{ '_type': 'url', 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'Brightcove' + 'ie_key': 'BrightcoveLegacy' } for bc_url in bc_urls] return { @@ -1307,6 +1381,11 @@ class GenericIE(InfoExtractor): 'entries': entries, } + # Look for Brightcove New Studio embeds + bc_urls = BrightcoveNewIE._extract_urls(webpage) + if bc_urls: + return _playlist_from_matches(bc_urls, ie='BrightcoveNew') + # Look for embedded rtl.nl player matches = re.findall( r']+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', @@ -1349,7 +1428,7 @@ class GenericIE(InfoExtractor): # Look for embedded Dailymotion player matches = re.findall( - r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage) + r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) if matches: return _playlist_from_matches( matches, lambda m: unescapeHTML(m[1])) @@ -1390,11 +1469,6 @@ class GenericIE(InfoExtractor): 'id': match.group('id') } - # Look for embedded blip.tv player - bliptv_url = BlipTVIE._extract_url(webpage) - if bliptv_url: - return self.url_result(bliptv_url, 'BlipTV') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: @@ -1455,7 +1529,7 @@ class GenericIE(InfoExtractor): re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P.{32})[\'"]\)', webpage) or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P.{32})[\'"]', webpage)) if mobj is not None: - return OoyalaIE._build_url_result(mobj.group('ec')) + return OoyalaIE._build_url_result(smuggle_url(mobj.group('ec'), {'domain': url})) # Look for multiple Ooyala embeds on SBN network websites mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) @@ -1463,7 +1537,7 @@ class GenericIE(InfoExtractor): embeds = self._parse_json(mobj.group(1), video_id, fatal=False) if embeds: return _playlist_from_matches( - embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala') + embeds, getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') # Look for Aparat videos mobj = re.search(r'