3 from __future__ import unicode_literals
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
13 compat_xml_parse_error,
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
39 class GenericIE(InfoExtractor):
40 IE_DESC = 'Generic downloader that works on some sites'
45 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
46 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
48 'id': '13601338388002',
50 'uploader': 'www.hodiho.fr',
51 'title': 'R\u00e9gis plante sa Jeep',
54 # bandcamp page with custom domain
56 'add_ie': ['Bandcamp'],
57 'url': 'http://bronyrock.com/track/the-pony-mash',
61 'title': 'The Pony Mash',
62 'uploader': 'M_Pallante',
64 'skip': 'There is a limit of 200 free downloads / month for the test song',
66 # embedded brightcove video
67 # it also tests brightcove videos that need to set the 'Referer' in the
70 'add_ie': ['Brightcove'],
71 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
73 'id': '2765128793001',
75 'title': 'Le cours de bourse : l’analyse technique',
76 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
77 'uploader': 'BFM BUSINESS',
80 'skip_download': True,
84 # https://github.com/rg3/youtube-dl/issues/2253
85 'url': 'http://bcove.me/i6nfkrc3',
86 'md5': '0ba9446db037002366bab3b3eb30c88c',
88 'id': '3101154703001',
90 'title': 'Still no power',
91 'uploader': 'thestar.com',
92 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
94 'add_ie': ['Brightcove'],
97 'url': 'http://www.championat.com/video/football/v/87/87499.html',
98 'md5': 'fb973ecf6e4a78a67453647444222983',
100 'id': '3414141473001',
102 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
103 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
104 'uploader': 'Championat',
108 # https://github.com/rg3/youtube-dl/issues/3541
109 'add_ie': ['Brightcove'],
110 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
112 'id': '3866516442001',
114 'title': 'Leer mij vrouwen kennen: Aflevering 1',
115 'description': 'Leer mij vrouwen kennen: Aflevering 1',
116 'uploader': 'SBS Broadcasting',
118 'skip': 'Restricted to Netherlands',
120 'skip_download': True, # m3u8 download
123 # Direct link to a video
125 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
126 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
131 'upload_date': '20100513',
136 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
137 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
139 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
141 'title': '2cc213299525360.mov', # that's what we get
143 'add_ie': ['Ooyala'],
145 # multiple ooyala embeds on SBN network websites
147 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
150 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
152 'playlist_mincount': 3,
154 'skip_download': True,
156 'add_ie': ['Ooyala'],
160 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
164 'upload_date': '20130224',
165 'uploader_id': 'TheVerge',
166 'description': 're:^Chris Ziegler takes a look at the\.*',
167 'uploader': 'The Verge',
168 'title': 'First Firefox OS phones side-by-side',
171 'skip_download': False,
176 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
180 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
181 'upload_date': '20140225',
182 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
183 'uploader': 'Tested',
184 'uploader_id': 'testedcom',
186 # No need to test YoutubeIE here
188 'skip_download': True,
193 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
197 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
198 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
203 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
205 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
207 'playlist_mincount': 18,
211 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
215 'title': 'Охотское море стало целиком российским',
216 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
220 'skip_download': True,
225 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
226 'md5': '65fdff94098e4a607385a60c5177c638',
230 'title': 'Hidden miracles of the natural world',
231 'uploader': 'Louie Schwartzberg',
232 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
235 # Embedded Ustream video
237 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
238 'md5': '27b99cdb639c9b12a79bca876a073417',
242 'uploader': 'AU SPA: The NSA and Privacy',
243 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
246 # nowvideo embed hidden behind percent encoding
248 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
249 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
251 'id': '06e53103ca9aa',
253 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
254 'description': 'No description',
259 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
260 'md5': '7653032cbb25bf6c80d80f217055fa43',
262 'id': '048195-004_PLUS7-F',
265 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
266 'upload_date': '20140320',
269 'skip_download': 'Requires rtmpdump'
274 'url': 'http://www.wired.com/2014/04/honda-asimo/',
275 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
277 'id': '53501be369702d3275860000',
279 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
284 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
285 'md5': '441aeeb82eb72c422c7f14ec533999cd',
287 'id': 'k2mm4bCdJ6CQ2i7c8o2',
289 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
292 'add_ie': ['Dailymotion'],
296 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
300 'title': 'The NBL Auction 2014',
301 'uploader': 'BADMINTON England',
302 'uploader_id': 'BADMINTONEvents',
303 'upload_date': '20140603',
304 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
306 'add_ie': ['Youtube'],
308 'skip_download': True,
313 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
314 'md5': '35727f82f58c76d996fc188f9755b0d5',
316 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
319 'description': 'Mario\'s life in the fast lane has never looked so good.',
322 # YouTube embed via <data-embed-url="">
324 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
328 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
329 'uploader': 'Gameloft',
330 'uploader_id': 'gameloft',
331 'upload_date': '20140828',
332 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
335 'skip_download': True,
340 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
342 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
344 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
345 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
350 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
352 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
353 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
359 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
364 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
365 'md5': '9d65602bf31c6e20014319c7d07fba27',
367 'id': '5123ea6d5e5a7',
370 'uploader': 'www.handjobhub.com',
371 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
376 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
379 'title': 'Zero Punctuation',
380 'description': 're:.*groundbreaking video review series.*'
382 'playlist_mincount': 11,
384 # Multiple brightcove videos
385 # https://github.com/rg3/youtube-dl/issues/2283
387 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
389 'id': 'always-never',
390 'title': 'Always / Never - The New Yorker',
394 'extract_flat': False,
395 'skip_download': True,
400 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
401 'md5': '96f09a37e44da40dd083e12d9a683327',
405 'title': 'Ump changes call to ball',
406 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
408 'timestamp': 1401537900,
409 'upload_date': '20140531',
410 'thumbnail': 're:^https?://.*\.jpg$',
415 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
416 'md5': '8788b683c777a5cf25621eaf286d0c23',
420 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
422 'filesize': 182808282,
423 'uploader': 'education-portal.com',
427 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
428 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
432 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
434 'uploader': 'thoughtworks.wistia.com',
437 # Direct download with broken HEAD
439 'url': 'http://ai-radio.org:8000/radio.opus',
446 'skip_download': True, # infinite live stream
448 'expected_warnings': [
449 r'501.*Not Implemented'
454 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
458 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
459 'uploader': 'Sophos Security',
460 'title': 'Chet Chat 171 - Oct 29, 2014',
461 'upload_date': '20141029',
466 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
470 'upload_date': '20141112',
471 'title': 'Rosetta #CometLanding webcast HL 10',
476 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
479 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
481 'playlist_mincount': 2,
483 # Direct link with incorrect MIME type
485 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
486 'md5': '4ccbebe5f36706d85221f204d7eb5913',
488 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
489 'id': '5_Lennart_Poettering_-_Systemd',
491 'title': '5_Lennart_Poettering_-_Systemd',
492 'upload_date': '20141120',
494 'expected_warnings': [
495 'URL could be a direct video link, returning it as such.'
500 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
504 'upload_date': '20141126',
505 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
510 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
512 'id': '730m_DandD_1901_512k',
514 'uploader': 'www.abc.net.au',
515 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
518 # embedded viddler video
520 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
524 'uploader': 'deadspin',
525 'title': 'WALL-TO-GORTAT',
526 'timestamp': 1422285291,
527 'upload_date': '20150126',
529 'add_ie': ['Viddler'],
533 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
537 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
538 'description': 'md5:601cb790edd05908957dae8aaa866465',
539 'upload_date': '20150220',
544 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
548 'upload_date': '20150212',
549 'uploader': 'The National Archives UK',
550 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
551 'uploader_id': 'NationalArchives08',
552 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
557 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
558 'playlist_mincount': 5,
560 'id': 'aanslagen-kopenhagen',
561 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
566 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
570 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
575 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
579 'upload_date': '20150226',
580 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
582 'title': 'John Carlson Postgame 2/25/15',
585 # Eagle.Platform embed (generic URL)
587 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
591 'title': 'Навальный вышел на свободу',
592 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
593 'thumbnail': 're:^https?://.*\.jpg$',
599 # ClipYou (Eagle.Platform) embed (custom URL)
601 'url': 'http://muz-tv.ru/play/7129/',
605 'title': "'O Sole Mio",
606 'thumbnail': 're:^https?://.*\.jpg$',
613 'url': 'http://muz-tv.ru/kinozal/view/7400/',
617 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
618 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
619 'thumbnail': 're:^https?://.*\.jpg$',
626 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
627 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
631 'title': 'Facebook Creates "On This Day" | Crunch Report',
634 # RSS feed with enclosure
636 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
638 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
640 'upload_date': '20150228',
641 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
644 # NBC Sports vplayer embeds
646 'url': 'http://bbs.clutchfans.net/showthread.php?t=244180',
648 'id': '_hqLjQ95yx8Z',
651 'skip': 'This content expired on 9/17/14 12:23 PM',
def report_following_redirect(self, new_url):
    """Report that a redirect to ``new_url`` is being followed.

    Emits a single ``[redirect]`` status line through the downloader's
    ``to_screen`` and returns nothing.
    """
    # %-formatting is kept on purpose: the file opts into unicode_literals,
    # so it still targets interpreters without f-strings.
    self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
659 def _extract_rss(self, url, video_id, doc):
# Turn an already-parsed RSS document (`doc`) into playlist data: channel
# title/description plus one entry per <item>.
# NOTE(review): this listing omits several original lines of this method
# (the numbering jumps 662->665, 666->668, 670->680, 680->686), so the
# statements that collect the entries and build the return value are not
# visible here -- confirm against the full file before changing anything.
# The channel title is read without a None check, unlike the description
# below: a feed lacking ./channel/title would fail on `.text`.
660 playlist_title = doc.find('./channel/title').text
661 playlist_desc_el = doc.find('./channel/description')
# The description is optional; it stays None when the element is absent.
662 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
665 for it in doc.findall('./channel/item'):
# First candidate URL: the item's <link> text. fatal=False presumably makes
# xpath_text return a falsy value instead of raising -- TODO confirm.
666 next_url = xpath_text(it, 'link', fatal=False)
# Alternative source for the URL: the `url` attribute of <enclosure> children.
668 enclosure_nodes = it.findall('./enclosure')
669 for e in enclosure_nodes:
670 next_url = e.attrib.get('url')
# Per-item title comes from the item's <title> child (no None guard here).
680 'title': it.find('title').text,
# Playlist-level metadata taken from the channel fields read above.
686 'title': playlist_title,
687 'description': playlist_desc,
691 def _extract_camtasia(self, url, video_id, webpage):
692 """ Returns None if no camtasia video can be found. """
# NOTE(review): this listing omits several original lines of this method
# (numbering gaps after 697, 700, 707, 711 and 719), including the body of
# the `if` below and the code that assembles the result -- confirm against
# the full file before editing.
# Step 1: look in the page for the flash variable "csConfigFile"; the
# captured group is the configuration file's (possibly relative) path.
# default=None makes a miss non-fatal.
694 camtasia_cfg = self._search_regex(
695 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
696 webpage, 'camtasia configuration file', default=None)
697 if camtasia_cfg is None:
# The title is taken from the DC.title meta tag; fatal=True, so its absence
# is treated as an error once a Camtasia config has been detected.
700 title = self._html_search_meta('DC.title', webpage, fatal=True)
# Step 2: resolve the config path against the page URL and fetch it as XML.
# From here on `camtasia_cfg` is rebound from the path string to the
# downloaded XML document.
702 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
703 camtasia_cfg = self._download_xml(
704 camtasia_url, video_id,
705 note='Downloading camtasia configuration',
706 errnote='Failed to download camtasia configuration')
707 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
# Step 3: each child of the fileset node describes one media file.
# NOTE(review): if this is an xml.etree.ElementTree element, getchildren()
# was removed in Python 3.9; iterating the element directly is the
# documented replacement -- verify before running on newer interpreters.
710 for n in fileset_node.getchildren():
711 url_n = n.find('./uri')
# id: last path component of the <uri> text with its extension stripped.
716 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
# title: page title suffixed with the child element's tag name.
717 'title': '%s - %s' % (title, n.tag),
# The media URI is resolved against the page URL (not the config URL).
718 'url': compat_urlparse.urljoin(url, url_n.text),
# ./duration is dereferenced without a None guard.
719 'duration': float_or_none(n.find('./duration').text),
728 def _real_extract(self, url):
729 if url.startswith('//'):
732 'url': self.http_scheme() + url,
735 parsed_url = compat_urlparse.urlparse(url)
736 if not parsed_url.scheme:
737 default_search = self._downloader.params.get('default_search')
738 if default_search is None:
739 default_search = 'fixup_error'
741 if default_search in ('auto', 'auto_warning', 'fixup_error'):
743 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
744 return self.url_result('http://' + url)
745 elif default_search != 'fixup_error':
746 if default_search == 'auto_warning':
747 if re.match(r'^(?:url|URL)$', url):
748 raise ExtractorError(
749 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
752 self._downloader.report_warning(
753 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
754 return self.url_result('ytsearch:' + url)
756 if default_search in ('error', 'fixup_error'):
757 raise ExtractorError(
758 '%r is not a valid URL. '
759 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
760 % (url, url), expected=True)
762 if ':' not in default_search:
763 default_search += ':'
764 return self.url_result(default_search + url)
766 url, smuggled_data = unsmuggle_url(url)
768 is_intentional = smuggled_data and smuggled_data.get('to_generic')
769 if smuggled_data and 'force_videoid' in smuggled_data:
770 force_videoid = smuggled_data['force_videoid']
771 video_id = force_videoid
773 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
775 self.to_screen('%s: Requesting header' % video_id)
777 head_req = HEADRequest(url)
778 head_response = self._request_webpage(
780 note=False, errnote='Could not send HEAD request to %s' % url,
783 if head_response is not False:
785 new_url = head_response.geturl()
787 self.report_following_redirect(new_url)
789 new_url = smuggle_url(
790 new_url, {'force_videoid': force_videoid})
791 return self.url_result(new_url)
794 if head_response is False:
795 full_response = self._request_webpage(url, video_id)
796 head_response = full_response
798 # Check for direct link to a video
799 content_type = head_response.headers.get('Content-Type', '')
800 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
802 upload_date = unified_strdate(
803 head_response.headers.get('Last-Modified'))
806 'title': os.path.splitext(url_basename(url))[0],
809 'format_id': m.group('format_id'),
811 'vcodec': 'none' if m.group('type') == 'audio' else None
813 'upload_date': upload_date,
816 if not self._downloader.params.get('test', False) and not is_intentional:
817 self._downloader.report_warning('Falling back on generic information extractor.')
819 if not full_response:
820 full_response = self._request_webpage(url, video_id)
822 # Maybe it's a direct link to a video?
823 # Be careful not to download the whole thing!
824 first_bytes = full_response.read(512)
825 if not is_html(first_bytes):
826 self._downloader.report_warning(
827 'URL could be a direct video link, returning it as such.')
828 upload_date = unified_strdate(
829 head_response.headers.get('Last-Modified'))
832 'title': os.path.splitext(url_basename(url))[0],
835 'upload_date': upload_date,
838 webpage = self._webpage_read_content(
839 full_response, url, video_id, prefix=first_bytes)
841 self.report_extraction(video_id)
845 doc = parse_xml(webpage)
847 return self._extract_rss(url, video_id, doc)
848 except compat_xml_parse_error:
851 # Is it a Camtasia project?
852 camtasia_res = self._extract_camtasia(url, video_id, webpage)
853 if camtasia_res is not None:
856 # Sometimes embedded video player is hidden behind percent encoding
857 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
858 # Unescaping the whole page allows those cases to be handled in a generic way
859 webpage = compat_urllib_parse.unquote(webpage)
861 # it's tempting to parse this further, but you would
862 # have to take into account all the variations like
863 # Video Title - Site Name
864 # Site Name | Video Title
865 # Video Title - Tagline | Site Name
866 # and so on and so forth; it's just not practical
867 video_title = self._html_search_regex(
868 r'(?s)<title>(.*?)</title>', webpage, 'video title',
871 # Try to detect age limit automatically
872 age_limit = self._rta_search(webpage)
873 # And then there are the jokers who advertise that they use RTA,
874 # but actually don't.
875 AGE_LIMIT_MARKERS = [
876 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
878 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
881 # video uploader is domain name
882 video_uploader = self._search_regex(
883 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
886 def _playlist_from_matches(matches, getter=None, ie=None):
888 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
890 return self.playlist_result(
891 urlrs, playlist_id=video_id, playlist_title=video_title)
893 # Look for BrightCove:
894 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
896 self.to_screen('Brightcove video detected.')
899 'url': smuggle_url(bc_url, {'Referer': url}),
900 'ie_key': 'Brightcove'
901 } for bc_url in bc_urls]
905 'title': video_title,
910 # Look for embedded rtl.nl player
911 matches = re.findall(
912 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
915 return _playlist_from_matches(matches, ie='RtlNl')
917 # Look for embedded (iframe) Vimeo player
919 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
921 player_url = unescapeHTML(mobj.group('url'))
922 surl = smuggle_url(player_url, {'Referer': url})
923 return self.url_result(surl)
924 # Look for embedded (swf embed) Vimeo player
926 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
928 return self.url_result(mobj.group(1))
930 # Look for embedded YouTube player
931 matches = re.findall(r'''(?x)
940 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
944 return _playlist_from_matches(
945 matches, lambda m: unescapeHTML(m[1]))
947 # Look for lazyYT YouTube embed
948 matches = re.findall(
949 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
951 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
953 # Look for embedded Dailymotion player
954 matches = re.findall(
955 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
957 return _playlist_from_matches(
958 matches, lambda m: unescapeHTML(m[1]))
960 # Look for embedded Dailymotion playlist player (#3822)
962 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
964 playlists = re.findall(
965 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
967 return _playlist_from_matches(
968 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
970 # Look for embedded Wistia player
972 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
974 embed_url = self._proto_relative_url(
975 unescapeHTML(match.group('url')))
977 '_type': 'url_transparent',
980 'uploader': video_uploader,
981 'title': video_title,
985 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
988 '_type': 'url_transparent',
989 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
991 'uploader': video_uploader,
992 'title': video_title,
993 'id': match.group('id')
996 # Look for embedded blip.tv player
997 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
999 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1000 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1002 return self.url_result(mobj.group(1), 'BlipTV')
1004 # Look for embedded condenast player
1005 matches = re.findall(
1006 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1010 '_type': 'playlist',
1013 'ie_key': 'CondeNast',
1015 } for ma in matches],
1016 'title': video_title,
1020 # Look for Bandcamp pages with custom domain
1021 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1022 if mobj is not None:
1023 burl = unescapeHTML(mobj.group(1))
1024 # Don't set the extractor because it can be a track url or an album
1025 return self.url_result(burl)
1027 # Look for embedded Vevo player
1029 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1030 if mobj is not None:
1031 return self.url_result(mobj.group('url'))
1033 # Look for embedded Viddler player
1035 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1037 if mobj is not None:
1038 return self.url_result(mobj.group('url'))
1040 # Look for NYTimes player
1042 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1044 if mobj is not None:
1045 return self.url_result(mobj.group('url'))
1047 # Look for Libsyn player
1049 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1050 if mobj is not None:
1051 return self.url_result(mobj.group('url'))
1053 # Look for Ooyala videos
1054 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1055 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1056 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
1057 if mobj is not None:
1058 return OoyalaIE._build_url_result(mobj.group('ec'))
1060 # Look for multiple Ooyala embeds on SBN network websites
1061 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1062 if mobj is not None:
1063 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1065 return _playlist_from_matches(
1066 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1068 # Look for Aparat videos
1069 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1070 if mobj is not None:
1071 return self.url_result(mobj.group(1), 'Aparat')
1073 # Look for MPORA videos
1074 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1075 if mobj is not None:
1076 return self.url_result(mobj.group(1), 'Mpora')
1078 # Look for embedded NovaMov-based player
1080 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1081 (?P<url>http://(?:(?:embed|www)\.)?
1083 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1084 videoweed\.(?:es|com)|
1085 movshare\.(?:net|sx|ag)|
1086 divxstage\.(?:eu|net|ch|co|at|ag))
1087 /embed\.php.+?)\1''', webpage)
1088 if mobj is not None:
1089 return self.url_result(mobj.group('url'))
1091 # Look for embedded Facebook player
1093 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1094 if mobj is not None:
1095 return self.url_result(mobj.group('url'), 'Facebook')
1097 # Look for embedded VK player
1098 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1099 if mobj is not None:
1100 return self.url_result(mobj.group('url'), 'VK')
1102 # Look for embedded ivi player
1103 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1104 if mobj is not None:
1105 return self.url_result(mobj.group('url'), 'Ivi')
1107 # Look for embedded Huffington Post player
1109 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1110 if mobj is not None:
1111 return self.url_result(mobj.group('url'), 'HuffPost')
1114 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1115 if mobj is not None:
1116 return self.url_result(mobj.group('url'))
1117 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1118 if mobj is not None:
1119 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1121 # Look for funnyordie embed
1122 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1124 return _playlist_from_matches(
1125 matches, getter=unescapeHTML, ie='FunnyOrDie')
1127 # Look for BBC iPlayer embed
1128 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1130 return _playlist_from_matches(matches, ie='BBCCoUk')
1132 # Look for embedded RUTV player
1133 rutv_url = RUTVIE._extract_url(webpage)
1135 return self.url_result(rutv_url, 'RUTV')
1137 # Look for embedded TED player
1139 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1140 if mobj is not None:
1141 return self.url_result(mobj.group('url'), 'TED')
1143 # Look for embedded Ustream videos
1145 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1146 if mobj is not None:
1147 return self.url_result(mobj.group('url'), 'Ustream')
1149 # Look for embedded arte.tv player
1151 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1153 if mobj is not None:
1154 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1156 # Look for embedded smotri.com player
1157 smotri_url = SmotriIE._extract_url(webpage)
1159 return self.url_result(smotri_url, 'Smotri')
1161 # Look for embedded soundcloud player
1163 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1165 if mobj is not None:
1166 url = unescapeHTML(mobj.group('url'))
1167 return self.url_result(url)
1169 # Look for embedded vulture.com player
1171 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1173 if mobj is not None:
1174 url = unescapeHTML(mobj.group('url'))
1175 return self.url_result(url, ie='Vulture')
1177 # Look for embedded mtvservices player
1179 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1181 if mobj is not None:
1182 url = unescapeHTML(mobj.group('url'))
1183 return self.url_result(url, ie='MTVServicesEmbedded')
1185 # Look for embedded yahoo player
1187 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1189 if mobj is not None:
1190 return self.url_result(mobj.group('url'), 'Yahoo')
1192 # Look for embedded sbs.com.au player
1196 <meta\s+property="og:video"\s+content=|
1199 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1201 if mobj is not None:
1202 return self.url_result(mobj.group('url'), 'SBS')
1204 # Look for embedded Cinchcast player
1206 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1208 if mobj is not None:
1209 return self.url_result(mobj.group('url'), 'Cinchcast')
1212 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1214 if mobj is not None:
1215 return self.url_result(mobj.group('url'), 'MLB')
1218 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1220 if mobj is not None:
1221 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1224 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1226 if mobj is not None:
1227 return self.url_result(mobj.group('url'), 'Livestream')
1229 # Look for Zapiks embed
1231 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1232 if mobj is not None:
1233 return self.url_result(mobj.group('url'), 'Zapiks')
1235 # Look for Kaltura embeds
1237 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1238 if mobj is not None:
1239 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1241 # Look for Eagle.Platform embeds
1243 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1244 if mobj is not None:
1245 return self.url_result(mobj.group('url'), 'EaglePlatform')
1247 # Look for ClipYou (uses Eagle.Platform) embeds
1249 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1250 if mobj is not None:
1251 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1253 # Look for Pladform embeds
1255 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1256 if mobj is not None:
1257 return self.url_result(mobj.group('url'), 'Pladform')
1259 # Look for 5min embeds
1261 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1262 if mobj is not None:
1263 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1265 # Look for NBC Sports VPlayer embeds
1266 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1268 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1270 def check_video(vurl):
1271 if YoutubeIE.suitable(vurl):
1273 vpath = compat_urlparse.urlparse(vurl).path
1274 vext = determine_ext(vpath)
1275 return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1277 def filter_video(urls):
1278 return list(filter(check_video, urls))
1280 # Start with something easy: JW Player in SWFObject
1281 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1283 # Look for gorilla-vid style embedding
1284 found = filter_video(re.findall(r'''(?sx)
1288 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1291 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1293 # Broaden the search a little bit
1294 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1296 # Broaden the findall a little bit: JWPlayer JS loader
1297 found = filter_video(re.findall(
1298 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1301 found = filter_video(re.findall(r'''(?xs)
1302 flowplayer\("[^"]+",\s*
1304 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1305 ["']?url["']?\s*:\s*["']([^"']+)["']
1310 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1312 # Try to find twitter cards info
1313 found = filter_video(re.findall(
1314 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1316 # We look for Open Graph info:
1317 # We have to match any number of spaces between elements, some sites try to align them (eg.: statigr.am)
1318 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1319 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1320 if m_video_type is not None:
1321 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1324 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1326 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1328 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1329 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1332 # Look also in Refresh HTTP header
1333 refresh_header = head_response.headers.get('Refresh')
1335 found = re.search(REDIRECT_REGEX, refresh_header)
1337 new_url = found.group(1)
1338 self.report_following_redirect(new_url)
1344 raise UnsupportedError(url)
1347 for video_url in found:
1348 video_url = compat_urlparse.urljoin(url, video_url)
1349 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1351 # Sometimes, jwplayer extraction will result in a YouTube URL
1352 if YoutubeIE.suitable(video_url):
1353 entries.append(self.url_result(video_url, 'Youtube'))
1356 # here's a fun little line of code for you:
1357 video_id = os.path.splitext(video_id)[0]
1362 'uploader': video_uploader,
1363 'title': video_title,
1364 'age_limit': age_limit,
1367 if len(entries) == 1:
1370 for num, e in enumerate(entries, start=1):
1371 # 'url' results don't have a title
1372 if e.get('title') is not None:
1373 e['title'] = '%s (%d)' % (e['title'], num)
1375 '_type': 'playlist',