3 from __future__ import unicode_literals
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
13 compat_xml_parse_error,
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
39 class GenericIE(InfoExtractor):
40 IE_DESC = 'Generic downloader that works on some sites'
45 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
46 'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
48 'id': '13601338388002',
50 'uploader': 'www.hodiho.fr',
51 'title': 'R\u00e9gis plante sa Jeep',
54 # bandcamp page with custom domain
56 'add_ie': ['Bandcamp'],
57 'url': 'http://bronyrock.com/track/the-pony-mash',
61 'title': 'The Pony Mash',
62 'uploader': 'M_Pallante',
64 'skip': 'There is a limit of 200 free downloads / month for the test song',
66 # embedded brightcove video
67 # it also tests brightcove videos that need to set the 'Referer' in the
70 'add_ie': ['Brightcove'],
71 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
73 'id': '2765128793001',
75 'title': 'Le cours de bourse : l’analyse technique',
76 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
77 'uploader': 'BFM BUSINESS',
80 'skip_download': True,
84 # https://github.com/rg3/youtube-dl/issues/2253
85 'url': 'http://bcove.me/i6nfkrc3',
86 'md5': '0ba9446db037002366bab3b3eb30c88c',
88 'id': '3101154703001',
90 'title': 'Still no power',
91 'uploader': 'thestar.com',
92 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
94 'add_ie': ['Brightcove'],
97 'url': 'http://www.championat.com/video/football/v/87/87499.html',
98 'md5': 'fb973ecf6e4a78a67453647444222983',
100 'id': '3414141473001',
102 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
103 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
104 'uploader': 'Championat',
108 # https://github.com/rg3/youtube-dl/issues/3541
109 'add_ie': ['Brightcove'],
110 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
112 'id': '3866516442001',
114 'title': 'Leer mij vrouwen kennen: Aflevering 1',
115 'description': 'Leer mij vrouwen kennen: Aflevering 1',
116 'uploader': 'SBS Broadcasting',
118 'skip': 'Restricted to Netherlands',
120 'skip_download': True, # m3u8 download
123 # Direct link to a video
125 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
126 'md5': '67d406c2bcb6af27fa886f31aa934bbe',
131 'upload_date': '20100513',
136 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
137 'md5': '166dd577b433b4d4ebfee10b0824d8ff',
139 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
141 'title': '2cc213299525360.mov', # that's what we get
143 'add_ie': ['Ooyala'],
145 # multiple ooyala embeds on SBN network websites
147 'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
150 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
152 'playlist_mincount': 3,
154 'skip_download': True,
156 'add_ie': ['Ooyala'],
160 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
164 'upload_date': '20130224',
165 'uploader_id': 'TheVerge',
166 'description': 're:^Chris Ziegler takes a look at the\.*',
167 'uploader': 'The Verge',
168 'title': 'First Firefox OS phones side-by-side',
171 'skip_download': False,
176 'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
180 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
181 'upload_date': '20140225',
182 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
183 'uploader': 'Tested',
184 'uploader_id': 'testedcom',
186 # No need to test YoutubeIE here
188 'skip_download': True,
193 'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
197 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
198 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
203 'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
205 'title': 'BBC - Blogs - Adam Curtis - BUGGER',
207 'playlist_mincount': 18,
211 'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
215 'title': 'Охотское море стало целиком российским',
216 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
220 'skip_download': True,
225 'url': 'http://en.support.wordpress.com/videos/ted-talks/',
226 'md5': '65fdff94098e4a607385a60c5177c638',
230 'title': 'Hidden miracles of the natural world',
231 'uploader': 'Louie Schwartzberg',
232 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
235 # Embedded Ustream video
237 'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
238 'md5': '27b99cdb639c9b12a79bca876a073417',
242 'uploader': 'AU SPA: The NSA and Privacy',
243 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
246 # nowvideo embed hidden behind percent encoding
248 'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
249 'md5': '2baf4ddd70f697d94b1c18cf796d5107',
251 'id': '06e53103ca9aa',
253 'title': 'Macross Episode 001 Watch Macross Episode 001 onl',
254 'description': 'No description',
259 'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
260 'md5': '7653032cbb25bf6c80d80f217055fa43',
262 'id': '048195-004_PLUS7-F',
265 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
266 'upload_date': '20140320',
269 'skip_download': 'Requires rtmpdump'
274 'url': 'http://www.wired.com/2014/04/honda-asimo/',
275 'md5': 'ba0dfe966fa007657bd1443ee672db0f',
277 'id': '53501be369702d3275860000',
279 'title': 'Honda’s New Asimo Robot Is More Human Than Ever',
284 'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
285 'md5': '441aeeb82eb72c422c7f14ec533999cd',
287 'id': 'k2mm4bCdJ6CQ2i7c8o2',
289 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
292 'add_ie': ['Dailymotion'],
296 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
300 'title': 'The NBL Auction 2014',
301 'uploader': 'BADMINTON England',
302 'uploader_id': 'BADMINTONEvents',
303 'upload_date': '20140603',
304 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
306 'add_ie': ['Youtube'],
308 'skip_download': True,
313 'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
314 'md5': '35727f82f58c76d996fc188f9755b0d5',
316 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
319 'description': 'Mario\'s life in the fast lane has never looked so good.',
322 # YouTube embed via <data-embed-url="">
324 'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
328 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
329 'uploader': 'Gameloft',
330 'uploader_id': 'gameloft',
331 'upload_date': '20140828',
332 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
335 'skip_download': True,
340 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
342 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
344 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
345 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
350 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
352 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
353 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
359 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
364 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
365 'md5': '9d65602bf31c6e20014319c7d07fba27',
367 'id': '5123ea6d5e5a7',
370 'uploader': 'www.handjobhub.com',
371 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
376 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
379 'title': 'Zero Punctuation',
380 'description': 're:.*groundbreaking video review series.*'
382 'playlist_mincount': 11,
384 # Multiple brightcove videos
385 # https://github.com/rg3/youtube-dl/issues/2283
387 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
389 'id': 'always-never',
390 'title': 'Always / Never - The New Yorker',
394 'extract_flat': False,
395 'skip_download': True,
400 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
401 'md5': '96f09a37e44da40dd083e12d9a683327',
405 'title': 'Ump changes call to ball',
406 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
408 'timestamp': 1401537900,
409 'upload_date': '20140531',
410 'thumbnail': 're:^https?://.*\.jpg$',
415 'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
416 'md5': '8788b683c777a5cf25621eaf286d0c23',
420 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
422 'filesize': 182808282,
423 'uploader': 'education-portal.com',
427 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
428 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
432 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
434 'uploader': 'thoughtworks.wistia.com',
437 # Direct download with broken HEAD
439 'url': 'http://ai-radio.org:8000/radio.opus',
446 'skip_download': True, # infinite live stream
448 'expected_warnings': [
449 r'501.*Not Implemented'
454 'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
458 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
459 'uploader': 'Sophos Security',
460 'title': 'Chet Chat 171 - Oct 29, 2014',
461 'upload_date': '20141029',
466 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
470 'upload_date': '20141112',
471 'title': 'Rosetta #CometLanding webcast HL 10',
476 'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
479 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
481 'playlist_mincount': 2,
483 # Direct link with incorrect MIME type
485 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
486 'md5': '4ccbebe5f36706d85221f204d7eb5913',
488 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
489 'id': '5_Lennart_Poettering_-_Systemd',
491 'title': '5_Lennart_Poettering_-_Systemd',
492 'upload_date': '20141120',
494 'expected_warnings': [
495 'URL could be a direct video link, returning it as such.'
500 'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
504 'upload_date': '20141126',
505 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
510 'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
512 'id': '730m_DandD_1901_512k',
514 'uploader': 'www.abc.net.au',
515 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
518 # embedded viddler video
520 'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
524 'uploader': 'deadspin',
525 'title': 'WALL-TO-GORTAT',
526 'timestamp': 1422285291,
527 'upload_date': '20150126',
529 'add_ie': ['Viddler'],
533 'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
537 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
538 'description': 'md5:601cb790edd05908957dae8aaa866465',
539 'upload_date': '20150220',
544 'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
548 'upload_date': '20150212',
549 'uploader': 'The National Archives UK',
550 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
551 'uploader_id': 'NationalArchives08',
552 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
557 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
558 'playlist_mincount': 5,
560 'id': 'aanslagen-kopenhagen',
561 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
566 'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
570 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
575 'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
579 'upload_date': '20150226',
580 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
582 'title': 'John Carlson Postgame 2/25/15',
585 # Eagle.Platform embed (generic URL)
587 'url': 'http://lenta.ru/news/2015/03/06/navalny/',
591 'title': 'Навальный вышел на свободу',
592 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
593 'thumbnail': 're:^https?://.*\.jpg$',
599 # ClipYou (Eagle.Platform) embed (custom URL)
601 'url': 'http://muz-tv.ru/play/7129/',
605 'title': "'O Sole Mio",
606 'thumbnail': 're:^https?://.*\.jpg$',
613 'url': 'http://muz-tv.ru/kinozal/view/7400/',
617 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
618 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
619 'thumbnail': 're:^https?://.*\.jpg$',
626 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
627 'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
631 'title': 'Facebook Creates "On This Day" | Crunch Report',
634 # RSS feed with enclosure
636 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
638 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
640 'upload_date': '20150228',
641 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
644 # NBC Sports vplayer embed
646 'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
648 'id': 'ln7x1qSThw4k',
650 'title': "PFT Live: New leader in the 'new-look' defense",
651 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
def report_following_redirect(self, new_url):
    """Print a '[redirect]' notice naming the URL that is being followed."""
    notice = '[redirect] Following redirect to %s' % new_url
    self._downloader.to_screen(notice)
# Read playlist-level data from a parsed RSS document: the channel title and
# the (optional) channel description, then walk every ./channel/item looking
# for a URL to follow.
# NOTE(review): parts of this method are not visible in this view (the code
# that builds each entry and the return statement), so the exact shape of
# the result is not documented here -- confirm against the full file.
660 def _extract_rss(self, url, video_id, doc):
# The channel title is dereferenced without a None check, unlike the
# description just below. NOTE(review): a feed lacking ./channel/title would
# presumably fail on `.text` here -- confirm whether that is intended.
661 playlist_title = doc.find('./channel/title').text
# The description element is treated as optional: a missing element gives None.
662 playlist_desc_el = doc.find('./channel/description')
663 playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
666 for it in doc.findall('./channel/item'):
# Start from the item's 'link' child; fatal=False is passed, presumably so a
# missing link does not abort extraction -- confirm against xpath_text.
667 next_url = xpath_text(it, 'link', fatal=False)
# The 'url' attribute of an <enclosure> child replaces the link URL.
# NOTE(review): lines following the assignment are not visible here, so
# whether the loop stops at the first usable enclosure cannot be told.
669 enclosure_nodes = it.findall('./enclosure')
670 for e in enclosure_nodes:
671 next_url = e.attrib.get('url')
# Entry title comes from the item's <title>; `.text` is read without a None check.
681 'title': it.find('title').text,
# Playlist-level fields taken from the channel values read at the top.
687 'title': playlist_title,
688 'description': playlist_desc,
# Detect a Camtasia project embedded in `webpage`: find the configuration file
# referenced by the page, download it as XML (resolved relative to `url`), and
# read one entry per child of its ./playlist/array/fileset node.
# NOTE(review): some lines of this method are not visible in this view
# (including the early return and the final return), so the exact result
# shape is not documented here -- confirm against the full file.
692 def _extract_camtasia(self, url, video_id, webpage):
693 """ Returns None if no camtasia video can be found. """
# Capture the value given for "csConfigFile" in a fo.addVariable(...) call.
# default=None is passed and the result is compared against None right after.
695 camtasia_cfg = self._search_regex(
696 r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
697 webpage, 'camtasia configuration file', default=None)
698 if camtasia_cfg is None:
# The title is taken from the DC.title meta tag; fatal=True is passed,
# presumably making a missing tag an error -- confirm against _html_search_meta.
701 title = self._html_search_meta('DC.title', webpage, fatal=True)
# The configuration path may be relative, so it is joined onto the page URL.
703 camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
# camtasia_cfg is rebound here: from the path string to the downloaded XML document.
704 camtasia_cfg = self._download_xml(
705 camtasia_url, video_id,
706 note='Downloading camtasia configuration',
707 errnote='Failed to download camtasia configuration')
# NOTE(review): find() returns None when the path does not match; no None
# check is visible before the loop below -- confirm against the hidden lines.
708 fileset_node = camtasia_cfg.find('./playlist/array/fileset')
# NOTE(review): Element.getchildren() was removed in Python 3.9; iterating
# the element directly (or list(fileset_node)) is the documented replacement.
711 for n in fileset_node.getchildren():
712 url_n = n.find('./uri')
# id: last path segment of the <uri> text with its extension stripped.
717 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
# title: page title combined with the child element's tag name.
718 'title': '%s - %s' % (title, n.tag),
# url: the <uri> text resolved relative to the page URL.
719 'url': compat_urlparse.urljoin(url, url_n.text),
# duration: <duration> text passed through float_or_none; `.text` is read
# without a None check on the find() result.
720 'duration': float_or_none(n.find('./duration').text),
729 def _real_extract(self, url):
730 if url.startswith('//'):
733 'url': self.http_scheme() + url,
736 parsed_url = compat_urlparse.urlparse(url)
737 if not parsed_url.scheme:
738 default_search = self._downloader.params.get('default_search')
739 if default_search is None:
740 default_search = 'fixup_error'
742 if default_search in ('auto', 'auto_warning', 'fixup_error'):
744 self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
745 return self.url_result('http://' + url)
746 elif default_search != 'fixup_error':
747 if default_search == 'auto_warning':
748 if re.match(r'^(?:url|URL)$', url):
749 raise ExtractorError(
750 'Invalid URL: %r . Call youtube-dl like this: youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc" ' % url,
753 self._downloader.report_warning(
754 'Falling back to youtube search for %s . Set --default-search "auto" to suppress this warning.' % url)
755 return self.url_result('ytsearch:' + url)
757 if default_search in ('error', 'fixup_error'):
758 raise ExtractorError(
759 '%r is not a valid URL. '
760 'Set --default-search "ytsearch" (or run youtube-dl "ytsearch:%s" ) to search YouTube'
761 % (url, url), expected=True)
763 if ':' not in default_search:
764 default_search += ':'
765 return self.url_result(default_search + url)
767 url, smuggled_data = unsmuggle_url(url)
769 is_intentional = smuggled_data and smuggled_data.get('to_generic')
770 if smuggled_data and 'force_videoid' in smuggled_data:
771 force_videoid = smuggled_data['force_videoid']
772 video_id = force_videoid
774 video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
776 self.to_screen('%s: Requesting header' % video_id)
778 head_req = HEADRequest(url)
779 head_response = self._request_webpage(
781 note=False, errnote='Could not send HEAD request to %s' % url,
784 if head_response is not False:
786 new_url = head_response.geturl()
788 self.report_following_redirect(new_url)
790 new_url = smuggle_url(
791 new_url, {'force_videoid': force_videoid})
792 return self.url_result(new_url)
795 if head_response is False:
796 full_response = self._request_webpage(url, video_id)
797 head_response = full_response
799 # Check for direct link to a video
800 content_type = head_response.headers.get('Content-Type', '')
801 m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
803 upload_date = unified_strdate(
804 head_response.headers.get('Last-Modified'))
807 'title': os.path.splitext(url_basename(url))[0],
810 'format_id': m.group('format_id'),
812 'vcodec': 'none' if m.group('type') == 'audio' else None
814 'upload_date': upload_date,
817 if not self._downloader.params.get('test', False) and not is_intentional:
818 self._downloader.report_warning('Falling back on generic information extractor.')
820 if not full_response:
821 full_response = self._request_webpage(url, video_id)
823 # Maybe it's a direct link to a video?
824 # Be careful not to download the whole thing!
825 first_bytes = full_response.read(512)
826 if not is_html(first_bytes):
827 self._downloader.report_warning(
828 'URL could be a direct video link, returning it as such.')
829 upload_date = unified_strdate(
830 head_response.headers.get('Last-Modified'))
833 'title': os.path.splitext(url_basename(url))[0],
836 'upload_date': upload_date,
839 webpage = self._webpage_read_content(
840 full_response, url, video_id, prefix=first_bytes)
842 self.report_extraction(video_id)
846 doc = parse_xml(webpage)
848 return self._extract_rss(url, video_id, doc)
849 except compat_xml_parse_error:
852 # Is it a Camtasia project?
853 camtasia_res = self._extract_camtasia(url, video_id, webpage)
854 if camtasia_res is not None:
857 # Sometimes embedded video player is hidden behind percent encoding
858 # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
859 # Unescaping the whole page allows handling those cases in a generic way
860 webpage = compat_urllib_parse.unquote(webpage)
862 # it's tempting to parse this further, but you would
863 # have to take into account all the variations like
864 # Video Title - Site Name
865 # Site Name | Video Title
866 # Video Title - Tagline | Site Name
867 # and so on and so forth; it's just not practical
868 video_title = self._html_search_regex(
869 r'(?s)<title>(.*?)</title>', webpage, 'video title',
872 # Try to detect age limit automatically
873 age_limit = self._rta_search(webpage)
874 # And then there are the jokers who advertise that they use RTA,
875 # but actually don't.
876 AGE_LIMIT_MARKERS = [
877 r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
879 if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
882 # video uploader is domain name
883 video_uploader = self._search_regex(
884 r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
887 def _playlist_from_matches(matches, getter=None, ie=None):
889 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
891 return self.playlist_result(
892 urlrs, playlist_id=video_id, playlist_title=video_title)
894 # Look for BrightCove:
895 bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
897 self.to_screen('Brightcove video detected.')
900 'url': smuggle_url(bc_url, {'Referer': url}),
901 'ie_key': 'Brightcove'
902 } for bc_url in bc_urls]
906 'title': video_title,
911 # Look for embedded rtl.nl player
912 matches = re.findall(
913 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
916 return _playlist_from_matches(matches, ie='RtlNl')
918 # Look for embedded (iframe) Vimeo player
920 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
922 player_url = unescapeHTML(mobj.group('url'))
923 surl = smuggle_url(player_url, {'Referer': url})
924 return self.url_result(surl)
925 # Look for embedded (swf embed) Vimeo player
927 r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
929 return self.url_result(mobj.group(1))
931 # Look for embedded YouTube player
932 matches = re.findall(r'''(?x)
941 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
945 return _playlist_from_matches(
946 matches, lambda m: unescapeHTML(m[1]))
948 # Look for lazyYT YouTube embed
949 matches = re.findall(
950 r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
952 return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
954 # Look for embedded Dailymotion player
955 matches = re.findall(
956 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
958 return _playlist_from_matches(
959 matches, lambda m: unescapeHTML(m[1]))
961 # Look for embedded Dailymotion playlist player (#3822)
963 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
965 playlists = re.findall(
966 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
968 return _playlist_from_matches(
969 playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
971 # Look for embedded Wistia player
973 r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
975 embed_url = self._proto_relative_url(
976 unescapeHTML(match.group('url')))
978 '_type': 'url_transparent',
981 'uploader': video_uploader,
982 'title': video_title,
986 match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
989 '_type': 'url_transparent',
990 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
992 'uploader': video_uploader,
993 'title': video_title,
994 'id': match.group('id')
997 # Look for embedded blip.tv player
998 mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
1000 return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1001 mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1003 return self.url_result(mobj.group(1), 'BlipTV')
1005 # Look for embedded condenast player
1006 matches = re.findall(
1007 r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1011 '_type': 'playlist',
1014 'ie_key': 'CondeNast',
1016 } for ma in matches],
1017 'title': video_title,
1021 # Look for Bandcamp pages with custom domain
1022 mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1023 if mobj is not None:
1024 burl = unescapeHTML(mobj.group(1))
1025 # Don't set the extractor because it can be a track url or an album
1026 return self.url_result(burl)
1028 # Look for embedded Vevo player
1030 r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1031 if mobj is not None:
1032 return self.url_result(mobj.group('url'))
1034 # Look for embedded Viddler player
1036 r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1038 if mobj is not None:
1039 return self.url_result(mobj.group('url'))
1041 # Look for NYTimes player
1043 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1045 if mobj is not None:
1046 return self.url_result(mobj.group('url'))
1048 # Look for Libsyn player
1050 r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1051 if mobj is not None:
1052 return self.url_result(mobj.group('url'))
1054 # Look for Ooyala videos
1055 mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1056 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1057 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
1058 if mobj is not None:
1059 return OoyalaIE._build_url_result(mobj.group('ec'))
1061 # Look for multiple Ooyala embeds on SBN network websites
1062 mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1063 if mobj is not None:
1064 embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1066 return _playlist_from_matches(
1067 embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1069 # Look for Aparat videos
1070 mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1071 if mobj is not None:
1072 return self.url_result(mobj.group(1), 'Aparat')
1074 # Look for MPORA videos
1075 mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1076 if mobj is not None:
1077 return self.url_result(mobj.group(1), 'Mpora')
1079 # Look for embedded NovaMov-based player
1081 r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1082 (?P<url>http://(?:(?:embed|www)\.)?
1084 nowvideo\.(?:ch|sx|eu|at|ag|co)|
1085 videoweed\.(?:es|com)|
1086 movshare\.(?:net|sx|ag)|
1087 divxstage\.(?:eu|net|ch|co|at|ag))
1088 /embed\.php.+?)\1''', webpage)
1089 if mobj is not None:
1090 return self.url_result(mobj.group('url'))
1092 # Look for embedded Facebook player
1094 r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1095 if mobj is not None:
1096 return self.url_result(mobj.group('url'), 'Facebook')
1098 # Look for embedded VK player
1099 mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1100 if mobj is not None:
1101 return self.url_result(mobj.group('url'), 'VK')
1103 # Look for embedded ivi player
1104 mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1105 if mobj is not None:
1106 return self.url_result(mobj.group('url'), 'Ivi')
1108 # Look for embedded Huffington Post player
1110 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1111 if mobj is not None:
1112 return self.url_result(mobj.group('url'), 'HuffPost')
1115 mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1116 if mobj is not None:
1117 return self.url_result(mobj.group('url'))
1118 mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1119 if mobj is not None:
1120 return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1122 # Look for funnyordie embed
1123 matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1125 return _playlist_from_matches(
1126 matches, getter=unescapeHTML, ie='FunnyOrDie')
1128 # Look for BBC iPlayer embed
1129 matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1131 return _playlist_from_matches(matches, ie='BBCCoUk')
1133 # Look for embedded RUTV player
1134 rutv_url = RUTVIE._extract_url(webpage)
1136 return self.url_result(rutv_url, 'RUTV')
1138 # Look for embedded TED player
1140 r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1141 if mobj is not None:
1142 return self.url_result(mobj.group('url'), 'TED')
1144 # Look for embedded Ustream videos
1146 r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1147 if mobj is not None:
1148 return self.url_result(mobj.group('url'), 'Ustream')
1150 # Look for embedded arte.tv player
1152 r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1154 if mobj is not None:
1155 return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1157 # Look for embedded smotri.com player
1158 smotri_url = SmotriIE._extract_url(webpage)
1160 return self.url_result(smotri_url, 'Smotri')
1162 # Look for embedded soundcloud player
1164 r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1166 if mobj is not None:
1167 url = unescapeHTML(mobj.group('url'))
1168 return self.url_result(url)
1170 # Look for embedded vulture.com player
1172 r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1174 if mobj is not None:
1175 url = unescapeHTML(mobj.group('url'))
1176 return self.url_result(url, ie='Vulture')
1178 # Look for embedded mtvservices player
1180 r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1182 if mobj is not None:
1183 url = unescapeHTML(mobj.group('url'))
1184 return self.url_result(url, ie='MTVServicesEmbedded')
1186 # Look for embedded yahoo player
1188 r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1190 if mobj is not None:
1191 return self.url_result(mobj.group('url'), 'Yahoo')
1193 # Look for embedded sbs.com.au player
1197 <meta\s+property="og:video"\s+content=|
1200 (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1202 if mobj is not None:
1203 return self.url_result(mobj.group('url'), 'SBS')
1205 # Look for embedded Cinchcast player
1207 r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1209 if mobj is not None:
1210 return self.url_result(mobj.group('url'), 'Cinchcast')
1213 r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1215 if mobj is not None:
1216 return self.url_result(mobj.group('url'), 'MLB')
1219 r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1221 if mobj is not None:
1222 return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1225 r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1227 if mobj is not None:
1228 return self.url_result(mobj.group('url'), 'Livestream')
1230 # Look for Zapiks embed
1232 r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1233 if mobj is not None:
1234 return self.url_result(mobj.group('url'), 'Zapiks')
1236 # Look for Kaltura embeds
1238 r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1239 if mobj is not None:
1240 return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1242 # Look for Eagle.Platform embeds
1244 r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1245 if mobj is not None:
1246 return self.url_result(mobj.group('url'), 'EaglePlatform')
1248 # Look for ClipYou (uses Eagle.Platform) embeds
1250 r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1251 if mobj is not None:
1252 return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1254 # Look for Pladform embeds
1256 r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1257 if mobj is not None:
1258 return self.url_result(mobj.group('url'), 'Pladform')
1260 # Look for 5min embeds
1262 r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1263 if mobj is not None:
1264 return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1266 # Look for NBC Sports VPlayer embeds
1267 nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1269 return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
def check_video(vurl):
    """Tell whether a candidate URL should be kept as a possible video.

    :param vurl: candidate URL string scraped from the page.
    :returns: True for any URL the YouTube extractor claims; otherwise
        True only when the URL path contains a dot and its extension is
        not one of the excluded (Flash, image, subtitle) extensions.
    """
    # Accept YouTube URLs outright: the entry-building loop later routes
    # them to the 'Youtube' extractor instead of treating them as files.
    if YoutubeIE.suitable(vurl):
        return True
    # Only the path component is inspected, so query strings and
    # fragments do not influence the extension test.
    vpath = compat_urlparse.urlparse(vurl).path
    vext = determine_ext(vpath)
    return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
def filter_video(urls):
    """Return a list of the entries of *urls* that check_video accepts."""
    return [candidate for candidate in urls if check_video(candidate)]
1281 # Start with something easy: JW Player in SWFObject
1282 found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1284 # Look for gorilla-vid style embedding
1285 found = filter_video(re.findall(r'''(?sx)
1289 jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1292 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1294 # Broaden the search a little bit
1295 found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1297 # Broaden the findall a little bit: JWPlayer JS loader
1298 found = filter_video(re.findall(
1299 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1302 found = filter_video(re.findall(r'''(?xs)
1303 flowplayer\("[^"]+",\s*
1305 \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1306 ["']?url["']?\s*:\s*["']([^"']+)["']
1311 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1313 # Try to find twitter cards info
1314 found = filter_video(re.findall(
1315 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1317 # We look for Open Graph info:
1318 # We have to match any number of spaces between elements, some sites try to align them (e.g. statigr.am)
1319 m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1320 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1321 if m_video_type is not None:
1322 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1325 found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1327 REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1329 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1330 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1333 # Look also in Refresh HTTP header
1334 refresh_header = head_response.headers.get('Refresh')
1336 found = re.search(REDIRECT_REGEX, refresh_header)
1338 new_url = found.group(1)
1339 self.report_following_redirect(new_url)
1345 raise UnsupportedError(url)
1348 for video_url in found:
1349 video_url = compat_urlparse.urljoin(url, video_url)
1350 video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1352 # Sometimes, jwplayer extraction will result in a YouTube URL
1353 if YoutubeIE.suitable(video_url):
1354 entries.append(self.url_result(video_url, 'Youtube'))
1357 # here's a fun little line of code for you:
1358 video_id = os.path.splitext(video_id)[0]
1363 'uploader': video_uploader,
1364 'title': video_title,
1365 'age_limit': age_limit,
1368 if len(entries) == 1:
1371 for num, e in enumerate(entries, start=1):
1372 # 'url' results don't have a title
1373 if e.get('title') is not None:
1374 e['title'] = '%s (%d)' % (e['title'], num)
1376 '_type': 'playlist',