Merge branch 'snagfilms' of https://github.com/remitamine/youtube-dl into remitamine...
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urllib_parse_unquote,
13     compat_urllib_request,
14     compat_urlparse,
15     compat_xml_parse_error,
16 )
17 from ..utils import (
18     determine_ext,
19     ExtractorError,
20     float_or_none,
21     HEADRequest,
22     is_html,
23     orderedSet,
24     parse_xml,
25     smuggle_url,
26     unescapeHTML,
27     unified_strdate,
28     unsmuggle_url,
29     UnsupportedError,
30     url_basename,
31     xpath_text,
32 )
33 from .brightcove import BrightcoveIE
34 from .nbc import NBCSportsVPlayerIE
35 from .ooyala import OoyalaIE
36 from .rutv import RUTVIE
37 from .tvc import TVCIE
38 from .sportbox import SportBoxEmbedIE
39 from .smotri import SmotriIE
40 from .condenast import CondeNastIE
41 from .udn import UDNEmbedIE
42 from .senateisvp import SenateISVPIE
43 from .bliptv import BlipTVIE
44 from .svt import SVTIE
45 from .pornhub import PornHubIE
46 from .xhamster import XHamsterEmbedIE
47 from .vimeo import VimeoIE
48 from .dailymotion import DailymotionCloudIE
49 from .onionstudios import OnionStudiosIE
50
51
52 class GenericIE(InfoExtractor):
53     IE_DESC = 'Generic downloader that works on some sites'
54     _VALID_URL = r'.*'
55     IE_NAME = 'generic'
56     _TESTS = [
57         # Direct link to a video
58         {
59             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
60             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
61             'info_dict': {
62                 'id': 'trailer',
63                 'ext': 'mp4',
64                 'title': 'trailer',
65                 'upload_date': '20100513',
66             }
67         },
68         # Direct link to media delivered compressed (until Accept-Encoding is *)
69         {
70             'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
71             'md5': '128c42e68b13950268b648275386fc74',
72             'info_dict': {
73                 'id': 'FictionJunction-Parallel_Hearts',
74                 'ext': 'flac',
75                 'title': 'FictionJunction-Parallel_Hearts',
76                 'upload_date': '20140522',
77             },
78             'expected_warnings': [
79                 'URL could be a direct video link, returning it as such.'
80             ]
81         },
82         # Direct download with broken HEAD
83         {
84             'url': 'http://ai-radio.org:8000/radio.opus',
85             'info_dict': {
86                 'id': 'radio',
87                 'ext': 'opus',
88                 'title': 'radio',
89             },
90             'params': {
91                 'skip_download': True,  # infinite live stream
92             },
93             'expected_warnings': [
94                 r'501.*Not Implemented'
95             ],
96         },
97         # Direct link with incorrect MIME type
98         {
99             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
100             'md5': '4ccbebe5f36706d85221f204d7eb5913',
101             'info_dict': {
102                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
103                 'id': '5_Lennart_Poettering_-_Systemd',
104                 'ext': 'webm',
105                 'title': '5_Lennart_Poettering_-_Systemd',
106                 'upload_date': '20141120',
107             },
108             'expected_warnings': [
109                 'URL could be a direct video link, returning it as such.'
110             ]
111         },
112         # RSS feed
113         {
114             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
115             'info_dict': {
116                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
117                 'title': 'Zero Punctuation',
118                 'description': 're:.*groundbreaking video review series.*'
119             },
120             'playlist_mincount': 11,
121         },
122         # RSS feed with enclosure
123         {
124             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
125             'info_dict': {
126                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
127                 'ext': 'm4v',
128                 'upload_date': '20150228',
129                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
130             }
131         },
132         # google redirect
133         {
134             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
135             'info_dict': {
136                 'id': 'cmQHVoWB5FY',
137                 'ext': 'mp4',
138                 'upload_date': '20130224',
139                 'uploader_id': 'TheVerge',
140                 'description': 're:^Chris Ziegler takes a look at the\.*',
141                 'uploader': 'The Verge',
142                 'title': 'First Firefox OS phones side-by-side',
143             },
144             'params': {
145                 'skip_download': False,
146             }
147         },
148         {
149             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
150             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
151             'info_dict': {
152                 'id': '13601338388002',
153                 'ext': 'mp4',
154                 'uploader': 'www.hodiho.fr',
155                 'title': 'R\u00e9gis plante sa Jeep',
156             }
157         },
158         # bandcamp page with custom domain
159         {
160             'add_ie': ['Bandcamp'],
161             'url': 'http://bronyrock.com/track/the-pony-mash',
162             'info_dict': {
163                 'id': '3235767654',
164                 'ext': 'mp3',
165                 'title': 'The Pony Mash',
166                 'uploader': 'M_Pallante',
167             },
168             'skip': 'There is a limit of 200 free downloads / month for the test song',
169         },
170         # embedded brightcove video
171         # it also tests brightcove videos that need to set the 'Referer' in the
172         # http requests
173         {
174             'add_ie': ['Brightcove'],
175             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
176             'info_dict': {
177                 'id': '2765128793001',
178                 'ext': 'mp4',
179                 'title': 'Le cours de bourse : l’analyse technique',
180                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
181                 'uploader': 'BFM BUSINESS',
182             },
183             'params': {
184                 'skip_download': True,
185             },
186         },
187         {
188             # https://github.com/rg3/youtube-dl/issues/2253
189             'url': 'http://bcove.me/i6nfkrc3',
190             'md5': '0ba9446db037002366bab3b3eb30c88c',
191             'info_dict': {
192                 'id': '3101154703001',
193                 'ext': 'mp4',
194                 'title': 'Still no power',
195                 'uploader': 'thestar.com',
196                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
197             },
198             'add_ie': ['Brightcove'],
199         },
200         {
201             'url': 'http://www.championat.com/video/football/v/87/87499.html',
202             'md5': 'fb973ecf6e4a78a67453647444222983',
203             'info_dict': {
204                 'id': '3414141473001',
205                 'ext': 'mp4',
206                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
207                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
208                 'uploader': 'Championat',
209             },
210         },
211         {
212             # https://github.com/rg3/youtube-dl/issues/3541
213             'add_ie': ['Brightcove'],
214             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
215             'info_dict': {
216                 'id': '3866516442001',
217                 'ext': 'mp4',
218                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
219                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
220                 'uploader': 'SBS Broadcasting',
221             },
222             'skip': 'Restricted to Netherlands',
223             'params': {
224                 'skip_download': True,  # m3u8 download
225             },
226         },
227         # ooyala video
228         {
229             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
230             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
231             'info_dict': {
232                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
233                 'ext': 'mp4',
234                 'title': '2cc213299525360.mov',  # that's what we get
235             },
236             'add_ie': ['Ooyala'],
237         },
238         # multiple ooyala embeds on SBN network websites
239         {
240             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
241             'info_dict': {
242                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
243                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
244             },
245             'playlist_mincount': 3,
246             'params': {
247                 'skip_download': True,
248             },
249             'add_ie': ['Ooyala'],
250         },
251         # embed.ly video
252         {
253             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
254             'info_dict': {
255                 'id': '9ODmcdjQcHQ',
256                 'ext': 'mp4',
257                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
258                 'upload_date': '20140225',
259                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
260                 'uploader': 'Tested',
261                 'uploader_id': 'testedcom',
262             },
263             # No need to test YoutubeIE here
264             'params': {
265                 'skip_download': True,
266             },
267         },
268         # funnyordie embed
269         {
270             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
271             'info_dict': {
272                 'id': '18e820ec3f',
273                 'ext': 'mp4',
274                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
275                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
276             },
277         },
278         # BBC iPlayer embeds
279         {
280             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
281             'info_dict': {
282                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
283             },
284             'playlist_mincount': 18,
285         },
286         # RUTV embed
287         {
288             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
289             'info_dict': {
290                 'id': '776940',
291                 'ext': 'mp4',
292                 'title': 'Охотское море стало целиком российским',
293                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
294             },
295             'params': {
296                 # m3u8 download
297                 'skip_download': True,
298             },
299         },
300         # TVC embed
301         {
302             'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
303             'info_dict': {
304                 'id': '55304',
305                 'ext': 'mp4',
306                 'title': 'Дошкольное воспитание',
307             },
308         },
309         # SportBox embed
310         {
311             'url': 'http://www.vestifinance.ru/articles/25753',
312             'info_dict': {
313                 'id': '25753',
314                 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
315             },
316             'playlist': [{
317                 'info_dict': {
318                     'id': '370908',
319                     'title': 'Госзаказ. День 3',
320                     'ext': 'mp4',
321                 }
322             }, {
323                 'info_dict': {
324                     'id': '370905',
325                     'title': 'Госзаказ. День 2',
326                     'ext': 'mp4',
327                 }
328             }, {
329                 'info_dict': {
330                     'id': '370902',
331                     'title': 'Госзаказ. День 1',
332                     'ext': 'mp4',
333                 }
334             }],
335             'params': {
336                 # m3u8 download
337                 'skip_download': True,
338             },
339         },
340         # XHamster embed
341         {
342             'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
343             'info_dict': {
344                 'id': 'showthread',
345                 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
346             },
347             'playlist_mincount': 7,
348         },
349         # Embedded TED video
350         {
351             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
352             'md5': '65fdff94098e4a607385a60c5177c638',
353             'info_dict': {
354                 'id': '1969',
355                 'ext': 'mp4',
356                 'title': 'Hidden miracles of the natural world',
357                 'uploader': 'Louie Schwartzberg',
358                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
359             }
360         },
361         # Embedded Ustream video
362         {
363             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
364             'md5': '27b99cdb639c9b12a79bca876a073417',
365             'info_dict': {
366                 'id': '45734260',
367                 'ext': 'flv',
368                 'uploader': 'AU SPA:  The NSA and Privacy',
369                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
370             }
371         },
372         # nowvideo embed hidden behind percent encoding
373         {
374             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
375             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
376             'info_dict': {
377                 'id': '06e53103ca9aa',
378                 'ext': 'flv',
379                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
380                 'description': 'No description',
381             },
382         },
383         # arte embed
384         {
385             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
386             'md5': '7653032cbb25bf6c80d80f217055fa43',
387             'info_dict': {
388                 'id': '048195-004_PLUS7-F',
389                 'ext': 'flv',
390                 'title': 'X:enius',
391                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
392                 'upload_date': '20140320',
393             },
394             'params': {
395                 'skip_download': 'Requires rtmpdump'
396             }
397         },
398         # Condé Nast embed
399         {
400             'url': 'http://www.wired.com/2014/04/honda-asimo/',
401             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
402             'info_dict': {
403                 'id': '53501be369702d3275860000',
404                 'ext': 'mp4',
405                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
406             }
407         },
408         # Dailymotion embed
409         {
410             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
411             'md5': '441aeeb82eb72c422c7f14ec533999cd',
412             'info_dict': {
413                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
414                 'ext': 'mp4',
415                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
416                 'uploader': 'Spi0n',
417             },
418             'add_ie': ['Dailymotion'],
419         },
420         # YouTube embed
421         {
422             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
423             'info_dict': {
424                 'id': 'FXRb4ykk4S0',
425                 'ext': 'mp4',
426                 'title': 'The NBL Auction 2014',
427                 'uploader': 'BADMINTON England',
428                 'uploader_id': 'BADMINTONEvents',
429                 'upload_date': '20140603',
430                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
431             },
432             'add_ie': ['Youtube'],
433             'params': {
434                 'skip_download': True,
435             }
436         },
437         # MTVServices embed
438         {
439             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
440             'md5': '35727f82f58c76d996fc188f9755b0d5',
441             'info_dict': {
442                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
443                 'ext': 'mp4',
444                 'title': 'Review',
445                 'description': 'Mario\'s life in the fast lane has never looked so good.',
446             },
447         },
448         # YouTube embed via <data-embed-url="">
449         {
450             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
451             'info_dict': {
452                 'id': '4vAffPZIT44',
453                 'ext': 'mp4',
454                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
455                 'uploader': 'Gameloft',
456                 'uploader_id': 'gameloft',
457                 'upload_date': '20140828',
458                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
459             },
460             'params': {
461                 'skip_download': True,
462             }
463         },
464         # Camtasia studio
465         {
466             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
467             'playlist': [{
468                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
469                 'info_dict': {
470                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
471                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
472                     'ext': 'flv',
473                     'duration': 2235.90,
474                 }
475             }, {
476                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
477                 'info_dict': {
478                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
479                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
480                     'ext': 'flv',
481                     'duration': 2235.93,
482                 }
483             }],
484             'info_dict': {
485                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
486             }
487         },
488         # Flowplayer
489         {
490             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
491             'md5': '9d65602bf31c6e20014319c7d07fba27',
492             'info_dict': {
493                 'id': '5123ea6d5e5a7',
494                 'ext': 'mp4',
495                 'age_limit': 18,
496                 'uploader': 'www.handjobhub.com',
497                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
498             }
499         },
500         # Multiple brightcove videos
501         # https://github.com/rg3/youtube-dl/issues/2283
502         {
503             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
504             'info_dict': {
505                 'id': 'always-never',
506                 'title': 'Always / Never - The New Yorker',
507             },
508             'playlist_count': 3,
509             'params': {
510                 'extract_flat': False,
511                 'skip_download': True,
512             }
513         },
514         # MLB embed
515         {
516             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
517             'md5': '96f09a37e44da40dd083e12d9a683327',
518             'info_dict': {
519                 'id': '33322633',
520                 'ext': 'mp4',
521                 'title': 'Ump changes call to ball',
522                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
523                 'duration': 48,
524                 'timestamp': 1401537900,
525                 'upload_date': '20140531',
526                 'thumbnail': 're:^https?://.*\.jpg$',
527             },
528         },
529         # Wistia embed
530         {
531             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
532             'md5': '8788b683c777a5cf25621eaf286d0c23',
533             'info_dict': {
534                 'id': '1cfaf6b7ea',
535                 'ext': 'mov',
536                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
537                 'duration': 643.0,
538                 'filesize': 182808282,
539                 'uploader': 'education-portal.com',
540             },
541         },
542         {
543             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
544             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
545             'info_dict': {
546                 'id': 'uxjb0lwrcz',
547                 'ext': 'mp4',
548                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
549                 'duration': 1715.0,
550                 'uploader': 'thoughtworks.wistia.com',
551             },
552         },
553         # Soundcloud embed
554         {
555             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
556             'info_dict': {
557                 'id': '174391317',
558                 'ext': 'mp3',
559                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
560                 'uploader': 'Sophos Security',
561                 'title': 'Chet Chat 171 - Oct 29, 2014',
562                 'upload_date': '20141029',
563             }
564         },
565         # Livestream embed
566         {
567             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
568             'info_dict': {
569                 'id': '67864563',
570                 'ext': 'flv',
571                 'upload_date': '20141112',
572                 'title': 'Rosetta #CometLanding webcast HL 10',
573             }
574         },
575         # LazyYT
576         {
577             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
578             'info_dict': {
579                 'id': '1986',
580                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
581             },
582             'playlist_mincount': 2,
583         },
584         # Cinchcast embed
585         {
586             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
587             'info_dict': {
588                 'id': '7141703',
589                 'ext': 'mp3',
590                 'upload_date': '20141126',
591                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
592             }
593         },
594         # Cinerama player
595         {
596             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
597             'info_dict': {
598                 'id': '730m_DandD_1901_512k',
599                 'ext': 'mp4',
600                 'uploader': 'www.abc.net.au',
601                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
602             }
603         },
604         # embedded viddler video
605         {
606             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
607             'info_dict': {
608                 'id': '4d03aad9',
609                 'ext': 'mp4',
610                 'uploader': 'deadspin',
611                 'title': 'WALL-TO-GORTAT',
612                 'timestamp': 1422285291,
613                 'upload_date': '20150126',
614             },
615             'add_ie': ['Viddler'],
616         },
617         # Libsyn embed
618         {
619             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
620             'info_dict': {
621                 'id': '3377616',
622                 'ext': 'mp3',
623                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
624                 'description': 'md5:601cb790edd05908957dae8aaa866465',
625                 'upload_date': '20150220',
626             },
627         },
628         # jwplayer YouTube
629         {
630             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
631             'info_dict': {
632                 'id': 'Mrj4DVp2zeA',
633                 'ext': 'mp4',
634                 'upload_date': '20150212',
635                 'uploader': 'The National Archives UK',
636                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
637                 'uploader_id': 'NationalArchives08',
638                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
639             },
640         },
641         # rtl.nl embed
642         {
643             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
644             'playlist_mincount': 5,
645             'info_dict': {
646                 'id': 'aanslagen-kopenhagen',
647                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
648             }
649         },
650         # Zapiks embed
651         {
652             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
653             'info_dict': {
654                 'id': '118046',
655                 'ext': 'mp4',
656                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
657             }
658         },
659         # Kaltura embed
660         {
661             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
662             'info_dict': {
663                 'id': '1_eergr3h1',
664                 'ext': 'mp4',
665                 'upload_date': '20150226',
666                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
667                 'timestamp': int,
668                 'title': 'John Carlson Postgame 2/25/15',
669             },
670         },
671         # Eagle.Platform embed (generic URL)
672         {
673             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
674             'info_dict': {
675                 'id': '227304',
676                 'ext': 'mp4',
677                 'title': 'Навальный вышел на свободу',
678                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
679                 'thumbnail': 're:^https?://.*\.jpg$',
680                 'duration': 87,
681                 'view_count': int,
682                 'age_limit': 0,
683             },
684         },
685         # ClipYou (Eagle.Platform) embed (custom URL)
686         {
687             'url': 'http://muz-tv.ru/play/7129/',
688             'info_dict': {
689                 'id': '12820',
690                 'ext': 'mp4',
691                 'title': "'O Sole Mio",
692                 'thumbnail': 're:^https?://.*\.jpg$',
693                 'duration': 216,
694                 'view_count': int,
695             },
696         },
697         # Pladform embed
698         {
699             'url': 'http://muz-tv.ru/kinozal/view/7400/',
700             'info_dict': {
701                 'id': '100183293',
702                 'ext': 'mp4',
703                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
704                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
705                 'thumbnail': 're:^https?://.*\.jpg$',
706                 'duration': 694,
707                 'age_limit': 0,
708             },
709         },
710         # Playwire embed
711         {
712             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
713             'info_dict': {
714                 'id': '3519514',
715                 'ext': 'mp4',
716                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
717                 'thumbnail': 're:^https?://.*\.png$',
718                 'duration': 45.115,
719             },
720         },
721         # 5min embed
722         {
723             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
724             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
725             'info_dict': {
726                 'id': '518726732',
727                 'ext': 'mp4',
728                 'title': 'Facebook Creates "On This Day" | Crunch Report',
729             },
730         },
731         # SVT embed
732         {
733             'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
734             'info_dict': {
735                 'id': '2900353',
736                 'ext': 'flv',
737                 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
738                 'duration': 27,
739                 'age_limit': 0,
740             },
741         },
742         # Crooks and Liars embed
743         {
744             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
745             'info_dict': {
746                 'id': '8RUoRhRi',
747                 'ext': 'mp4',
748                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
749                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
750                 'timestamp': 1428207000,
751                 'upload_date': '20150405',
752                 'uploader': 'Heather',
753             },
754         },
755         # Crooks and Liars external embed
756         {
757             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
758             'info_dict': {
759                 'id': 'MTE3MjUtMzQ2MzA',
760                 'ext': 'mp4',
761                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
762                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
763                 'timestamp': 1265032391,
764                 'upload_date': '20100201',
765                 'uploader': 'Heather',
766             },
767         },
768         # NBC Sports vplayer embed
769         {
770             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
771             'info_dict': {
772                 'id': 'ln7x1qSThw4k',
773                 'ext': 'flv',
774                 'title': "PFT Live: New leader in the 'new-look' defense",
775                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
776             },
777         },
778         # UDN embed
779         {
780             'url': 'http://www.udn.com/news/story/7314/822787',
781             'md5': 'fd2060e988c326991037b9aff9df21a6',
782             'info_dict': {
783                 'id': '300346',
784                 'ext': 'mp4',
785                 'title': '中一中男師變性 全校師生力挺',
786                 'thumbnail': 're:^https?://.*\.jpg$',
787             }
788         },
789         # Ooyala embed
790         {
791             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
792             'info_dict': {
793                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
794                 'ext': 'mp4',
795                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
796                 'title': 'This is what separates the Excel masters from the wannabes',
797             },
798             'params': {
799                 # m3u8 downloads
800                 'skip_download': True,
801             }
802         },
803         # Contains a SMIL manifest
804         {
805             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
806             'info_dict': {
807                 'id': 'file',
808                 'ext': 'flv',
809                 'title': '+ Football: Lottery Champions League Europe',
810                 'uploader': 'www.telewebion.com',
811             },
812             'params': {
813                 # rtmpe downloads
814                 'skip_download': True,
815             }
816         },
817         # Brightcove URL in single quotes
818         {
819             'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
820             'md5': '4ae374f1f8b91c889c4b9203c8c752af',
821             'info_dict': {
822                 'id': '4255764656001',
823                 'ext': 'mp4',
824                 'title': 'SN Presents: Russell Martin, World Citizen',
825                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
826                 'uploader': 'Rogers Sportsnet',
827             },
828         },
829         # Dailymotion Cloud video
830         {
831             'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
832             'md5': '49444254273501a64675a7e68c502681',
833             'info_dict': {
834                 'id': '5585de919473990de4bee11b',
835                 'ext': 'mp4',
836                 'title': 'Le débat',
837                 'thumbnail': 're:^https?://.*\.jpe?g$',
838             }
839         },
840         # OnionStudios embed
841         {
842             'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
843             'info_dict': {
844                 'id': '2855',
845                 'ext': 'mp4',
846                 'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
847                 'thumbnail': 're:^https?://.*\.jpe?g$',
848                 'uploader': 'ClickHole',
849                 'uploader_id': 'clickhole',
850             }
851         },
852         # AdobeTVVideo embed
853         {
854             'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners',
855             'md5': '43662b577c018ad707a63766462b1e87',
856             'info_dict': {
857                 'id': '2456',
858                 'ext': 'mp4',
859                 'title': 'New experience with Acrobat DC',
860                 'description': 'New experience with Acrobat DC',
861                 'duration': 248.667,
862             },
863         }
864     ]
865
866     def report_following_redirect(self, new_url):
867         """Report information extraction."""
868         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
869
870     def _extract_rss(self, url, video_id, doc):
871         playlist_title = doc.find('./channel/title').text
872         playlist_desc_el = doc.find('./channel/description')
873         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
874
875         entries = []
876         for it in doc.findall('./channel/item'):
877             next_url = xpath_text(it, 'link', fatal=False)
878             if not next_url:
879                 enclosure_nodes = it.findall('./enclosure')
880                 for e in enclosure_nodes:
881                     next_url = e.attrib.get('url')
882                     if next_url:
883                         break
884
885             if not next_url:
886                 continue
887
888             entries.append({
889                 '_type': 'url',
890                 'url': next_url,
891                 'title': it.find('title').text,
892             })
893
894         return {
895             '_type': 'playlist',
896             'id': url,
897             'title': playlist_title,
898             'description': playlist_desc,
899             'entries': entries,
900         }
901
902     def _extract_camtasia(self, url, video_id, webpage):
903         """ Returns None if no camtasia video can be found. """
904
905         camtasia_cfg = self._search_regex(
906             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
907             webpage, 'camtasia configuration file', default=None)
908         if camtasia_cfg is None:
909             return None
910
911         title = self._html_search_meta('DC.title', webpage, fatal=True)
912
913         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
914         camtasia_cfg = self._download_xml(
915             camtasia_url, video_id,
916             note='Downloading camtasia configuration',
917             errnote='Failed to download camtasia configuration')
918         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
919
920         entries = []
921         for n in fileset_node.getchildren():
922             url_n = n.find('./uri')
923             if url_n is None:
924                 continue
925
926             entries.append({
927                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
928                 'title': '%s - %s' % (title, n.tag),
929                 'url': compat_urlparse.urljoin(url, url_n.text),
930                 'duration': float_or_none(n.find('./duration').text),
931             })
932
933         return {
934             '_type': 'playlist',
935             'entries': entries,
936             'title': title,
937         }
938
939     def _real_extract(self, url):
940         if url.startswith('//'):
941             return {
942                 '_type': 'url',
943                 'url': self.http_scheme() + url,
944             }
945
946         parsed_url = compat_urlparse.urlparse(url)
947         if not parsed_url.scheme:
948             default_search = self._downloader.params.get('default_search')
949             if default_search is None:
950                 default_search = 'fixup_error'
951
952             if default_search in ('auto', 'auto_warning', 'fixup_error'):
953                 if '/' in url:
954                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
955                     return self.url_result('http://' + url)
956                 elif default_search != 'fixup_error':
957                     if default_search == 'auto_warning':
958                         if re.match(r'^(?:url|URL)$', url):
959                             raise ExtractorError(
960                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
961                                 expected=True)
962                         else:
963                             self._downloader.report_warning(
964                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
965                     return self.url_result('ytsearch:' + url)
966
967             if default_search in ('error', 'fixup_error'):
968                 raise ExtractorError(
969                     '%r is not a valid URL. '
970                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
971                     % (url, url), expected=True)
972             else:
973                 if ':' not in default_search:
974                     default_search += ':'
975                 return self.url_result(default_search + url)
976
977         url, smuggled_data = unsmuggle_url(url)
978         force_videoid = None
979         is_intentional = smuggled_data and smuggled_data.get('to_generic')
980         if smuggled_data and 'force_videoid' in smuggled_data:
981             force_videoid = smuggled_data['force_videoid']
982             video_id = force_videoid
983         else:
984             video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
985
986         self.to_screen('%s: Requesting header' % video_id)
987
988         head_req = HEADRequest(url)
989         head_response = self._request_webpage(
990             head_req, video_id,
991             note=False, errnote='Could not send HEAD request to %s' % url,
992             fatal=False)
993
994         if head_response is not False:
995             # Check for redirect
996             new_url = head_response.geturl()
997             if url != new_url:
998                 self.report_following_redirect(new_url)
999                 if force_videoid:
1000                     new_url = smuggle_url(
1001                         new_url, {'force_videoid': force_videoid})
1002                 return self.url_result(new_url)
1003
1004         full_response = None
1005         if head_response is False:
1006             request = compat_urllib_request.Request(url)
1007             request.add_header('Accept-Encoding', '*')
1008             full_response = self._request_webpage(request, video_id)
1009             head_response = full_response
1010
1011         # Check for direct link to a video
1012         content_type = head_response.headers.get('Content-Type', '')
1013         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
1014         if m:
1015             upload_date = unified_strdate(
1016                 head_response.headers.get('Last-Modified'))
1017             return {
1018                 'id': video_id,
1019                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1020                 'direct': True,
1021                 'formats': [{
1022                     'format_id': m.group('format_id'),
1023                     'url': url,
1024                     'vcodec': 'none' if m.group('type') == 'audio' else None
1025                 }],
1026                 'upload_date': upload_date,
1027             }
1028
1029         if not self._downloader.params.get('test', False) and not is_intentional:
1030             force = self._downloader.params.get('force_generic_extractor', False)
1031             self._downloader.report_warning(
1032                 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
1033
1034         if not full_response:
1035             request = compat_urllib_request.Request(url)
1036             # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
1037             # making it impossible to download only chunk of the file (yet we need only 512kB to
1038             # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
1039             # that will always result in downloading the whole file that is not desirable.
1040             # Therefore for extraction pass we have to override Accept-Encoding to any in order
1041             # to accept raw bytes and being able to download only a chunk.
1042             # It would probably be better to solve this by checking Content-Type for application/octet-stream
1043             # after HEAD request finishes, but not sure if we can rely on this.
1044             request.add_header('Accept-Encoding', '*')
1045             full_response = self._request_webpage(request, video_id)
1046
1047         # Maybe it's a direct link to a video?
1048         # Be careful not to download the whole thing!
1049         first_bytes = full_response.read(512)
1050         if not is_html(first_bytes):
1051             self._downloader.report_warning(
1052                 'URL could be a direct video link, returning it as such.')
1053             upload_date = unified_strdate(
1054                 head_response.headers.get('Last-Modified'))
1055             return {
1056                 'id': video_id,
1057                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1058                 'direct': True,
1059                 'url': url,
1060                 'upload_date': upload_date,
1061             }
1062
1063         webpage = self._webpage_read_content(
1064             full_response, url, video_id, prefix=first_bytes)
1065
1066         self.report_extraction(video_id)
1067
1068         # Is it an RSS feed?
1069         try:
1070             doc = parse_xml(webpage)
1071             if doc.tag == 'rss':
1072                 return self._extract_rss(url, video_id, doc)
1073         except compat_xml_parse_error:
1074             pass
1075
1076         # Is it a Camtasia project?
1077         camtasia_res = self._extract_camtasia(url, video_id, webpage)
1078         if camtasia_res is not None:
1079             return camtasia_res
1080
1081         # Sometimes embedded video player is hidden behind percent encoding
1082         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1083         # Unescaping the whole page allows to handle those cases in a generic way
1084         webpage = compat_urllib_parse.unquote(webpage)
1085
1086         # it's tempting to parse this further, but you would
1087         # have to take into account all the variations like
1088         #   Video Title - Site Name
1089         #   Site Name | Video Title
1090         #   Video Title - Tagline | Site Name
1091         # and so on and so forth; it's just not practical
1092         video_title = self._html_search_regex(
1093             r'(?s)<title>(.*?)</title>', webpage, 'video title',
1094             default='video')
1095
1096         # Try to detect age limit automatically
1097         age_limit = self._rta_search(webpage)
1098         # And then there are the jokers who advertise that they use RTA,
1099         # but actually don't.
1100         AGE_LIMIT_MARKERS = [
1101             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1102         ]
1103         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1104             age_limit = 18
1105
1106         # video uploader is domain name
1107         video_uploader = self._search_regex(
1108             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1109
1110         # Helper method
1111         def _playlist_from_matches(matches, getter=None, ie=None):
1112             urlrs = orderedSet(
1113                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1114                 for m in matches)
1115             return self.playlist_result(
1116                 urlrs, playlist_id=video_id, playlist_title=video_title)
1117
1118         # Look for BrightCove:
1119         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1120         if bc_urls:
1121             self.to_screen('Brightcove video detected.')
1122             entries = [{
1123                 '_type': 'url',
1124                 'url': smuggle_url(bc_url, {'Referer': url}),
1125                 'ie_key': 'Brightcove'
1126             } for bc_url in bc_urls]
1127
1128             return {
1129                 '_type': 'playlist',
1130                 'title': video_title,
1131                 'id': video_id,
1132                 'entries': entries,
1133             }
1134
1135         # Look for embedded rtl.nl player
1136         matches = re.findall(
1137             r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1138             webpage)
1139         if matches:
1140             return _playlist_from_matches(matches, ie='RtlNl')
1141
1142         vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
1143         if vimeo_url is not None:
1144             return self.url_result(vimeo_url)
1145
1146         # Look for embedded YouTube player
1147         matches = re.findall(r'''(?x)
1148             (?:
1149                 <iframe[^>]+?src=|
1150                 data-video-url=|
1151                 <embed[^>]+?src=|
1152                 embedSWF\(?:\s*|
1153                 new\s+SWFObject\(
1154             )
1155             (["\'])
1156                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1157                 (?:embed|v|p)/.+?)
1158             \1''', webpage)
1159         if matches:
1160             return _playlist_from_matches(
1161                 matches, lambda m: unescapeHTML(m[1]))
1162
1163         # Look for lazyYT YouTube embed
1164         matches = re.findall(
1165             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1166         if matches:
1167             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1168
1169         # Look for embedded Dailymotion player
1170         matches = re.findall(
1171             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1172         if matches:
1173             return _playlist_from_matches(
1174                 matches, lambda m: unescapeHTML(m[1]))
1175
1176         # Look for embedded Dailymotion playlist player (#3822)
1177         m = re.search(
1178             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1179         if m:
1180             playlists = re.findall(
1181                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1182             if playlists:
1183                 return _playlist_from_matches(
1184                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1185
1186         # Look for embedded Wistia player
1187         match = re.search(
1188             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1189         if match:
1190             embed_url = self._proto_relative_url(
1191                 unescapeHTML(match.group('url')))
1192             return {
1193                 '_type': 'url_transparent',
1194                 'url': embed_url,
1195                 'ie_key': 'Wistia',
1196                 'uploader': video_uploader,
1197                 'title': video_title,
1198                 'id': video_id,
1199             }
1200
1201         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1202         if match:
1203             return {
1204                 '_type': 'url_transparent',
1205                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1206                 'ie_key': 'Wistia',
1207                 'uploader': video_uploader,
1208                 'title': video_title,
1209                 'id': match.group('id')
1210             }
1211
1212         # Look for embedded blip.tv player
1213         bliptv_url = BlipTVIE._extract_url(webpage)
1214         if bliptv_url:
1215             return self.url_result(bliptv_url, 'BlipTV')
1216
1217         # Look for SVT player
1218         svt_url = SVTIE._extract_url(webpage)
1219         if svt_url:
1220             return self.url_result(svt_url, 'SVT')
1221
1222         # Look for embedded condenast player
1223         matches = re.findall(
1224             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1225             webpage)
1226         if matches:
1227             return {
1228                 '_type': 'playlist',
1229                 'entries': [{
1230                     '_type': 'url',
1231                     'ie_key': 'CondeNast',
1232                     'url': ma,
1233                 } for ma in matches],
1234                 'title': video_title,
1235                 'id': video_id,
1236             }
1237
1238         # Look for Bandcamp pages with custom domain
1239         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1240         if mobj is not None:
1241             burl = unescapeHTML(mobj.group(1))
1242             # Don't set the extractor because it can be a track url or an album
1243             return self.url_result(burl)
1244
1245         # Look for embedded Vevo player
1246         mobj = re.search(
1247             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1248         if mobj is not None:
1249             return self.url_result(mobj.group('url'))
1250
1251         # Look for embedded Viddler player
1252         mobj = re.search(
1253             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1254             webpage)
1255         if mobj is not None:
1256             return self.url_result(mobj.group('url'))
1257
1258         # Look for NYTimes player
1259         mobj = re.search(
1260             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1261             webpage)
1262         if mobj is not None:
1263             return self.url_result(mobj.group('url'))
1264
1265         # Look for Libsyn player
1266         mobj = re.search(
1267             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1268         if mobj is not None:
1269             return self.url_result(mobj.group('url'))
1270
1271         # Look for Ooyala videos
1272         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1273                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1274                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1275                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1276         if mobj is not None:
1277             return OoyalaIE._build_url_result(mobj.group('ec'))
1278
1279         # Look for multiple Ooyala embeds on SBN network websites
1280         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1281         if mobj is not None:
1282             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1283             if embeds:
1284                 return _playlist_from_matches(
1285                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1286
1287         # Look for Aparat videos
1288         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1289         if mobj is not None:
1290             return self.url_result(mobj.group(1), 'Aparat')
1291
1292         # Look for MPORA videos
1293         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1294         if mobj is not None:
1295             return self.url_result(mobj.group(1), 'Mpora')
1296
1297         # Look for embedded NovaMov-based player
1298         mobj = re.search(
1299             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1300                     (?P<url>http://(?:(?:embed|www)\.)?
1301                         (?:novamov\.com|
1302                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1303                            videoweed\.(?:es|com)|
1304                            movshare\.(?:net|sx|ag)|
1305                            divxstage\.(?:eu|net|ch|co|at|ag))
1306                         /embed\.php.+?)\1''', webpage)
1307         if mobj is not None:
1308             return self.url_result(mobj.group('url'))
1309
1310         # Look for embedded Facebook player
1311         mobj = re.search(
1312             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1313         if mobj is not None:
1314             return self.url_result(mobj.group('url'), 'Facebook')
1315
1316         # Look for embedded VK player
1317         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1318         if mobj is not None:
1319             return self.url_result(mobj.group('url'), 'VK')
1320
1321         # Look for embedded ivi player
1322         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1323         if mobj is not None:
1324             return self.url_result(mobj.group('url'), 'Ivi')
1325
1326         # Look for embedded Huffington Post player
1327         mobj = re.search(
1328             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1329         if mobj is not None:
1330             return self.url_result(mobj.group('url'), 'HuffPost')
1331
1332         # Look for embed.ly
1333         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1334         if mobj is not None:
1335             return self.url_result(mobj.group('url'))
1336         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1337         if mobj is not None:
1338             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1339
1340         # Look for funnyordie embed
1341         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1342         if matches:
1343             return _playlist_from_matches(
1344                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1345
1346         # Look for BBC iPlayer embed
1347         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1348         if matches:
1349             return _playlist_from_matches(matches, ie='BBCCoUk')
1350
1351         # Look for embedded RUTV player
1352         rutv_url = RUTVIE._extract_url(webpage)
1353         if rutv_url:
1354             return self.url_result(rutv_url, 'RUTV')
1355
1356         # Look for embedded TVC player
1357         tvc_url = TVCIE._extract_url(webpage)
1358         if tvc_url:
1359             return self.url_result(tvc_url, 'TVC')
1360
1361         # Look for embedded SportBox player
1362         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1363         if sportbox_urls:
1364             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1365
1366         # Look for embedded PornHub player
1367         pornhub_url = PornHubIE._extract_url(webpage)
1368         if pornhub_url:
1369             return self.url_result(pornhub_url, 'PornHub')
1370
1371         # Look for embedded XHamster player
1372         xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
1373         if xhamster_urls:
1374             return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
1375
1376         # Look for embedded Tvigle player
1377         mobj = re.search(
1378             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
1379         if mobj is not None:
1380             return self.url_result(mobj.group('url'), 'Tvigle')
1381
1382         # Look for embedded TED player
1383         mobj = re.search(
1384             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1385         if mobj is not None:
1386             return self.url_result(mobj.group('url'), 'TED')
1387
1388         # Look for embedded Ustream videos
1389         mobj = re.search(
1390             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1391         if mobj is not None:
1392             return self.url_result(mobj.group('url'), 'Ustream')
1393
1394         # Look for embedded arte.tv player
1395         mobj = re.search(
1396             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1397             webpage)
1398         if mobj is not None:
1399             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1400
1401         # Look for embedded smotri.com player
1402         smotri_url = SmotriIE._extract_url(webpage)
1403         if smotri_url:
1404             return self.url_result(smotri_url, 'Smotri')
1405
1406         # Look for embedded soundcloud player
1407         mobj = re.search(
1408             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1409             webpage)
1410         if mobj is not None:
1411             url = unescapeHTML(mobj.group('url'))
1412             return self.url_result(url)
1413
1414         # Look for embedded vulture.com player
1415         mobj = re.search(
1416             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1417             webpage)
1418         if mobj is not None:
1419             url = unescapeHTML(mobj.group('url'))
1420             return self.url_result(url, ie='Vulture')
1421
1422         # Look for embedded mtvservices player
1423         mobj = re.search(
1424             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1425             webpage)
1426         if mobj is not None:
1427             url = unescapeHTML(mobj.group('url'))
1428             return self.url_result(url, ie='MTVServicesEmbedded')
1429
1430         # Look for embedded yahoo player
1431         mobj = re.search(
1432             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1433             webpage)
1434         if mobj is not None:
1435             return self.url_result(mobj.group('url'), 'Yahoo')
1436
1437         # Look for embedded sbs.com.au player
1438         mobj = re.search(
1439             r'''(?x)
1440             (?:
1441                 <meta\s+property="og:video"\s+content=|
1442                 <iframe[^>]+?src=
1443             )
1444             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1445             webpage)
1446         if mobj is not None:
1447             return self.url_result(mobj.group('url'), 'SBS')
1448
1449         # Look for embedded Cinchcast player
1450         mobj = re.search(
1451             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1452             webpage)
1453         if mobj is not None:
1454             return self.url_result(mobj.group('url'), 'Cinchcast')
1455
1456         mobj = re.search(
1457             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1458             webpage)
1459         if not mobj:
1460             mobj = re.search(
1461                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1462                 webpage)
1463         if mobj is not None:
1464             return self.url_result(mobj.group('url'), 'MLB')
1465
1466         mobj = re.search(
1467             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1468             webpage)
1469         if mobj is not None:
1470             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1471
1472         mobj = re.search(
1473             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1474             webpage)
1475         if mobj is not None:
1476             return self.url_result(mobj.group('url'), 'Livestream')
1477
1478         # Look for Zapiks embed
1479         mobj = re.search(
1480             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1481         if mobj is not None:
1482             return self.url_result(mobj.group('url'), 'Zapiks')
1483
1484         # Look for Kaltura embeds
1485         mobj = re.search(
1486             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1487         if mobj is not None:
1488             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1489
1490         # Look for Eagle.Platform embeds
1491         mobj = re.search(
1492             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1493         if mobj is not None:
1494             return self.url_result(mobj.group('url'), 'EaglePlatform')
1495
1496         # Look for ClipYou (uses Eagle.Platform) embeds
1497         mobj = re.search(
1498             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1499         if mobj is not None:
1500             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1501
1502         # Look for Pladform embeds
1503         mobj = re.search(
1504             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1505         if mobj is not None:
1506             return self.url_result(mobj.group('url'), 'Pladform')
1507
1508         # Look for Playwire embeds
1509         mobj = re.search(
1510             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1511         if mobj is not None:
1512             return self.url_result(mobj.group('url'))
1513
1514         # Look for 5min embeds
1515         mobj = re.search(
1516             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1517         if mobj is not None:
1518             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1519
1520         # Look for Crooks and Liars embeds
1521         mobj = re.search(
1522             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1523         if mobj is not None:
1524             return self.url_result(mobj.group('url'))
1525
1526         # Look for NBC Sports VPlayer embeds
1527         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1528         if nbc_sports_url:
1529             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1530
1531         # Look for UDN embeds
1532         mobj = re.search(
1533             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1534         if mobj is not None:
1535             return self.url_result(
1536                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1537
1538         # Look for Senate ISVP iframe
1539         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1540         if senate_isvp_url:
1541             return self.url_result(senate_isvp_url, 'SenateISVP')
1542
1543         # Look for Dailymotion Cloud videos
1544         dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
1545         if dmcloud_url:
1546             return self.url_result(dmcloud_url, 'DailymotionCloud')
1547
1548         # Look for OnionStudios embeds
1549         onionstudios_url = OnionStudiosIE._extract_url(webpage)
1550         if onionstudios_url:
1551             return self.url_result(onionstudios_url)
1552
1553         # Look for AdobeTVVideo embeds
1554         mobj = re.search(
1555             r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',
1556             webpage)
1557         if mobj is not None:
1558             return self.url_result(
1559                 self._proto_relative_url(unescapeHTML(mobj.group(1))),
1560                 'AdobeTVVideo')
1561
1562         def check_video(vurl):
1563             if YoutubeIE.suitable(vurl):
1564                 return True
1565             vpath = compat_urlparse.urlparse(vurl).path
1566             vext = determine_ext(vpath)
1567             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1568
1569         def filter_video(urls):
1570             return list(filter(check_video, urls))
1571
1572         # Start with something easy: JW Player in SWFObject
1573         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1574         if not found:
1575             # Look for gorilla-vid style embedding
1576             found = filter_video(re.findall(r'''(?sx)
1577                 (?:
1578                     jw_plugins|
1579                     JWPlayerOptions|
1580                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1581                 )
1582                 .*?
1583                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1584         if not found:
1585             # Broaden the search a little bit
1586             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1587         if not found:
1588             # Broaden the findall a little bit: JWPlayer JS loader
1589             found = filter_video(re.findall(
1590                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1591         if not found:
1592             # Flow player
1593             found = filter_video(re.findall(r'''(?xs)
1594                 flowplayer\("[^"]+",\s*
1595                     \{[^}]+?\}\s*,
1596                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1597                         ["']?url["']?\s*:\s*["']([^"']+)["']
1598             ''', webpage))
1599         if not found:
1600             # Cinerama player
1601             found = re.findall(
1602                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1603         if not found:
1604             # Try to find twitter cards info
1605             found = filter_video(re.findall(
1606                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1607         if not found:
1608             # We look for Open Graph info:
1609             # We have to match any number of spaces between elements; some sites try to align them (e.g. statigr.am)
1610             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1611             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1612             if m_video_type is not None:
1613                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1614         if not found:
1615             # HTML5 video
1616             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1617         if not found:
1618             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1619             found = re.search(
1620                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1621                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1622                 webpage)
1623             if not found:
1624                 # Look also in Refresh HTTP header
1625                 refresh_header = head_response.headers.get('Refresh')
1626                 if refresh_header:
1627                     found = re.search(REDIRECT_REGEX, refresh_header)
1628             if found:
1629                 new_url = compat_urlparse.urljoin(url, found.group(1))
1630                 self.report_following_redirect(new_url)
1631                 return {
1632                     '_type': 'url',
1633                     'url': new_url,
1634                 }
1635         if not found:
1636             raise UnsupportedError(url)
1637
1638         entries = []
1639         for video_url in found:
1640             video_url = compat_urlparse.urljoin(url, video_url)
1641             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1642
1643             # Sometimes, jwplayer extraction will result in a YouTube URL
1644             if YoutubeIE.suitable(video_url):
1645                 entries.append(self.url_result(video_url, 'Youtube'))
1646                 continue
1647
1648             # here's a fun little line of code for you:
1649             video_id = os.path.splitext(video_id)[0]
1650
1651             if determine_ext(video_url) == 'smil':
1652                 entries.append({
1653                     'id': video_id,
1654                     'formats': self._extract_smil_formats(video_url, video_id),
1655                     'uploader': video_uploader,
1656                     'title': video_title,
1657                     'age_limit': age_limit,
1658                 })
1659             else:
1660                 entries.append({
1661                     'id': video_id,
1662                     'url': video_url,
1663                     'uploader': video_uploader,
1664                     'title': video_title,
1665                     'age_limit': age_limit,
1666                 })
1667
1668         if len(entries) == 1:
1669             return entries[0]
1670         else:
1671             for num, e in enumerate(entries, start=1):
1672                 # 'url' results don't have a title
1673                 if e.get('title') is not None:
1674                     e['title'] = '%s (%d)' % (e['title'], num)
1675             return {
1676                 '_type': 'playlist',
1677                 'entries': entries,
1678             }