[extractor/generic] Clarify generic extraction warning
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urllib_parse_unquote,
13     compat_urllib_request,
14     compat_urlparse,
15     compat_xml_parse_error,
16 )
17 from ..utils import (
18     determine_ext,
19     ExtractorError,
20     float_or_none,
21     HEADRequest,
22     is_html,
23     orderedSet,
24     parse_xml,
25     smuggle_url,
26     unescapeHTML,
27     unified_strdate,
28     unsmuggle_url,
29     UnsupportedError,
30     url_basename,
31     xpath_text,
32 )
33 from .brightcove import BrightcoveIE
34 from .nbc import NBCSportsVPlayerIE
35 from .ooyala import OoyalaIE
36 from .rutv import RUTVIE
37 from .tvc import TVCIE
38 from .sportbox import SportBoxEmbedIE
39 from .smotri import SmotriIE
40 from .condenast import CondeNastIE
41 from .udn import UDNEmbedIE
42 from .senateisvp import SenateISVPIE
43 from .bliptv import BlipTVIE
44 from .svt import SVTIE
45
46
47 class GenericIE(InfoExtractor):
48     IE_DESC = 'Generic downloader that works on some sites'
49     _VALID_URL = r'.*'
50     IE_NAME = 'generic'
51     _TESTS = [
52         # Direct link to a video
53         {
54             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
55             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
56             'info_dict': {
57                 'id': 'trailer',
58                 'ext': 'mp4',
59                 'title': 'trailer',
60                 'upload_date': '20100513',
61             }
62         },
63         # Direct link to media delivered compressed (until Accept-Encoding is *)
64         {
65             'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
66             'md5': '128c42e68b13950268b648275386fc74',
67             'info_dict': {
68                 'id': 'FictionJunction-Parallel_Hearts',
69                 'ext': 'flac',
70                 'title': 'FictionJunction-Parallel_Hearts',
71                 'upload_date': '20140522',
72             },
73             'expected_warnings': [
74                 'URL could be a direct video link, returning it as such.'
75             ]
76         },
77         # Direct download with broken HEAD
78         {
79             'url': 'http://ai-radio.org:8000/radio.opus',
80             'info_dict': {
81                 'id': 'radio',
82                 'ext': 'opus',
83                 'title': 'radio',
84             },
85             'params': {
86                 'skip_download': True,  # infinite live stream
87             },
88             'expected_warnings': [
89                 r'501.*Not Implemented'
90             ],
91         },
92         # Direct link with incorrect MIME type
93         {
94             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
95             'md5': '4ccbebe5f36706d85221f204d7eb5913',
96             'info_dict': {
97                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
98                 'id': '5_Lennart_Poettering_-_Systemd',
99                 'ext': 'webm',
100                 'title': '5_Lennart_Poettering_-_Systemd',
101                 'upload_date': '20141120',
102             },
103             'expected_warnings': [
104                 'URL could be a direct video link, returning it as such.'
105             ]
106         },
107         # RSS feed
108         {
109             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
110             'info_dict': {
111                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
112                 'title': 'Zero Punctuation',
113                 'description': 're:.*groundbreaking video review series.*'
114             },
115             'playlist_mincount': 11,
116         },
117         # RSS feed with enclosure
118         {
119             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
120             'info_dict': {
121                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
122                 'ext': 'm4v',
123                 'upload_date': '20150228',
124                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
125             }
126         },
127         # google redirect
128         {
129             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
130             'info_dict': {
131                 'id': 'cmQHVoWB5FY',
132                 'ext': 'mp4',
133                 'upload_date': '20130224',
134                 'uploader_id': 'TheVerge',
135                 'description': 're:^Chris Ziegler takes a look at the\.*',
136                 'uploader': 'The Verge',
137                 'title': 'First Firefox OS phones side-by-side',
138             },
139             'params': {
140                 'skip_download': False,
141             }
142         },
143         {
144             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
145             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
146             'info_dict': {
147                 'id': '13601338388002',
148                 'ext': 'mp4',
149                 'uploader': 'www.hodiho.fr',
150                 'title': 'R\u00e9gis plante sa Jeep',
151             }
152         },
153         # bandcamp page with custom domain
154         {
155             'add_ie': ['Bandcamp'],
156             'url': 'http://bronyrock.com/track/the-pony-mash',
157             'info_dict': {
158                 'id': '3235767654',
159                 'ext': 'mp3',
160                 'title': 'The Pony Mash',
161                 'uploader': 'M_Pallante',
162             },
163             'skip': 'There is a limit of 200 free downloads / month for the test song',
164         },
165         # embedded brightcove video
166         # it also tests brightcove videos that need to set the 'Referer' in the
167         # http requests
168         {
169             'add_ie': ['Brightcove'],
170             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
171             'info_dict': {
172                 'id': '2765128793001',
173                 'ext': 'mp4',
174                 'title': 'Le cours de bourse : l’analyse technique',
175                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
176                 'uploader': 'BFM BUSINESS',
177             },
178             'params': {
179                 'skip_download': True,
180             },
181         },
182         {
183             # https://github.com/rg3/youtube-dl/issues/2253
184             'url': 'http://bcove.me/i6nfkrc3',
185             'md5': '0ba9446db037002366bab3b3eb30c88c',
186             'info_dict': {
187                 'id': '3101154703001',
188                 'ext': 'mp4',
189                 'title': 'Still no power',
190                 'uploader': 'thestar.com',
191                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
192             },
193             'add_ie': ['Brightcove'],
194         },
195         {
196             'url': 'http://www.championat.com/video/football/v/87/87499.html',
197             'md5': 'fb973ecf6e4a78a67453647444222983',
198             'info_dict': {
199                 'id': '3414141473001',
200                 'ext': 'mp4',
201                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
202                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
203                 'uploader': 'Championat',
204             },
205         },
206         {
207             # https://github.com/rg3/youtube-dl/issues/3541
208             'add_ie': ['Brightcove'],
209             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
210             'info_dict': {
211                 'id': '3866516442001',
212                 'ext': 'mp4',
213                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
214                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
215                 'uploader': 'SBS Broadcasting',
216             },
217             'skip': 'Restricted to Netherlands',
218             'params': {
219                 'skip_download': True,  # m3u8 download
220             },
221         },
222         # ooyala video
223         {
224             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
225             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
226             'info_dict': {
227                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
228                 'ext': 'mp4',
229                 'title': '2cc213299525360.mov',  # that's what we get
230             },
231             'add_ie': ['Ooyala'],
232         },
233         # multiple ooyala embeds on SBN network websites
234         {
235             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
236             'info_dict': {
237                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
238                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
239             },
240             'playlist_mincount': 3,
241             'params': {
242                 'skip_download': True,
243             },
244             'add_ie': ['Ooyala'],
245         },
246         # embed.ly video
247         {
248             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
249             'info_dict': {
250                 'id': '9ODmcdjQcHQ',
251                 'ext': 'mp4',
252                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
253                 'upload_date': '20140225',
254                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
255                 'uploader': 'Tested',
256                 'uploader_id': 'testedcom',
257             },
258             # No need to test YoutubeIE here
259             'params': {
260                 'skip_download': True,
261             },
262         },
263         # funnyordie embed
264         {
265             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
266             'info_dict': {
267                 'id': '18e820ec3f',
268                 'ext': 'mp4',
269                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
270                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
271             },
272         },
273         # BBC iPlayer embeds
274         {
275             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
276             'info_dict': {
277                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
278             },
279             'playlist_mincount': 18,
280         },
281         # RUTV embed
282         {
283             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
284             'info_dict': {
285                 'id': '776940',
286                 'ext': 'mp4',
287                 'title': 'Охотское море стало целиком российским',
288                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
289             },
290             'params': {
291                 # m3u8 download
292                 'skip_download': True,
293             },
294         },
295         # TVC embed
296         {
297             'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
298             'info_dict': {
299                 'id': '55304',
300                 'ext': 'mp4',
301                 'title': 'Дошкольное воспитание',
302             },
303         },
304         # SportBox embed
305         {
306             'url': 'http://www.vestifinance.ru/articles/25753',
307             'info_dict': {
308                 'id': '25753',
309                 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
310             },
311             'playlist': [{
312                 'info_dict': {
313                     'id': '370908',
314                     'title': 'Госзаказ. День 3',
315                     'ext': 'mp4',
316                 }
317             }, {
318                 'info_dict': {
319                     'id': '370905',
320                     'title': 'Госзаказ. День 2',
321                     'ext': 'mp4',
322                 }
323             }, {
324                 'info_dict': {
325                     'id': '370902',
326                     'title': 'Госзаказ. День 1',
327                     'ext': 'mp4',
328                 }
329             }],
330             'params': {
331                 # m3u8 download
332                 'skip_download': True,
333             },
334         },
335         # Embedded TED video
336         {
337             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
338             'md5': '65fdff94098e4a607385a60c5177c638',
339             'info_dict': {
340                 'id': '1969',
341                 'ext': 'mp4',
342                 'title': 'Hidden miracles of the natural world',
343                 'uploader': 'Louie Schwartzberg',
344                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
345             }
346         },
347         # Embedded Ustream video
348         {
349             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
350             'md5': '27b99cdb639c9b12a79bca876a073417',
351             'info_dict': {
352                 'id': '45734260',
353                 'ext': 'flv',
354                 'uploader': 'AU SPA:  The NSA and Privacy',
355                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
356             }
357         },
358         # nowvideo embed hidden behind percent encoding
359         {
360             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
361             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
362             'info_dict': {
363                 'id': '06e53103ca9aa',
364                 'ext': 'flv',
365                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
366                 'description': 'No description',
367             },
368         },
369         # arte embed
370         {
371             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
372             'md5': '7653032cbb25bf6c80d80f217055fa43',
373             'info_dict': {
374                 'id': '048195-004_PLUS7-F',
375                 'ext': 'flv',
376                 'title': 'X:enius',
377                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
378                 'upload_date': '20140320',
379             },
380             'params': {
381                 'skip_download': 'Requires rtmpdump'
382             }
383         },
384         # Condé Nast embed
385         {
386             'url': 'http://www.wired.com/2014/04/honda-asimo/',
387             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
388             'info_dict': {
389                 'id': '53501be369702d3275860000',
390                 'ext': 'mp4',
391                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
392             }
393         },
394         # Dailymotion embed
395         {
396             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
397             'md5': '441aeeb82eb72c422c7f14ec533999cd',
398             'info_dict': {
399                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
400                 'ext': 'mp4',
401                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
402                 'uploader': 'Spi0n',
403             },
404             'add_ie': ['Dailymotion'],
405         },
406         # YouTube embed
407         {
408             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
409             'info_dict': {
410                 'id': 'FXRb4ykk4S0',
411                 'ext': 'mp4',
412                 'title': 'The NBL Auction 2014',
413                 'uploader': 'BADMINTON England',
414                 'uploader_id': 'BADMINTONEvents',
415                 'upload_date': '20140603',
416                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
417             },
418             'add_ie': ['Youtube'],
419             'params': {
420                 'skip_download': True,
421             }
422         },
423         # MTVServices embed
424         {
425             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
426             'md5': '35727f82f58c76d996fc188f9755b0d5',
427             'info_dict': {
428                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
429                 'ext': 'mp4',
430                 'title': 'Review',
431                 'description': 'Mario\'s life in the fast lane has never looked so good.',
432             },
433         },
434         # YouTube embed via <data-embed-url="">
435         {
436             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
437             'info_dict': {
438                 'id': '4vAffPZIT44',
439                 'ext': 'mp4',
440                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
441                 'uploader': 'Gameloft',
442                 'uploader_id': 'gameloft',
443                 'upload_date': '20140828',
444                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
445             },
446             'params': {
447                 'skip_download': True,
448             }
449         },
450         # Camtasia studio
451         {
452             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
453             'playlist': [{
454                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
455                 'info_dict': {
456                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
457                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
458                     'ext': 'flv',
459                     'duration': 2235.90,
460                 }
461             }, {
462                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
463                 'info_dict': {
464                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
465                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
466                     'ext': 'flv',
467                     'duration': 2235.93,
468                 }
469             }],
470             'info_dict': {
471                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
472             }
473         },
474         # Flowplayer
475         {
476             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
477             'md5': '9d65602bf31c6e20014319c7d07fba27',
478             'info_dict': {
479                 'id': '5123ea6d5e5a7',
480                 'ext': 'mp4',
481                 'age_limit': 18,
482                 'uploader': 'www.handjobhub.com',
483                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
484             }
485         },
486         # Multiple brightcove videos
487         # https://github.com/rg3/youtube-dl/issues/2283
488         {
489             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
490             'info_dict': {
491                 'id': 'always-never',
492                 'title': 'Always / Never - The New Yorker',
493             },
494             'playlist_count': 3,
495             'params': {
496                 'extract_flat': False,
497                 'skip_download': True,
498             }
499         },
500         # MLB embed
501         {
502             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
503             'md5': '96f09a37e44da40dd083e12d9a683327',
504             'info_dict': {
505                 'id': '33322633',
506                 'ext': 'mp4',
507                 'title': 'Ump changes call to ball',
508                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
509                 'duration': 48,
510                 'timestamp': 1401537900,
511                 'upload_date': '20140531',
512                 'thumbnail': 're:^https?://.*\.jpg$',
513             },
514         },
515         # Wistia embed
516         {
517             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
518             'md5': '8788b683c777a5cf25621eaf286d0c23',
519             'info_dict': {
520                 'id': '1cfaf6b7ea',
521                 'ext': 'mov',
522                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
523                 'duration': 643.0,
524                 'filesize': 182808282,
525                 'uploader': 'education-portal.com',
526             },
527         },
528         {
529             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
530             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
531             'info_dict': {
532                 'id': 'uxjb0lwrcz',
533                 'ext': 'mp4',
534                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
535                 'duration': 1715.0,
536                 'uploader': 'thoughtworks.wistia.com',
537             },
538         },
539         # Soundcloud embed
540         {
541             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
542             'info_dict': {
543                 'id': '174391317',
544                 'ext': 'mp3',
545                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
546                 'uploader': 'Sophos Security',
547                 'title': 'Chet Chat 171 - Oct 29, 2014',
548                 'upload_date': '20141029',
549             }
550         },
551         # Livestream embed
552         {
553             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
554             'info_dict': {
555                 'id': '67864563',
556                 'ext': 'flv',
557                 'upload_date': '20141112',
558                 'title': 'Rosetta #CometLanding webcast HL 10',
559             }
560         },
561         # LazyYT
562         {
563             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
564             'info_dict': {
565                 'id': '1986',
566                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
567             },
568             'playlist_mincount': 2,
569         },
570         # Cinchcast embed
571         {
572             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
573             'info_dict': {
574                 'id': '7141703',
575                 'ext': 'mp3',
576                 'upload_date': '20141126',
577                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
578             }
579         },
580         # Cinerama player
581         {
582             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
583             'info_dict': {
584                 'id': '730m_DandD_1901_512k',
585                 'ext': 'mp4',
586                 'uploader': 'www.abc.net.au',
587                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
588             }
589         },
590         # embedded viddler video
591         {
592             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
593             'info_dict': {
594                 'id': '4d03aad9',
595                 'ext': 'mp4',
596                 'uploader': 'deadspin',
597                 'title': 'WALL-TO-GORTAT',
598                 'timestamp': 1422285291,
599                 'upload_date': '20150126',
600             },
601             'add_ie': ['Viddler'],
602         },
603         # Libsyn embed
604         {
605             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
606             'info_dict': {
607                 'id': '3377616',
608                 'ext': 'mp3',
609                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
610                 'description': 'md5:601cb790edd05908957dae8aaa866465',
611                 'upload_date': '20150220',
612             },
613         },
614         # jwplayer YouTube
615         {
616             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
617             'info_dict': {
618                 'id': 'Mrj4DVp2zeA',
619                 'ext': 'mp4',
620                 'upload_date': '20150212',
621                 'uploader': 'The National Archives UK',
622                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
623                 'uploader_id': 'NationalArchives08',
624                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
625             },
626         },
627         # rtl.nl embed
628         {
629             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
630             'playlist_mincount': 5,
631             'info_dict': {
632                 'id': 'aanslagen-kopenhagen',
633                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
634             }
635         },
636         # Zapiks embed
637         {
638             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
639             'info_dict': {
640                 'id': '118046',
641                 'ext': 'mp4',
642                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
643             }
644         },
645         # Kaltura embed
646         {
647             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
648             'info_dict': {
649                 'id': '1_eergr3h1',
650                 'ext': 'mp4',
651                 'upload_date': '20150226',
652                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
653                 'timestamp': int,
654                 'title': 'John Carlson Postgame 2/25/15',
655             },
656         },
657         # Eagle.Platform embed (generic URL)
658         {
659             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
660             'info_dict': {
661                 'id': '227304',
662                 'ext': 'mp4',
663                 'title': 'Навальный вышел на свободу',
664                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
665                 'thumbnail': 're:^https?://.*\.jpg$',
666                 'duration': 87,
667                 'view_count': int,
668                 'age_limit': 0,
669             },
670         },
671         # ClipYou (Eagle.Platform) embed (custom URL)
672         {
673             'url': 'http://muz-tv.ru/play/7129/',
674             'info_dict': {
675                 'id': '12820',
676                 'ext': 'mp4',
677                 'title': "'O Sole Mio",
678                 'thumbnail': 're:^https?://.*\.jpg$',
679                 'duration': 216,
680                 'view_count': int,
681             },
682         },
683         # Pladform embed
684         {
685             'url': 'http://muz-tv.ru/kinozal/view/7400/',
686             'info_dict': {
687                 'id': '100183293',
688                 'ext': 'mp4',
689                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
690                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
691                 'thumbnail': 're:^https?://.*\.jpg$',
692                 'duration': 694,
693                 'age_limit': 0,
694             },
695         },
696         # Playwire embed
697         {
698             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
699             'info_dict': {
700                 'id': '3519514',
701                 'ext': 'mp4',
702                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
703                 'thumbnail': 're:^https?://.*\.png$',
704                 'duration': 45.115,
705             },
706         },
707         # 5min embed
708         {
709             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
710             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
711             'info_dict': {
712                 'id': '518726732',
713                 'ext': 'mp4',
714                 'title': 'Facebook Creates "On This Day" | Crunch Report',
715             },
716         },
717         # SVT embed
718         {
719             'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
720             'info_dict': {
721                 'id': '2900353',
722                 'ext': 'flv',
723                 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
724                 'duration': 27,
725                 'age_limit': 0,
726             },
727         },
728         # Crooks and Liars embed
729         {
730             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
731             'info_dict': {
732                 'id': '8RUoRhRi',
733                 'ext': 'mp4',
734                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
735                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
736                 'timestamp': 1428207000,
737                 'upload_date': '20150405',
738                 'uploader': 'Heather',
739             },
740         },
741         # Crooks and Liars external embed
742         {
743             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
744             'info_dict': {
745                 'id': 'MTE3MjUtMzQ2MzA',
746                 'ext': 'mp4',
747                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
748                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
749                 'timestamp': 1265032391,
750                 'upload_date': '20100201',
751                 'uploader': 'Heather',
752             },
753         },
754         # NBC Sports vplayer embed
755         {
756             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
757             'info_dict': {
758                 'id': 'ln7x1qSThw4k',
759                 'ext': 'flv',
760                 'title': "PFT Live: New leader in the 'new-look' defense",
761                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
762             },
763         },
764         # UDN embed
765         {
766             'url': 'http://www.udn.com/news/story/7314/822787',
767             'md5': 'fd2060e988c326991037b9aff9df21a6',
768             'info_dict': {
769                 'id': '300346',
770                 'ext': 'mp4',
771                 'title': '中一中男師變性 全校師生力挺',
772                 'thumbnail': 're:^https?://.*\.jpg$',
773             }
774         },
775         # Ooyala embed
776         {
777             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
778             'info_dict': {
779                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
780                 'ext': 'mp4',
781                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
782                 'title': 'This is what separates the Excel masters from the wannabes',
783             },
784             'params': {
785                 # m3u8 downloads
786                 'skip_download': True,
787             }
788         },
789         # Contains a SMIL manifest
790         {
791             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
792             'info_dict': {
793                 'id': 'file',
794                 'ext': 'flv',
795                 'title': '+ Football: Lottery Champions League Europe',
796                 'uploader': 'www.telewebion.com',
797             },
798             'params': {
799                 # rtmpe downloads
800                 'skip_download': True,
801             }
802         },
803         # Brightcove URL in single quotes
804         {
805             'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
806             'md5': '4ae374f1f8b91c889c4b9203c8c752af',
807             'info_dict': {
808                 'id': '4255764656001',
809                 'ext': 'mp4',
810                 'title': 'SN Presents: Russell Martin, World Citizen',
811                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
812                 'uploader': 'Rogers Sportsnet',
813             },
814         }
815     ]
816
817     def report_following_redirect(self, new_url):
818         """Report information extraction."""
819         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
820
821     def _extract_rss(self, url, video_id, doc):
822         playlist_title = doc.find('./channel/title').text
823         playlist_desc_el = doc.find('./channel/description')
824         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
825
826         entries = []
827         for it in doc.findall('./channel/item'):
828             next_url = xpath_text(it, 'link', fatal=False)
829             if not next_url:
830                 enclosure_nodes = it.findall('./enclosure')
831                 for e in enclosure_nodes:
832                     next_url = e.attrib.get('url')
833                     if next_url:
834                         break
835
836             if not next_url:
837                 continue
838
839             entries.append({
840                 '_type': 'url',
841                 'url': next_url,
842                 'title': it.find('title').text,
843             })
844
845         return {
846             '_type': 'playlist',
847             'id': url,
848             'title': playlist_title,
849             'description': playlist_desc,
850             'entries': entries,
851         }
852
853     def _extract_camtasia(self, url, video_id, webpage):
854         """ Returns None if no camtasia video can be found. """
855
856         camtasia_cfg = self._search_regex(
857             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
858             webpage, 'camtasia configuration file', default=None)
859         if camtasia_cfg is None:
860             return None
861
862         title = self._html_search_meta('DC.title', webpage, fatal=True)
863
864         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
865         camtasia_cfg = self._download_xml(
866             camtasia_url, video_id,
867             note='Downloading camtasia configuration',
868             errnote='Failed to download camtasia configuration')
869         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
870
871         entries = []
872         for n in fileset_node.getchildren():
873             url_n = n.find('./uri')
874             if url_n is None:
875                 continue
876
877             entries.append({
878                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
879                 'title': '%s - %s' % (title, n.tag),
880                 'url': compat_urlparse.urljoin(url, url_n.text),
881                 'duration': float_or_none(n.find('./duration').text),
882             })
883
884         return {
885             '_type': 'playlist',
886             'entries': entries,
887             'title': title,
888         }
889
890     def _real_extract(self, url):
891         if url.startswith('//'):
892             return {
893                 '_type': 'url',
894                 'url': self.http_scheme() + url,
895             }
896
897         parsed_url = compat_urlparse.urlparse(url)
898         if not parsed_url.scheme:
899             default_search = self._downloader.params.get('default_search')
900             if default_search is None:
901                 default_search = 'fixup_error'
902
903             if default_search in ('auto', 'auto_warning', 'fixup_error'):
904                 if '/' in url:
905                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
906                     return self.url_result('http://' + url)
907                 elif default_search != 'fixup_error':
908                     if default_search == 'auto_warning':
909                         if re.match(r'^(?:url|URL)$', url):
910                             raise ExtractorError(
911                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
912                                 expected=True)
913                         else:
914                             self._downloader.report_warning(
915                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
916                     return self.url_result('ytsearch:' + url)
917
918             if default_search in ('error', 'fixup_error'):
919                 raise ExtractorError(
920                     '%r is not a valid URL. '
921                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
922                     % (url, url), expected=True)
923             else:
924                 if ':' not in default_search:
925                     default_search += ':'
926                 return self.url_result(default_search + url)
927
928         url, smuggled_data = unsmuggle_url(url)
929         force_videoid = None
930         is_intentional = smuggled_data and smuggled_data.get('to_generic')
931         if smuggled_data and 'force_videoid' in smuggled_data:
932             force_videoid = smuggled_data['force_videoid']
933             video_id = force_videoid
934         else:
935             video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
936
937         self.to_screen('%s: Requesting header' % video_id)
938
939         head_req = HEADRequest(url)
940         head_response = self._request_webpage(
941             head_req, video_id,
942             note=False, errnote='Could not send HEAD request to %s' % url,
943             fatal=False)
944
945         if head_response is not False:
946             # Check for redirect
947             new_url = head_response.geturl()
948             if url != new_url:
949                 self.report_following_redirect(new_url)
950                 if force_videoid:
951                     new_url = smuggle_url(
952                         new_url, {'force_videoid': force_videoid})
953                 return self.url_result(new_url)
954
955         full_response = None
956         if head_response is False:
957             request = compat_urllib_request.Request(url)
958             request.add_header('Accept-Encoding', '*')
959             full_response = self._request_webpage(request, video_id)
960             head_response = full_response
961
962         # Check for direct link to a video
963         content_type = head_response.headers.get('Content-Type', '')
964         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
965         if m:
966             upload_date = unified_strdate(
967                 head_response.headers.get('Last-Modified'))
968             return {
969                 'id': video_id,
970                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
971                 'direct': True,
972                 'formats': [{
973                     'format_id': m.group('format_id'),
974                     'url': url,
975                     'vcodec': 'none' if m.group('type') == 'audio' else None
976                 }],
977                 'upload_date': upload_date,
978             }
979
980         if not self._downloader.params.get('test', False) and not is_intentional:
981             force = self._downloader.params.get('force_generic_extractor', False)
982             self._downloader.report_warning(
983                 '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
984
985         if not full_response:
986             request = compat_urllib_request.Request(url)
987             # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
988             # making it impossible to download only chunk of the file (yet we need only 512kB to
989             # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
990             # that will always result in downloading the whole file that is not desirable.
991             # Therefore for extraction pass we have to override Accept-Encoding to any in order
992             # to accept raw bytes and being able to download only a chunk.
993             # It may probably better to solve this by checking Content-Type for application/octet-stream
994             # after HEAD request finishes, but not sure if we can rely on this.
995             request.add_header('Accept-Encoding', '*')
996             full_response = self._request_webpage(request, video_id)
997
998         # Maybe it's a direct link to a video?
999         # Be careful not to download the whole thing!
1000         first_bytes = full_response.read(512)
1001         if not is_html(first_bytes):
1002             self._downloader.report_warning(
1003                 'URL could be a direct video link, returning it as such.')
1004             upload_date = unified_strdate(
1005                 head_response.headers.get('Last-Modified'))
1006             return {
1007                 'id': video_id,
1008                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1009                 'direct': True,
1010                 'url': url,
1011                 'upload_date': upload_date,
1012             }
1013
1014         webpage = self._webpage_read_content(
1015             full_response, url, video_id, prefix=first_bytes)
1016
1017         self.report_extraction(video_id)
1018
1019         # Is it an RSS feed?
1020         try:
1021             doc = parse_xml(webpage)
1022             if doc.tag == 'rss':
1023                 return self._extract_rss(url, video_id, doc)
1024         except compat_xml_parse_error:
1025             pass
1026
1027         # Is it a Camtasia project?
1028         camtasia_res = self._extract_camtasia(url, video_id, webpage)
1029         if camtasia_res is not None:
1030             return camtasia_res
1031
1032         # Sometimes embedded video player is hidden behind percent encoding
1033         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1034         # Unescaping the whole page allows to handle those cases in a generic way
1035         webpage = compat_urllib_parse.unquote(webpage)
1036
1037         # it's tempting to parse this further, but you would
1038         # have to take into account all the variations like
1039         #   Video Title - Site Name
1040         #   Site Name | Video Title
1041         #   Video Title - Tagline | Site Name
1042         # and so on and so forth; it's just not practical
1043         video_title = self._html_search_regex(
1044             r'(?s)<title>(.*?)</title>', webpage, 'video title',
1045             default='video')
1046
1047         # Try to detect age limit automatically
1048         age_limit = self._rta_search(webpage)
1049         # And then there are the jokers who advertise that they use RTA,
1050         # but actually don't.
1051         AGE_LIMIT_MARKERS = [
1052             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1053         ]
1054         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1055             age_limit = 18
1056
1057         # video uploader is domain name
1058         video_uploader = self._search_regex(
1059             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1060
1061         # Helper method
1062         def _playlist_from_matches(matches, getter=None, ie=None):
1063             urlrs = orderedSet(
1064                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1065                 for m in matches)
1066             return self.playlist_result(
1067                 urlrs, playlist_id=video_id, playlist_title=video_title)
1068
1069         # Look for BrightCove:
1070         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1071         if bc_urls:
1072             self.to_screen('Brightcove video detected.')
1073             entries = [{
1074                 '_type': 'url',
1075                 'url': smuggle_url(bc_url, {'Referer': url}),
1076                 'ie_key': 'Brightcove'
1077             } for bc_url in bc_urls]
1078
1079             return {
1080                 '_type': 'playlist',
1081                 'title': video_title,
1082                 'id': video_id,
1083                 'entries': entries,
1084             }
1085
1086         # Look for embedded rtl.nl player
1087         matches = re.findall(
1088             r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1089             webpage)
1090         if matches:
1091             return _playlist_from_matches(matches, ie='RtlNl')
1092
1093         # Look for embedded (iframe) Vimeo player
1094         mobj = re.search(
1095             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1096         if mobj:
1097             player_url = unescapeHTML(mobj.group('url'))
1098             surl = smuggle_url(player_url, {'Referer': url})
1099             return self.url_result(surl)
1100         # Look for embedded (swf embed) Vimeo player
1101         mobj = re.search(
1102             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1103         if mobj:
1104             return self.url_result(mobj.group(1))
1105
1106         # Look for embedded YouTube player
1107         matches = re.findall(r'''(?x)
1108             (?:
1109                 <iframe[^>]+?src=|
1110                 data-video-url=|
1111                 <embed[^>]+?src=|
1112                 embedSWF\(?:\s*|
1113                 new\s+SWFObject\(
1114             )
1115             (["\'])
1116                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1117                 (?:embed|v|p)/.+?)
1118             \1''', webpage)
1119         if matches:
1120             return _playlist_from_matches(
1121                 matches, lambda m: unescapeHTML(m[1]))
1122
1123         # Look for lazyYT YouTube embed
1124         matches = re.findall(
1125             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1126         if matches:
1127             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1128
1129         # Look for embedded Dailymotion player
1130         matches = re.findall(
1131             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1132         if matches:
1133             return _playlist_from_matches(
1134                 matches, lambda m: unescapeHTML(m[1]))
1135
1136         # Look for embedded Dailymotion playlist player (#3822)
1137         m = re.search(
1138             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1139         if m:
1140             playlists = re.findall(
1141                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1142             if playlists:
1143                 return _playlist_from_matches(
1144                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1145
1146         # Look for embedded Wistia player
1147         match = re.search(
1148             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1149         if match:
1150             embed_url = self._proto_relative_url(
1151                 unescapeHTML(match.group('url')))
1152             return {
1153                 '_type': 'url_transparent',
1154                 'url': embed_url,
1155                 'ie_key': 'Wistia',
1156                 'uploader': video_uploader,
1157                 'title': video_title,
1158                 'id': video_id,
1159             }
1160
1161         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1162         if match:
1163             return {
1164                 '_type': 'url_transparent',
1165                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1166                 'ie_key': 'Wistia',
1167                 'uploader': video_uploader,
1168                 'title': video_title,
1169                 'id': match.group('id')
1170             }
1171
1172         # Look for embedded blip.tv player
1173         bliptv_url = BlipTVIE._extract_url(webpage)
1174         if bliptv_url:
1175             return self.url_result(bliptv_url, 'BlipTV')
1176
1177         # Look for SVT player
1178         svt_url = SVTIE._extract_url(webpage)
1179         if svt_url:
1180             return self.url_result(svt_url, 'SVT')
1181
1182         # Look for embedded condenast player
1183         matches = re.findall(
1184             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1185             webpage)
1186         if matches:
1187             return {
1188                 '_type': 'playlist',
1189                 'entries': [{
1190                     '_type': 'url',
1191                     'ie_key': 'CondeNast',
1192                     'url': ma,
1193                 } for ma in matches],
1194                 'title': video_title,
1195                 'id': video_id,
1196             }
1197
1198         # Look for Bandcamp pages with custom domain
1199         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1200         if mobj is not None:
1201             burl = unescapeHTML(mobj.group(1))
1202             # Don't set the extractor because it can be a track url or an album
1203             return self.url_result(burl)
1204
1205         # Look for embedded Vevo player
1206         mobj = re.search(
1207             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1208         if mobj is not None:
1209             return self.url_result(mobj.group('url'))
1210
1211         # Look for embedded Viddler player
1212         mobj = re.search(
1213             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1214             webpage)
1215         if mobj is not None:
1216             return self.url_result(mobj.group('url'))
1217
1218         # Look for NYTimes player
1219         mobj = re.search(
1220             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1221             webpage)
1222         if mobj is not None:
1223             return self.url_result(mobj.group('url'))
1224
1225         # Look for Libsyn player
1226         mobj = re.search(
1227             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1228         if mobj is not None:
1229             return self.url_result(mobj.group('url'))
1230
1231         # Look for Ooyala videos
1232         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1233                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1234                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1235                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1236         if mobj is not None:
1237             return OoyalaIE._build_url_result(mobj.group('ec'))
1238
1239         # Look for multiple Ooyala embeds on SBN network websites
1240         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1241         if mobj is not None:
1242             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1243             if embeds:
1244                 return _playlist_from_matches(
1245                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1246
1247         # Look for Aparat videos
1248         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1249         if mobj is not None:
1250             return self.url_result(mobj.group(1), 'Aparat')
1251
1252         # Look for MPORA videos
1253         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1254         if mobj is not None:
1255             return self.url_result(mobj.group(1), 'Mpora')
1256
1257         # Look for embedded NovaMov-based player
1258         mobj = re.search(
1259             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1260                     (?P<url>http://(?:(?:embed|www)\.)?
1261                         (?:novamov\.com|
1262                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1263                            videoweed\.(?:es|com)|
1264                            movshare\.(?:net|sx|ag)|
1265                            divxstage\.(?:eu|net|ch|co|at|ag))
1266                         /embed\.php.+?)\1''', webpage)
1267         if mobj is not None:
1268             return self.url_result(mobj.group('url'))
1269
1270         # Look for embedded Facebook player
1271         mobj = re.search(
1272             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1273         if mobj is not None:
1274             return self.url_result(mobj.group('url'), 'Facebook')
1275
1276         # Look for embedded VK player
1277         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1278         if mobj is not None:
1279             return self.url_result(mobj.group('url'), 'VK')
1280
1281         # Look for embedded ivi player
1282         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1283         if mobj is not None:
1284             return self.url_result(mobj.group('url'), 'Ivi')
1285
1286         # Look for embedded Huffington Post player
1287         mobj = re.search(
1288             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1289         if mobj is not None:
1290             return self.url_result(mobj.group('url'), 'HuffPost')
1291
1292         # Look for embed.ly
1293         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1294         if mobj is not None:
1295             return self.url_result(mobj.group('url'))
1296         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1297         if mobj is not None:
1298             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1299
1300         # Look for funnyordie embed
1301         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1302         if matches:
1303             return _playlist_from_matches(
1304                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1305
1306         # Look for BBC iPlayer embed
1307         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1308         if matches:
1309             return _playlist_from_matches(matches, ie='BBCCoUk')
1310
1311         # Look for embedded RUTV player
1312         rutv_url = RUTVIE._extract_url(webpage)
1313         if rutv_url:
1314             return self.url_result(rutv_url, 'RUTV')
1315
1316         # Look for embedded TVC player
1317         tvc_url = TVCIE._extract_url(webpage)
1318         if tvc_url:
1319             return self.url_result(tvc_url, 'TVC')
1320
1321         # Look for embedded SportBox player
1322         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1323         if sportbox_urls:
1324             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1325
1326         # Look for embedded Tvigle player
1327         mobj = re.search(
1328             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
1329         if mobj is not None:
1330             return self.url_result(mobj.group('url'), 'Tvigle')
1331
1332         # Look for embedded TED player
1333         mobj = re.search(
1334             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1335         if mobj is not None:
1336             return self.url_result(mobj.group('url'), 'TED')
1337
1338         # Look for embedded Ustream videos
1339         mobj = re.search(
1340             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1341         if mobj is not None:
1342             return self.url_result(mobj.group('url'), 'Ustream')
1343
1344         # Look for embedded arte.tv player
1345         mobj = re.search(
1346             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1347             webpage)
1348         if mobj is not None:
1349             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1350
1351         # Look for embedded smotri.com player
1352         smotri_url = SmotriIE._extract_url(webpage)
1353         if smotri_url:
1354             return self.url_result(smotri_url, 'Smotri')
1355
1356         # Look for embeded soundcloud player
1357         mobj = re.search(
1358             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1359             webpage)
1360         if mobj is not None:
1361             url = unescapeHTML(mobj.group('url'))
1362             return self.url_result(url)
1363
1364         # Look for embedded vulture.com player
1365         mobj = re.search(
1366             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1367             webpage)
1368         if mobj is not None:
1369             url = unescapeHTML(mobj.group('url'))
1370             return self.url_result(url, ie='Vulture')
1371
1372         # Look for embedded mtvservices player
1373         mobj = re.search(
1374             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1375             webpage)
1376         if mobj is not None:
1377             url = unescapeHTML(mobj.group('url'))
1378             return self.url_result(url, ie='MTVServicesEmbedded')
1379
1380         # Look for embedded yahoo player
1381         mobj = re.search(
1382             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1383             webpage)
1384         if mobj is not None:
1385             return self.url_result(mobj.group('url'), 'Yahoo')
1386
1387         # Look for embedded sbs.com.au player
1388         mobj = re.search(
1389             r'''(?x)
1390             (?:
1391                 <meta\s+property="og:video"\s+content=|
1392                 <iframe[^>]+?src=
1393             )
1394             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1395             webpage)
1396         if mobj is not None:
1397             return self.url_result(mobj.group('url'), 'SBS')
1398
1399         # Look for embedded Cinchcast player
1400         mobj = re.search(
1401             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1402             webpage)
1403         if mobj is not None:
1404             return self.url_result(mobj.group('url'), 'Cinchcast')
1405
1406         mobj = re.search(
1407             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1408             webpage)
1409         if not mobj:
1410             mobj = re.search(
1411                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1412                 webpage)
1413         if mobj is not None:
1414             return self.url_result(mobj.group('url'), 'MLB')
1415
1416         mobj = re.search(
1417             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1418             webpage)
1419         if mobj is not None:
1420             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1421
1422         mobj = re.search(
1423             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1424             webpage)
1425         if mobj is not None:
1426             return self.url_result(mobj.group('url'), 'Livestream')
1427
1428         # Look for Zapiks embed
1429         mobj = re.search(
1430             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1431         if mobj is not None:
1432             return self.url_result(mobj.group('url'), 'Zapiks')
1433
1434         # Look for Kaltura embeds
1435         mobj = re.search(
1436             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1437         if mobj is not None:
1438             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1439
1440         # Look for Eagle.Platform embeds
1441         mobj = re.search(
1442             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1443         if mobj is not None:
1444             return self.url_result(mobj.group('url'), 'EaglePlatform')
1445
1446         # Look for ClipYou (uses Eagle.Platform) embeds
1447         mobj = re.search(
1448             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1449         if mobj is not None:
1450             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1451
1452         # Look for Pladform embeds
1453         mobj = re.search(
1454             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1455         if mobj is not None:
1456             return self.url_result(mobj.group('url'), 'Pladform')
1457
1458         # Look for Playwire embeds
1459         mobj = re.search(
1460             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1461         if mobj is not None:
1462             return self.url_result(mobj.group('url'))
1463
1464         # Look for 5min embeds
1465         mobj = re.search(
1466             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1467         if mobj is not None:
1468             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1469
1470         # Look for Crooks and Liars embeds
1471         mobj = re.search(
1472             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1473         if mobj is not None:
1474             return self.url_result(mobj.group('url'))
1475
1476         # Look for NBC Sports VPlayer embeds
1477         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1478         if nbc_sports_url:
1479             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1480
1481         # Look for UDN embeds
1482         mobj = re.search(
1483             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1484         if mobj is not None:
1485             return self.url_result(
1486                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1487
1488         # Look for Senate ISVP iframe
1489         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1490         if senate_isvp_url:
1491             return self.url_result(senate_isvp_url, 'SenateISVP')
1492
1493         def check_video(vurl):
1494             if YoutubeIE.suitable(vurl):
1495                 return True
1496             vpath = compat_urlparse.urlparse(vurl).path
1497             vext = determine_ext(vpath)
1498             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1499
1500         def filter_video(urls):
1501             return list(filter(check_video, urls))
1502
1503         # Start with something easy: JW Player in SWFObject
1504         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1505         if not found:
1506             # Look for gorilla-vid style embedding
1507             found = filter_video(re.findall(r'''(?sx)
1508                 (?:
1509                     jw_plugins|
1510                     JWPlayerOptions|
1511                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1512                 )
1513                 .*?
1514                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1515         if not found:
1516             # Broaden the search a little bit
1517             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1518         if not found:
1519             # Broaden the findall a little bit: JWPlayer JS loader
1520             found = filter_video(re.findall(
1521                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1522         if not found:
1523             # Flow player
1524             found = filter_video(re.findall(r'''(?xs)
1525                 flowplayer\("[^"]+",\s*
1526                     \{[^}]+?\}\s*,
1527                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1528                         ["']?url["']?\s*:\s*["']([^"']+)["']
1529             ''', webpage))
1530         if not found:
1531             # Cinerama player
1532             found = re.findall(
1533                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1534         if not found:
1535             # Try to find twitter cards info
1536             found = filter_video(re.findall(
1537                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1538         if not found:
1539             # We look for Open Graph info:
1540             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1541             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1542             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1543             if m_video_type is not None:
1544                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1545         if not found:
1546             # HTML5 video
1547             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1548         if not found:
1549             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1550             found = re.search(
1551                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1552                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1553                 webpage)
1554             if not found:
1555                 # Look also in Refresh HTTP header
1556                 refresh_header = head_response.headers.get('Refresh')
1557                 if refresh_header:
1558                     found = re.search(REDIRECT_REGEX, refresh_header)
1559             if found:
1560                 new_url = compat_urlparse.urljoin(url, found.group(1))
1561                 self.report_following_redirect(new_url)
1562                 return {
1563                     '_type': 'url',
1564                     'url': new_url,
1565                 }
1566         if not found:
1567             raise UnsupportedError(url)
1568
1569         entries = []
1570         for video_url in found:
1571             video_url = compat_urlparse.urljoin(url, video_url)
1572             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1573
1574             # Sometimes, jwplayer extraction will result in a YouTube URL
1575             if YoutubeIE.suitable(video_url):
1576                 entries.append(self.url_result(video_url, 'Youtube'))
1577                 continue
1578
1579             # here's a fun little line of code for you:
1580             video_id = os.path.splitext(video_id)[0]
1581
1582             if determine_ext(video_url) == 'smil':
1583                 entries.append({
1584                     'id': video_id,
1585                     'formats': self._extract_smil_formats(video_url, video_id),
1586                     'uploader': video_uploader,
1587                     'title': video_title,
1588                     'age_limit': age_limit,
1589                 })
1590             else:
1591                 entries.append({
1592                     'id': video_id,
1593                     'url': video_url,
1594                     'uploader': video_uploader,
1595                     'title': video_title,
1596                     'age_limit': age_limit,
1597                 })
1598
1599         if len(entries) == 1:
1600             return entries[0]
1601         else:
1602             for num, e in enumerate(entries, start=1):
1603                 # 'url' results don't have a title
1604                 if e.get('title') is not None:
1605                     e['title'] = '%s (%d)' % (e['title'], num)
1606             return {
1607                 '_type': 'playlist',
1608                 'entries': entries,
1609             }