]> git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/generic.py
[SpankBang] Use python2.6 compatible string formatting spec
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urlparse,
13     compat_xml_parse_error,
14 )
15 from ..utils import (
16     determine_ext,
17     ExtractorError,
18     float_or_none,
19     HEADRequest,
20     is_html,
21     orderedSet,
22     parse_xml,
23     smuggle_url,
24     unescapeHTML,
25     unified_strdate,
26     unsmuggle_url,
27     UnsupportedError,
28     url_basename,
29     xpath_text,
30 )
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
37
38
39 class GenericIE(InfoExtractor):
40     IE_DESC = 'Generic downloader that works on some sites'
41     _VALID_URL = r'.*'
42     IE_NAME = 'generic'
43     _TESTS = [
44         {
45             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
46             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
47             'info_dict': {
48                 'id': '13601338388002',
49                 'ext': 'mp4',
50                 'uploader': 'www.hodiho.fr',
51                 'title': 'R\u00e9gis plante sa Jeep',
52             }
53         },
54         # bandcamp page with custom domain
55         {
56             'add_ie': ['Bandcamp'],
57             'url': 'http://bronyrock.com/track/the-pony-mash',
58             'info_dict': {
59                 'id': '3235767654',
60                 'ext': 'mp3',
61                 'title': 'The Pony Mash',
62                 'uploader': 'M_Pallante',
63             },
64             'skip': 'There is a limit of 200 free downloads / month for the test song',
65         },
66         # embedded brightcove video
67         # it also tests brightcove videos that need to set the 'Referer' in the
68         # http requests
69         {
70             'add_ie': ['Brightcove'],
71             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
72             'info_dict': {
73                 'id': '2765128793001',
74                 'ext': 'mp4',
75                 'title': 'Le cours de bourse : l’analyse technique',
76                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
77                 'uploader': 'BFM BUSINESS',
78             },
79             'params': {
80                 'skip_download': True,
81             },
82         },
83         {
84             # https://github.com/rg3/youtube-dl/issues/2253
85             'url': 'http://bcove.me/i6nfkrc3',
86             'md5': '0ba9446db037002366bab3b3eb30c88c',
87             'info_dict': {
88                 'id': '3101154703001',
89                 'ext': 'mp4',
90                 'title': 'Still no power',
91                 'uploader': 'thestar.com',
92                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
93             },
94             'add_ie': ['Brightcove'],
95         },
96         {
97             'url': 'http://www.championat.com/video/football/v/87/87499.html',
98             'md5': 'fb973ecf6e4a78a67453647444222983',
99             'info_dict': {
100                 'id': '3414141473001',
101                 'ext': 'mp4',
102                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
103                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
104                 'uploader': 'Championat',
105             },
106         },
107         {
108             # https://github.com/rg3/youtube-dl/issues/3541
109             'add_ie': ['Brightcove'],
110             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
111             'info_dict': {
112                 'id': '3866516442001',
113                 'ext': 'mp4',
114                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
115                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
116                 'uploader': 'SBS Broadcasting',
117             },
118             'skip': 'Restricted to Netherlands',
119             'params': {
120                 'skip_download': True,  # m3u8 download
121             },
122         },
123         # Direct link to a video
124         {
125             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
126             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
127             'info_dict': {
128                 'id': 'trailer',
129                 'ext': 'mp4',
130                 'title': 'trailer',
131                 'upload_date': '20100513',
132             }
133         },
134         # ooyala video
135         {
136             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
137             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
138             'info_dict': {
139                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
140                 'ext': 'mp4',
141                 'title': '2cc213299525360.mov',  # that's what we get
142             },
143             'add_ie': ['Ooyala'],
144         },
145         # multiple ooyala embeds on SBN network websites
146         {
147             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
148             'info_dict': {
149                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
150                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
151             },
152             'playlist_mincount': 3,
153             'params': {
154                 'skip_download': True,
155             },
156             'add_ie': ['Ooyala'],
157         },
158         # google redirect
159         {
160             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
161             'info_dict': {
162                 'id': 'cmQHVoWB5FY',
163                 'ext': 'mp4',
164                 'upload_date': '20130224',
165                 'uploader_id': 'TheVerge',
166                 'description': 're:^Chris Ziegler takes a look at the\.*',
167                 'uploader': 'The Verge',
168                 'title': 'First Firefox OS phones side-by-side',
169             },
170             'params': {
171                 'skip_download': False,
172             }
173         },
174         # embed.ly video
175         {
176             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
177             'info_dict': {
178                 'id': '9ODmcdjQcHQ',
179                 'ext': 'mp4',
180                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
181                 'upload_date': '20140225',
182                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
183                 'uploader': 'Tested',
184                 'uploader_id': 'testedcom',
185             },
186             # No need to test YoutubeIE here
187             'params': {
188                 'skip_download': True,
189             },
190         },
191         # funnyordie embed
192         {
193             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
194             'info_dict': {
195                 'id': '18e820ec3f',
196                 'ext': 'mp4',
197                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
198                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
199             },
200         },
201         # BBC iPlayer embeds
202         {
203             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
204             'info_dict': {
205                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
206             },
207             'playlist_mincount': 18,
208         },
209         # RUTV embed
210         {
211             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
212             'info_dict': {
213                 'id': '776940',
214                 'ext': 'mp4',
215                 'title': 'Охотское море стало целиком российским',
216                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
217             },
218             'params': {
219                 # m3u8 download
220                 'skip_download': True,
221             },
222         },
223         # Embedded TED video
224         {
225             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
226             'md5': '65fdff94098e4a607385a60c5177c638',
227             'info_dict': {
228                 'id': '1969',
229                 'ext': 'mp4',
230                 'title': 'Hidden miracles of the natural world',
231                 'uploader': 'Louie Schwartzberg',
232                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
233             }
234         },
235         # Embedded Ustream video
236         {
237             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
238             'md5': '27b99cdb639c9b12a79bca876a073417',
239             'info_dict': {
240                 'id': '45734260',
241                 'ext': 'flv',
242                 'uploader': 'AU SPA:  The NSA and Privacy',
243                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
244             }
245         },
246         # nowvideo embed hidden behind percent encoding
247         {
248             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
249             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
250             'info_dict': {
251                 'id': '06e53103ca9aa',
252                 'ext': 'flv',
253                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
254                 'description': 'No description',
255             },
256         },
257         # arte embed
258         {
259             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
260             'md5': '7653032cbb25bf6c80d80f217055fa43',
261             'info_dict': {
262                 'id': '048195-004_PLUS7-F',
263                 'ext': 'flv',
264                 'title': 'X:enius',
265                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
266                 'upload_date': '20140320',
267             },
268             'params': {
269                 'skip_download': 'Requires rtmpdump'
270             }
271         },
272         # Condé Nast embed
273         {
274             'url': 'http://www.wired.com/2014/04/honda-asimo/',
275             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
276             'info_dict': {
277                 'id': '53501be369702d3275860000',
278                 'ext': 'mp4',
279                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
280             }
281         },
282         # Dailymotion embed
283         {
284             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
285             'md5': '441aeeb82eb72c422c7f14ec533999cd',
286             'info_dict': {
287                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
288                 'ext': 'mp4',
289                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
290                 'uploader': 'Spi0n',
291             },
292             'add_ie': ['Dailymotion'],
293         },
294         # YouTube embed
295         {
296             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
297             'info_dict': {
298                 'id': 'FXRb4ykk4S0',
299                 'ext': 'mp4',
300                 'title': 'The NBL Auction 2014',
301                 'uploader': 'BADMINTON England',
302                 'uploader_id': 'BADMINTONEvents',
303                 'upload_date': '20140603',
304                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
305             },
306             'add_ie': ['Youtube'],
307             'params': {
308                 'skip_download': True,
309             }
310         },
311         # MTVServices embed
312         {
313             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
314             'md5': '35727f82f58c76d996fc188f9755b0d5',
315             'info_dict': {
316                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
317                 'ext': 'mp4',
318                 'title': 'Review',
319                 'description': 'Mario\'s life in the fast lane has never looked so good.',
320             },
321         },
322         # YouTube embed via <data-embed-url="">
323         {
324             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
325             'info_dict': {
326                 'id': '4vAffPZIT44',
327                 'ext': 'mp4',
328                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
329                 'uploader': 'Gameloft',
330                 'uploader_id': 'gameloft',
331                 'upload_date': '20140828',
332                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
333             },
334             'params': {
335                 'skip_download': True,
336             }
337         },
338         # Camtasia studio
339         {
340             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
341             'playlist': [{
342                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
343                 'info_dict': {
344                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
345                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
346                     'ext': 'flv',
347                     'duration': 2235.90,
348                 }
349             }, {
350                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
351                 'info_dict': {
352                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
353                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
354                     'ext': 'flv',
355                     'duration': 2235.93,
356                 }
357             }],
358             'info_dict': {
359                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
360             }
361         },
362         # Flowplayer
363         {
364             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
365             'md5': '9d65602bf31c6e20014319c7d07fba27',
366             'info_dict': {
367                 'id': '5123ea6d5e5a7',
368                 'ext': 'mp4',
369                 'age_limit': 18,
370                 'uploader': 'www.handjobhub.com',
371                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
372             }
373         },
374         # RSS feed
375         {
376             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
377             'info_dict': {
378                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
379                 'title': 'Zero Punctuation',
380                 'description': 're:.*groundbreaking video review series.*'
381             },
382             'playlist_mincount': 11,
383         },
384         # Multiple brightcove videos
385         # https://github.com/rg3/youtube-dl/issues/2283
386         {
387             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
388             'info_dict': {
389                 'id': 'always-never',
390                 'title': 'Always / Never - The New Yorker',
391             },
392             'playlist_count': 3,
393             'params': {
394                 'extract_flat': False,
395                 'skip_download': True,
396             }
397         },
398         # MLB embed
399         {
400             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
401             'md5': '96f09a37e44da40dd083e12d9a683327',
402             'info_dict': {
403                 'id': '33322633',
404                 'ext': 'mp4',
405                 'title': 'Ump changes call to ball',
406                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
407                 'duration': 48,
408                 'timestamp': 1401537900,
409                 'upload_date': '20140531',
410                 'thumbnail': 're:^https?://.*\.jpg$',
411             },
412         },
413         # Wistia embed
414         {
415             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
416             'md5': '8788b683c777a5cf25621eaf286d0c23',
417             'info_dict': {
418                 'id': '1cfaf6b7ea',
419                 'ext': 'mov',
420                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
421                 'duration': 643.0,
422                 'filesize': 182808282,
423                 'uploader': 'education-portal.com',
424             },
425         },
426         {
427             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
428             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
429             'info_dict': {
430                 'id': 'uxjb0lwrcz',
431                 'ext': 'mp4',
432                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
433                 'duration': 1715.0,
434                 'uploader': 'thoughtworks.wistia.com',
435             },
436         },
437         # Direct download with broken HEAD
438         {
439             'url': 'http://ai-radio.org:8000/radio.opus',
440             'info_dict': {
441                 'id': 'radio',
442                 'ext': 'opus',
443                 'title': 'radio',
444             },
445             'params': {
446                 'skip_download': True,  # infinite live stream
447             },
448             'expected_warnings': [
449                 r'501.*Not Implemented'
450             ],
451         },
452         # Soundcloud embed
453         {
454             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
455             'info_dict': {
456                 'id': '174391317',
457                 'ext': 'mp3',
458                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
459                 'uploader': 'Sophos Security',
460                 'title': 'Chet Chat 171 - Oct 29, 2014',
461                 'upload_date': '20141029',
462             }
463         },
464         # Livestream embed
465         {
466             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
467             'info_dict': {
468                 'id': '67864563',
469                 'ext': 'flv',
470                 'upload_date': '20141112',
471                 'title': 'Rosetta #CometLanding webcast HL 10',
472             }
473         },
474         # LazyYT
475         {
476             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
477             'info_dict': {
478                 'id': '1986',
479                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
480             },
481             'playlist_mincount': 2,
482         },
483         # Direct link with incorrect MIME type
484         {
485             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
486             'md5': '4ccbebe5f36706d85221f204d7eb5913',
487             'info_dict': {
488                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
489                 'id': '5_Lennart_Poettering_-_Systemd',
490                 'ext': 'webm',
491                 'title': '5_Lennart_Poettering_-_Systemd',
492                 'upload_date': '20141120',
493             },
494             'expected_warnings': [
495                 'URL could be a direct video link, returning it as such.'
496             ]
497         },
498         # Cinchcast embed
499         {
500             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
501             'info_dict': {
502                 'id': '7141703',
503                 'ext': 'mp3',
504                 'upload_date': '20141126',
505                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
506             }
507         },
508         # Cinerama player
509         {
510             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
511             'info_dict': {
512                 'id': '730m_DandD_1901_512k',
513                 'ext': 'mp4',
514                 'uploader': 'www.abc.net.au',
515                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
516             }
517         },
518         # embedded viddler video
519         {
520             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
521             'info_dict': {
522                 'id': '4d03aad9',
523                 'ext': 'mp4',
524                 'uploader': 'deadspin',
525                 'title': 'WALL-TO-GORTAT',
526                 'timestamp': 1422285291,
527                 'upload_date': '20150126',
528             },
529             'add_ie': ['Viddler'],
530         },
531         # Libsyn embed
532         {
533             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
534             'info_dict': {
535                 'id': '3377616',
536                 'ext': 'mp3',
537                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
538                 'description': 'md5:601cb790edd05908957dae8aaa866465',
539                 'upload_date': '20150220',
540             },
541         },
542         # jwplayer YouTube
543         {
544             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
545             'info_dict': {
546                 'id': 'Mrj4DVp2zeA',
547                 'ext': 'mp4',
548                 'upload_date': '20150212',
549                 'uploader': 'The National Archives UK',
550                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
551                 'uploader_id': 'NationalArchives08',
552                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
553             },
554         },
555         # rtl.nl embed
556         {
557             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
558             'playlist_mincount': 5,
559             'info_dict': {
560                 'id': 'aanslagen-kopenhagen',
561                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
562             }
563         },
564         # Zapiks embed
565         {
566             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
567             'info_dict': {
568                 'id': '118046',
569                 'ext': 'mp4',
570                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
571             }
572         },
573         # Kaltura embed
574         {
575             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
576             'info_dict': {
577                 'id': '1_eergr3h1',
578                 'ext': 'mp4',
579                 'upload_date': '20150226',
580                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
581                 'timestamp': int,
582                 'title': 'John Carlson Postgame 2/25/15',
583             },
584         },
585         # Eagle.Platform embed (generic URL)
586         {
587             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
588             'info_dict': {
589                 'id': '227304',
590                 'ext': 'mp4',
591                 'title': 'Навальный вышел на свободу',
592                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
593                 'thumbnail': 're:^https?://.*\.jpg$',
594                 'duration': 87,
595                 'view_count': int,
596                 'age_limit': 0,
597             },
598         },
599         # ClipYou (Eagle.Platform) embed (custom URL)
600         {
601             'url': 'http://muz-tv.ru/play/7129/',
602             'info_dict': {
603                 'id': '12820',
604                 'ext': 'mp4',
605                 'title': "'O Sole Mio",
606                 'thumbnail': 're:^https?://.*\.jpg$',
607                 'duration': 216,
608                 'view_count': int,
609             },
610         },
611         # Pladform embed
612         {
613             'url': 'http://muz-tv.ru/kinozal/view/7400/',
614             'info_dict': {
615                 'id': '100183293',
616                 'ext': 'mp4',
617                 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
618                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
619                 'thumbnail': 're:^https?://.*\.jpg$',
620                 'duration': 694,
621                 'age_limit': 0,
622             },
623         },
624         # 5min embed
625         {
626             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
627             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
628             'info_dict': {
629                 'id': '518726732',
630                 'ext': 'mp4',
631                 'title': 'Facebook Creates "On This Day" | Crunch Report',
632             },
633         },
634         # RSS feed with enclosure
635         {
636             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
637             'info_dict': {
638                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
639                 'ext': 'm4v',
640                 'upload_date': '20150228',
641                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
642             }
643         },
644         # NBC Sports vplayer embed
645         {
646             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
647             'info_dict': {
648                 'id': 'ln7x1qSThw4k',
649                 'ext': 'flv',
650                 'title': "PFT Live: New leader in the 'new-look' defense",
651                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
652             },
653         }
654     ]
655
656     def report_following_redirect(self, new_url):
657         """Report information extraction."""
658         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
659
660     def _extract_rss(self, url, video_id, doc):
661         playlist_title = doc.find('./channel/title').text
662         playlist_desc_el = doc.find('./channel/description')
663         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
664
665         entries = []
666         for it in doc.findall('./channel/item'):
667             next_url = xpath_text(it, 'link', fatal=False)
668             if not next_url:
669                 enclosure_nodes = it.findall('./enclosure')
670                 for e in enclosure_nodes:
671                     next_url = e.attrib.get('url')
672                     if next_url:
673                         break
674
675             if not next_url:
676                 continue
677
678             entries.append({
679                 '_type': 'url',
680                 'url': next_url,
681                 'title': it.find('title').text,
682             })
683
684         return {
685             '_type': 'playlist',
686             'id': url,
687             'title': playlist_title,
688             'description': playlist_desc,
689             'entries': entries,
690         }
691
692     def _extract_camtasia(self, url, video_id, webpage):
693         """ Returns None if no camtasia video can be found. """
694
695         camtasia_cfg = self._search_regex(
696             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
697             webpage, 'camtasia configuration file', default=None)
698         if camtasia_cfg is None:
699             return None
700
701         title = self._html_search_meta('DC.title', webpage, fatal=True)
702
703         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
704         camtasia_cfg = self._download_xml(
705             camtasia_url, video_id,
706             note='Downloading camtasia configuration',
707             errnote='Failed to download camtasia configuration')
708         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
709
710         entries = []
711         for n in fileset_node.getchildren():
712             url_n = n.find('./uri')
713             if url_n is None:
714                 continue
715
716             entries.append({
717                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
718                 'title': '%s - %s' % (title, n.tag),
719                 'url': compat_urlparse.urljoin(url, url_n.text),
720                 'duration': float_or_none(n.find('./duration').text),
721             })
722
723         return {
724             '_type': 'playlist',
725             'entries': entries,
726             'title': title,
727         }
728
729     def _real_extract(self, url):
730         if url.startswith('//'):
731             return {
732                 '_type': 'url',
733                 'url': self.http_scheme() + url,
734             }
735
736         parsed_url = compat_urlparse.urlparse(url)
737         if not parsed_url.scheme:
738             default_search = self._downloader.params.get('default_search')
739             if default_search is None:
740                 default_search = 'fixup_error'
741
742             if default_search in ('auto', 'auto_warning', 'fixup_error'):
743                 if '/' in url:
744                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
745                     return self.url_result('http://' + url)
746                 elif default_search != 'fixup_error':
747                     if default_search == 'auto_warning':
748                         if re.match(r'^(?:url|URL)$', url):
749                             raise ExtractorError(
750                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
751                                 expected=True)
752                         else:
753                             self._downloader.report_warning(
754                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
755                     return self.url_result('ytsearch:' + url)
756
757             if default_search in ('error', 'fixup_error'):
758                 raise ExtractorError(
759                     '%r is not a valid URL. '
760                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
761                     % (url, url), expected=True)
762             else:
763                 if ':' not in default_search:
764                     default_search += ':'
765                 return self.url_result(default_search + url)
766
767         url, smuggled_data = unsmuggle_url(url)
768         force_videoid = None
769         is_intentional = smuggled_data and smuggled_data.get('to_generic')
770         if smuggled_data and 'force_videoid' in smuggled_data:
771             force_videoid = smuggled_data['force_videoid']
772             video_id = force_videoid
773         else:
774             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
775
776         self.to_screen('%s: Requesting header' % video_id)
777
778         head_req = HEADRequest(url)
779         head_response = self._request_webpage(
780             head_req, video_id,
781             note=False, errnote='Could not send HEAD request to %s' % url,
782             fatal=False)
783
784         if head_response is not False:
785             # Check for redirect
786             new_url = head_response.geturl()
787             if url != new_url:
788                 self.report_following_redirect(new_url)
789                 if force_videoid:
790                     new_url = smuggle_url(
791                         new_url, {'force_videoid': force_videoid})
792                 return self.url_result(new_url)
793
794         full_response = None
795         if head_response is False:
796             full_response = self._request_webpage(url, video_id)
797             head_response = full_response
798
799         # Check for direct link to a video
800         content_type = head_response.headers.get('Content-Type', '')
801         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
802         if m:
803             upload_date = unified_strdate(
804                 head_response.headers.get('Last-Modified'))
805             return {
806                 'id': video_id,
807                 'title': os.path.splitext(url_basename(url))[0],
808                 'direct': True,
809                 'formats': [{
810                     'format_id': m.group('format_id'),
811                     'url': url,
812                     'vcodec': 'none' if m.group('type') == 'audio' else None
813                 }],
814                 'upload_date': upload_date,
815             }
816
817         if not self._downloader.params.get('test', False) and not is_intentional:
818             self._downloader.report_warning('Falling back on generic information extractor.')
819
820         if not full_response:
821             full_response = self._request_webpage(url, video_id)
822
823         # Maybe it's a direct link to a video?
824         # Be careful not to download the whole thing!
825         first_bytes = full_response.read(512)
826         if not is_html(first_bytes):
827             self._downloader.report_warning(
828                 'URL could be a direct video link, returning it as such.')
829             upload_date = unified_strdate(
830                 head_response.headers.get('Last-Modified'))
831             return {
832                 'id': video_id,
833                 'title': os.path.splitext(url_basename(url))[0],
834                 'direct': True,
835                 'url': url,
836                 'upload_date': upload_date,
837             }
838
839         webpage = self._webpage_read_content(
840             full_response, url, video_id, prefix=first_bytes)
841
842         self.report_extraction(video_id)
843
844         # Is it an RSS feed?
845         try:
846             doc = parse_xml(webpage)
847             if doc.tag == 'rss':
848                 return self._extract_rss(url, video_id, doc)
849         except compat_xml_parse_error:
850             pass
851
852         # Is it a Camtasia project?
853         camtasia_res = self._extract_camtasia(url, video_id, webpage)
854         if camtasia_res is not None:
855             return camtasia_res
856
857         # Sometimes embedded video player is hidden behind percent encoding
858         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
859         # Unescaping the whole page allows to handle those cases in a generic way
860         webpage = compat_urllib_parse.unquote(webpage)
861
862         # it's tempting to parse this further, but you would
863         # have to take into account all the variations like
864         #   Video Title - Site Name
865         #   Site Name | Video Title
866         #   Video Title - Tagline | Site Name
867         # and so on and so forth; it's just not practical
868         video_title = self._html_search_regex(
869             r'(?s)<title>(.*?)</title>', webpage, 'video title',
870             default='video')
871
872         # Try to detect age limit automatically
873         age_limit = self._rta_search(webpage)
874         # And then there are the jokers who advertise that they use RTA,
875         # but actually don't.
876         AGE_LIMIT_MARKERS = [
877             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
878         ]
879         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
880             age_limit = 18
881
882         # video uploader is domain name
883         video_uploader = self._search_regex(
884             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
885
886         # Helper method
887         def _playlist_from_matches(matches, getter=None, ie=None):
888             urlrs = orderedSet(
889                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
890                 for m in matches)
891             return self.playlist_result(
892                 urlrs, playlist_id=video_id, playlist_title=video_title)
893
894         # Look for BrightCove:
895         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
896         if bc_urls:
897             self.to_screen('Brightcove video detected.')
898             entries = [{
899                 '_type': 'url',
900                 'url': smuggle_url(bc_url, {'Referer': url}),
901                 'ie_key': 'Brightcove'
902             } for bc_url in bc_urls]
903
904             return {
905                 '_type': 'playlist',
906                 'title': video_title,
907                 'id': video_id,
908                 'entries': entries,
909             }
910
911         # Look for embedded rtl.nl player
912         matches = re.findall(
913             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
914             webpage)
915         if matches:
916             return _playlist_from_matches(matches, ie='RtlNl')
917
918         # Look for embedded (iframe) Vimeo player
919         mobj = re.search(
920             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
921         if mobj:
922             player_url = unescapeHTML(mobj.group('url'))
923             surl = smuggle_url(player_url, {'Referer': url})
924             return self.url_result(surl)
925         # Look for embedded (swf embed) Vimeo player
926         mobj = re.search(
927             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
928         if mobj:
929             return self.url_result(mobj.group(1))
930
931         # Look for embedded YouTube player
932         matches = re.findall(r'''(?x)
933             (?:
934                 <iframe[^>]+?src=|
935                 data-video-url=|
936                 <embed[^>]+?src=|
937                 embedSWF\(?:\s*|
938                 new\s+SWFObject\(
939             )
940             (["\'])
941                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
942                 (?:embed|v|p)/.+?)
943             \1''', webpage)
944         if matches:
945             return _playlist_from_matches(
946                 matches, lambda m: unescapeHTML(m[1]))
947
948         # Look for lazyYT YouTube embed
949         matches = re.findall(
950             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
951         if matches:
952             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
953
954         # Look for embedded Dailymotion player
955         matches = re.findall(
956             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
957         if matches:
958             return _playlist_from_matches(
959                 matches, lambda m: unescapeHTML(m[1]))
960
961         # Look for embedded Dailymotion playlist player (#3822)
962         m = re.search(
963             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
964         if m:
965             playlists = re.findall(
966                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
967             if playlists:
968                 return _playlist_from_matches(
969                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
970
971         # Look for embedded Wistia player
972         match = re.search(
973             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
974         if match:
975             embed_url = self._proto_relative_url(
976                 unescapeHTML(match.group('url')))
977             return {
978                 '_type': 'url_transparent',
979                 'url': embed_url,
980                 'ie_key': 'Wistia',
981                 'uploader': video_uploader,
982                 'title': video_title,
983                 'id': video_id,
984             }
985
986         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
987         if match:
988             return {
989                 '_type': 'url_transparent',
990                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
991                 'ie_key': 'Wistia',
992                 'uploader': video_uploader,
993                 'title': video_title,
994                 'id': match.group('id')
995             }
996
997         # Look for embedded blip.tv player
998         mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
999         if mobj:
1000             return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1001         mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1002         if mobj:
1003             return self.url_result(mobj.group(1), 'BlipTV')
1004
1005         # Look for embedded condenast player
1006         matches = re.findall(
1007             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1008             webpage)
1009         if matches:
1010             return {
1011                 '_type': 'playlist',
1012                 'entries': [{
1013                     '_type': 'url',
1014                     'ie_key': 'CondeNast',
1015                     'url': ma,
1016                 } for ma in matches],
1017                 'title': video_title,
1018                 'id': video_id,
1019             }
1020
1021         # Look for Bandcamp pages with custom domain
1022         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1023         if mobj is not None:
1024             burl = unescapeHTML(mobj.group(1))
1025             # Don't set the extractor because it can be a track url or an album
1026             return self.url_result(burl)
1027
1028         # Look for embedded Vevo player
1029         mobj = re.search(
1030             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1031         if mobj is not None:
1032             return self.url_result(mobj.group('url'))
1033
1034         # Look for embedded Viddler player
1035         mobj = re.search(
1036             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1037             webpage)
1038         if mobj is not None:
1039             return self.url_result(mobj.group('url'))
1040
1041         # Look for NYTimes player
1042         mobj = re.search(
1043             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1044             webpage)
1045         if mobj is not None:
1046             return self.url_result(mobj.group('url'))
1047
1048         # Look for Libsyn player
1049         mobj = re.search(
1050             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1051         if mobj is not None:
1052             return self.url_result(mobj.group('url'))
1053
1054         # Look for Ooyala videos
1055         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1056                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1057                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
1058         if mobj is not None:
1059             return OoyalaIE._build_url_result(mobj.group('ec'))
1060
1061         # Look for multiple Ooyala embeds on SBN network websites
1062         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1063         if mobj is not None:
1064             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1065             if embeds:
1066                 return _playlist_from_matches(
1067                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1068
1069         # Look for Aparat videos
1070         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1071         if mobj is not None:
1072             return self.url_result(mobj.group(1), 'Aparat')
1073
1074         # Look for MPORA videos
1075         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1076         if mobj is not None:
1077             return self.url_result(mobj.group(1), 'Mpora')
1078
1079         # Look for embedded NovaMov-based player
1080         mobj = re.search(
1081             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1082                     (?P<url>http://(?:(?:embed|www)\.)?
1083                         (?:novamov\.com|
1084                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1085                            videoweed\.(?:es|com)|
1086                            movshare\.(?:net|sx|ag)|
1087                            divxstage\.(?:eu|net|ch|co|at|ag))
1088                         /embed\.php.+?)\1''', webpage)
1089         if mobj is not None:
1090             return self.url_result(mobj.group('url'))
1091
1092         # Look for embedded Facebook player
1093         mobj = re.search(
1094             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1095         if mobj is not None:
1096             return self.url_result(mobj.group('url'), 'Facebook')
1097
1098         # Look for embedded VK player
1099         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1100         if mobj is not None:
1101             return self.url_result(mobj.group('url'), 'VK')
1102
1103         # Look for embedded ivi player
1104         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1105         if mobj is not None:
1106             return self.url_result(mobj.group('url'), 'Ivi')
1107
1108         # Look for embedded Huffington Post player
1109         mobj = re.search(
1110             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1111         if mobj is not None:
1112             return self.url_result(mobj.group('url'), 'HuffPost')
1113
1114         # Look for embed.ly
1115         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1116         if mobj is not None:
1117             return self.url_result(mobj.group('url'))
1118         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1119         if mobj is not None:
1120             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1121
1122         # Look for funnyordie embed
1123         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1124         if matches:
1125             return _playlist_from_matches(
1126                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1127
1128         # Look for BBC iPlayer embed
1129         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1130         if matches:
1131             return _playlist_from_matches(matches, ie='BBCCoUk')
1132
1133         # Look for embedded RUTV player
1134         rutv_url = RUTVIE._extract_url(webpage)
1135         if rutv_url:
1136             return self.url_result(rutv_url, 'RUTV')
1137
1138         # Look for embedded TED player
1139         mobj = re.search(
1140             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1141         if mobj is not None:
1142             return self.url_result(mobj.group('url'), 'TED')
1143
1144         # Look for embedded Ustream videos
1145         mobj = re.search(
1146             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1147         if mobj is not None:
1148             return self.url_result(mobj.group('url'), 'Ustream')
1149
1150         # Look for embedded arte.tv player
1151         mobj = re.search(
1152             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1153             webpage)
1154         if mobj is not None:
1155             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1156
1157         # Look for embedded smotri.com player
1158         smotri_url = SmotriIE._extract_url(webpage)
1159         if smotri_url:
1160             return self.url_result(smotri_url, 'Smotri')
1161
1162         # Look for embeded soundcloud player
1163         mobj = re.search(
1164             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1165             webpage)
1166         if mobj is not None:
1167             url = unescapeHTML(mobj.group('url'))
1168             return self.url_result(url)
1169
1170         # Look for embedded vulture.com player
1171         mobj = re.search(
1172             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1173             webpage)
1174         if mobj is not None:
1175             url = unescapeHTML(mobj.group('url'))
1176             return self.url_result(url, ie='Vulture')
1177
1178         # Look for embedded mtvservices player
1179         mobj = re.search(
1180             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1181             webpage)
1182         if mobj is not None:
1183             url = unescapeHTML(mobj.group('url'))
1184             return self.url_result(url, ie='MTVServicesEmbedded')
1185
1186         # Look for embedded yahoo player
1187         mobj = re.search(
1188             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1189             webpage)
1190         if mobj is not None:
1191             return self.url_result(mobj.group('url'), 'Yahoo')
1192
1193         # Look for embedded sbs.com.au player
1194         mobj = re.search(
1195             r'''(?x)
1196             (?:
1197                 <meta\s+property="og:video"\s+content=|
1198                 <iframe[^>]+?src=
1199             )
1200             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1201             webpage)
1202         if mobj is not None:
1203             return self.url_result(mobj.group('url'), 'SBS')
1204
1205         # Look for embedded Cinchcast player
1206         mobj = re.search(
1207             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1208             webpage)
1209         if mobj is not None:
1210             return self.url_result(mobj.group('url'), 'Cinchcast')
1211
1212         mobj = re.search(
1213             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1214             webpage)
1215         if mobj is not None:
1216             return self.url_result(mobj.group('url'), 'MLB')
1217
1218         mobj = re.search(
1219             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1220             webpage)
1221         if mobj is not None:
1222             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1223
1224         mobj = re.search(
1225             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1226             webpage)
1227         if mobj is not None:
1228             return self.url_result(mobj.group('url'), 'Livestream')
1229
1230         # Look for Zapiks embed
1231         mobj = re.search(
1232             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1233         if mobj is not None:
1234             return self.url_result(mobj.group('url'), 'Zapiks')
1235
1236         # Look for Kaltura embeds
1237         mobj = re.search(
1238             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1239         if mobj is not None:
1240             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1241
1242         # Look for Eagle.Platform embeds
1243         mobj = re.search(
1244             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1245         if mobj is not None:
1246             return self.url_result(mobj.group('url'), 'EaglePlatform')
1247
1248         # Look for ClipYou (uses Eagle.Platform) embeds
1249         mobj = re.search(
1250             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1251         if mobj is not None:
1252             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1253
1254         # Look for Pladform embeds
1255         mobj = re.search(
1256             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1257         if mobj is not None:
1258             return self.url_result(mobj.group('url'), 'Pladform')
1259
1260         # Look for 5min embeds
1261         mobj = re.search(
1262             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1263         if mobj is not None:
1264             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1265
1266         # Look for NBC Sports VPlayer embeds
1267         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1268         if nbc_sports_url:
1269             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1270
1271         def check_video(vurl):
1272             if YoutubeIE.suitable(vurl):
1273                 return True
1274             vpath = compat_urlparse.urlparse(vurl).path
1275             vext = determine_ext(vpath)
1276             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1277
1278         def filter_video(urls):
1279             return list(filter(check_video, urls))
1280
1281         # Start with something easy: JW Player in SWFObject
1282         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1283         if not found:
1284             # Look for gorilla-vid style embedding
1285             found = filter_video(re.findall(r'''(?sx)
1286                 (?:
1287                     jw_plugins|
1288                     JWPlayerOptions|
1289                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1290                 )
1291                 .*?
1292                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1293         if not found:
1294             # Broaden the search a little bit
1295             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1296         if not found:
1297             # Broaden the findall a little bit: JWPlayer JS loader
1298             found = filter_video(re.findall(
1299                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1300         if not found:
1301             # Flow player
1302             found = filter_video(re.findall(r'''(?xs)
1303                 flowplayer\("[^"]+",\s*
1304                     \{[^}]+?\}\s*,
1305                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1306                         ["']?url["']?\s*:\s*["']([^"']+)["']
1307             ''', webpage))
1308         if not found:
1309             # Cinerama player
1310             found = re.findall(
1311                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1312         if not found:
1313             # Try to find twitter cards info
1314             found = filter_video(re.findall(
1315                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1316         if not found:
1317             # We look for Open Graph info:
1318             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1319             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1320             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1321             if m_video_type is not None:
1322                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1323         if not found:
1324             # HTML5 video
1325             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1326         if not found:
1327             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1328             found = re.search(
1329                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1330                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1331                 webpage)
1332             if not found:
1333                 # Look also in Refresh HTTP header
1334                 refresh_header = head_response.headers.get('Refresh')
1335                 if refresh_header:
1336                     found = re.search(REDIRECT_REGEX, refresh_header)
1337             if found:
1338                 new_url = found.group(1)
1339                 self.report_following_redirect(new_url)
1340                 return {
1341                     '_type': 'url',
1342                     'url': new_url,
1343                 }
1344         if not found:
1345             raise UnsupportedError(url)
1346
1347         entries = []
1348         for video_url in found:
1349             video_url = compat_urlparse.urljoin(url, video_url)
1350             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1351
1352             # Sometimes, jwplayer extraction will result in a YouTube URL
1353             if YoutubeIE.suitable(video_url):
1354                 entries.append(self.url_result(video_url, 'Youtube'))
1355                 continue
1356
1357             # here's a fun little line of code for you:
1358             video_id = os.path.splitext(video_id)[0]
1359
1360             entries.append({
1361                 'id': video_id,
1362                 'url': video_url,
1363                 'uploader': video_uploader,
1364                 'title': video_title,
1365                 'age_limit': age_limit,
1366             })
1367
1368         if len(entries) == 1:
1369             return entries[0]
1370         else:
1371             for num, e in enumerate(entries, start=1):
1372                 # 'url' results don't have a title
1373                 if e.get('title') is not None:
1374                     e['title'] = '%s (%d)' % (e['title'], num)
1375             return {
1376                 '_type': 'playlist',
1377                 'entries': entries,
1378             }