[generic] Add tests for Crooks and Liars embeds
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urlparse,
13     compat_xml_parse_error,
14 )
15 from ..utils import (
16     determine_ext,
17     ExtractorError,
18     float_or_none,
19     HEADRequest,
20     is_html,
21     orderedSet,
22     parse_xml,
23     smuggle_url,
24     unescapeHTML,
25     unified_strdate,
26     unsmuggle_url,
27     UnsupportedError,
28     url_basename,
29     xpath_text,
30 )
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
37 from .udn import UDNEmbedIE
38
39
40 class GenericIE(InfoExtractor):
41     IE_DESC = 'Generic downloader that works on some sites'
42     _VALID_URL = r'.*'
43     IE_NAME = 'generic'
44     _TESTS = [
45         {
46             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
47             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
48             'info_dict': {
49                 'id': '13601338388002',
50                 'ext': 'mp4',
51                 'uploader': 'www.hodiho.fr',
52                 'title': 'R\u00e9gis plante sa Jeep',
53             }
54         },
55         # bandcamp page with custom domain
56         {
57             'add_ie': ['Bandcamp'],
58             'url': 'http://bronyrock.com/track/the-pony-mash',
59             'info_dict': {
60                 'id': '3235767654',
61                 'ext': 'mp3',
62                 'title': 'The Pony Mash',
63                 'uploader': 'M_Pallante',
64             },
65             'skip': 'There is a limit of 200 free downloads / month for the test song',
66         },
67         # embedded brightcove video
68         # it also tests brightcove videos that need to set the 'Referer' in the
69         # http requests
70         {
71             'add_ie': ['Brightcove'],
72             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
73             'info_dict': {
74                 'id': '2765128793001',
75                 'ext': 'mp4',
76                 'title': 'Le cours de bourse : l’analyse technique',
77                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
78                 'uploader': 'BFM BUSINESS',
79             },
80             'params': {
81                 'skip_download': True,
82             },
83         },
84         {
85             # https://github.com/rg3/youtube-dl/issues/2253
86             'url': 'http://bcove.me/i6nfkrc3',
87             'md5': '0ba9446db037002366bab3b3eb30c88c',
88             'info_dict': {
89                 'id': '3101154703001',
90                 'ext': 'mp4',
91                 'title': 'Still no power',
92                 'uploader': 'thestar.com',
93                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
94             },
95             'add_ie': ['Brightcove'],
96         },
97         {
98             'url': 'http://www.championat.com/video/football/v/87/87499.html',
99             'md5': 'fb973ecf6e4a78a67453647444222983',
100             'info_dict': {
101                 'id': '3414141473001',
102                 'ext': 'mp4',
103                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
104                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
105                 'uploader': 'Championat',
106             },
107         },
108         {
109             # https://github.com/rg3/youtube-dl/issues/3541
110             'add_ie': ['Brightcove'],
111             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
112             'info_dict': {
113                 'id': '3866516442001',
114                 'ext': 'mp4',
115                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
116                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
117                 'uploader': 'SBS Broadcasting',
118             },
119             'skip': 'Restricted to Netherlands',
120             'params': {
121                 'skip_download': True,  # m3u8 download
122             },
123         },
124         # Direct link to a video
125         {
126             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
127             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
128             'info_dict': {
129                 'id': 'trailer',
130                 'ext': 'mp4',
131                 'title': 'trailer',
132                 'upload_date': '20100513',
133             }
134         },
135         # ooyala video
136         {
137             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
138             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
139             'info_dict': {
140                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
141                 'ext': 'mp4',
142                 'title': '2cc213299525360.mov',  # that's what we get
143             },
144             'add_ie': ['Ooyala'],
145         },
146         # multiple ooyala embeds on SBN network websites
147         {
148             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149             'info_dict': {
150                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
151                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
152             },
153             'playlist_mincount': 3,
154             'params': {
155                 'skip_download': True,
156             },
157             'add_ie': ['Ooyala'],
158         },
159         # google redirect
160         {
161             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
162             'info_dict': {
163                 'id': 'cmQHVoWB5FY',
164                 'ext': 'mp4',
165                 'upload_date': '20130224',
166                 'uploader_id': 'TheVerge',
167                 'description': 're:^Chris Ziegler takes a look at the\.*',
168                 'uploader': 'The Verge',
169                 'title': 'First Firefox OS phones side-by-side',
170             },
171             'params': {
172                 'skip_download': False,
173             }
174         },
175         # embed.ly video
176         {
177             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
178             'info_dict': {
179                 'id': '9ODmcdjQcHQ',
180                 'ext': 'mp4',
181                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
182                 'upload_date': '20140225',
183                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
184                 'uploader': 'Tested',
185                 'uploader_id': 'testedcom',
186             },
187             # No need to test YoutubeIE here
188             'params': {
189                 'skip_download': True,
190             },
191         },
192         # funnyordie embed
193         {
194             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
195             'info_dict': {
196                 'id': '18e820ec3f',
197                 'ext': 'mp4',
198                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
199                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
200             },
201         },
202         # BBC iPlayer embeds
203         {
204             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
205             'info_dict': {
206                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
207             },
208             'playlist_mincount': 18,
209         },
210         # RUTV embed
211         {
212             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
213             'info_dict': {
214                 'id': '776940',
215                 'ext': 'mp4',
216                 'title': 'Охотское море стало целиком российским',
217                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
218             },
219             'params': {
220                 # m3u8 download
221                 'skip_download': True,
222             },
223         },
224         # Embedded TED video
225         {
226             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
227             'md5': '65fdff94098e4a607385a60c5177c638',
228             'info_dict': {
229                 'id': '1969',
230                 'ext': 'mp4',
231                 'title': 'Hidden miracles of the natural world',
232                 'uploader': 'Louie Schwartzberg',
233                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
234             }
235         },
236         # Embedded Ustream video
237         {
238             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
239             'md5': '27b99cdb639c9b12a79bca876a073417',
240             'info_dict': {
241                 'id': '45734260',
242                 'ext': 'flv',
243                 'uploader': 'AU SPA:  The NSA and Privacy',
244                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
245             }
246         },
247         # nowvideo embed hidden behind percent encoding
248         {
249             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
250             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
251             'info_dict': {
252                 'id': '06e53103ca9aa',
253                 'ext': 'flv',
254                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
255                 'description': 'No description',
256             },
257         },
258         # arte embed
259         {
260             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
261             'md5': '7653032cbb25bf6c80d80f217055fa43',
262             'info_dict': {
263                 'id': '048195-004_PLUS7-F',
264                 'ext': 'flv',
265                 'title': 'X:enius',
266                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
267                 'upload_date': '20140320',
268             },
269             'params': {
270                 'skip_download': 'Requires rtmpdump'
271             }
272         },
273         # Condé Nast embed
274         {
275             'url': 'http://www.wired.com/2014/04/honda-asimo/',
276             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
277             'info_dict': {
278                 'id': '53501be369702d3275860000',
279                 'ext': 'mp4',
280                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
281             }
282         },
283         # Dailymotion embed
284         {
285             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
286             'md5': '441aeeb82eb72c422c7f14ec533999cd',
287             'info_dict': {
288                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
289                 'ext': 'mp4',
290                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
291                 'uploader': 'Spi0n',
292             },
293             'add_ie': ['Dailymotion'],
294         },
295         # YouTube embed
296         {
297             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
298             'info_dict': {
299                 'id': 'FXRb4ykk4S0',
300                 'ext': 'mp4',
301                 'title': 'The NBL Auction 2014',
302                 'uploader': 'BADMINTON England',
303                 'uploader_id': 'BADMINTONEvents',
304                 'upload_date': '20140603',
305                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
306             },
307             'add_ie': ['Youtube'],
308             'params': {
309                 'skip_download': True,
310             }
311         },
312         # MTVServices embed
313         {
314             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
315             'md5': '35727f82f58c76d996fc188f9755b0d5',
316             'info_dict': {
317                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
318                 'ext': 'mp4',
319                 'title': 'Review',
320                 'description': 'Mario\'s life in the fast lane has never looked so good.',
321             },
322         },
323         # YouTube embed via <data-embed-url="">
324         {
325             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
326             'info_dict': {
327                 'id': '4vAffPZIT44',
328                 'ext': 'mp4',
329                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
330                 'uploader': 'Gameloft',
331                 'uploader_id': 'gameloft',
332                 'upload_date': '20140828',
333                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
334             },
335             'params': {
336                 'skip_download': True,
337             }
338         },
339         # Camtasia studio
340         {
341             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
342             'playlist': [{
343                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
344                 'info_dict': {
345                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
346                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
347                     'ext': 'flv',
348                     'duration': 2235.90,
349                 }
350             }, {
351                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
352                 'info_dict': {
353                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
354                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
355                     'ext': 'flv',
356                     'duration': 2235.93,
357                 }
358             }],
359             'info_dict': {
360                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
361             }
362         },
363         # Flowplayer
364         {
365             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
366             'md5': '9d65602bf31c6e20014319c7d07fba27',
367             'info_dict': {
368                 'id': '5123ea6d5e5a7',
369                 'ext': 'mp4',
370                 'age_limit': 18,
371                 'uploader': 'www.handjobhub.com',
372                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
373             }
374         },
375         # RSS feed
376         {
377             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378             'info_dict': {
379                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
380                 'title': 'Zero Punctuation',
381                 'description': 're:.*groundbreaking video review series.*'
382             },
383             'playlist_mincount': 11,
384         },
385         # Multiple brightcove videos
386         # https://github.com/rg3/youtube-dl/issues/2283
387         {
388             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
389             'info_dict': {
390                 'id': 'always-never',
391                 'title': 'Always / Never - The New Yorker',
392             },
393             'playlist_count': 3,
394             'params': {
395                 'extract_flat': False,
396                 'skip_download': True,
397             }
398         },
399         # MLB embed
400         {
401             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
402             'md5': '96f09a37e44da40dd083e12d9a683327',
403             'info_dict': {
404                 'id': '33322633',
405                 'ext': 'mp4',
406                 'title': 'Ump changes call to ball',
407                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
408                 'duration': 48,
409                 'timestamp': 1401537900,
410                 'upload_date': '20140531',
411                 'thumbnail': 're:^https?://.*\.jpg$',
412             },
413         },
414         # Wistia embed
415         {
416             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
417             'md5': '8788b683c777a5cf25621eaf286d0c23',
418             'info_dict': {
419                 'id': '1cfaf6b7ea',
420                 'ext': 'mov',
421                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
422                 'duration': 643.0,
423                 'filesize': 182808282,
424                 'uploader': 'education-portal.com',
425             },
426         },
427         {
428             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
429             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
430             'info_dict': {
431                 'id': 'uxjb0lwrcz',
432                 'ext': 'mp4',
433                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
434                 'duration': 1715.0,
435                 'uploader': 'thoughtworks.wistia.com',
436             },
437         },
438         # Direct download with broken HEAD
439         {
440             'url': 'http://ai-radio.org:8000/radio.opus',
441             'info_dict': {
442                 'id': 'radio',
443                 'ext': 'opus',
444                 'title': 'radio',
445             },
446             'params': {
447                 'skip_download': True,  # infinite live stream
448             },
449             'expected_warnings': [
450                 r'501.*Not Implemented'
451             ],
452         },
453         # Soundcloud embed
454         {
455             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
456             'info_dict': {
457                 'id': '174391317',
458                 'ext': 'mp3',
459                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
460                 'uploader': 'Sophos Security',
461                 'title': 'Chet Chat 171 - Oct 29, 2014',
462                 'upload_date': '20141029',
463             }
464         },
465         # Livestream embed
466         {
467             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
468             'info_dict': {
469                 'id': '67864563',
470                 'ext': 'flv',
471                 'upload_date': '20141112',
472                 'title': 'Rosetta #CometLanding webcast HL 10',
473             }
474         },
475         # LazyYT
476         {
477             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
478             'info_dict': {
479                 'id': '1986',
480                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
481             },
482             'playlist_mincount': 2,
483         },
484         # Direct link with incorrect MIME type
485         {
486             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
487             'md5': '4ccbebe5f36706d85221f204d7eb5913',
488             'info_dict': {
489                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
490                 'id': '5_Lennart_Poettering_-_Systemd',
491                 'ext': 'webm',
492                 'title': '5_Lennart_Poettering_-_Systemd',
493                 'upload_date': '20141120',
494             },
495             'expected_warnings': [
496                 'URL could be a direct video link, returning it as such.'
497             ]
498         },
499         # Cinchcast embed
500         {
501             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
502             'info_dict': {
503                 'id': '7141703',
504                 'ext': 'mp3',
505                 'upload_date': '20141126',
506                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
507             }
508         },
509         # Cinerama player
510         {
511             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
512             'info_dict': {
513                 'id': '730m_DandD_1901_512k',
514                 'ext': 'mp4',
515                 'uploader': 'www.abc.net.au',
516                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
517             }
518         },
519         # embedded viddler video
520         {
521             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
522             'info_dict': {
523                 'id': '4d03aad9',
524                 'ext': 'mp4',
525                 'uploader': 'deadspin',
526                 'title': 'WALL-TO-GORTAT',
527                 'timestamp': 1422285291,
528                 'upload_date': '20150126',
529             },
530             'add_ie': ['Viddler'],
531         },
532         # Libsyn embed
533         {
534             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
535             'info_dict': {
536                 'id': '3377616',
537                 'ext': 'mp3',
538                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
539                 'description': 'md5:601cb790edd05908957dae8aaa866465',
540                 'upload_date': '20150220',
541             },
542         },
543         # jwplayer YouTube
544         {
545             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
546             'info_dict': {
547                 'id': 'Mrj4DVp2zeA',
548                 'ext': 'mp4',
549                 'upload_date': '20150212',
550                 'uploader': 'The National Archives UK',
551                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
552                 'uploader_id': 'NationalArchives08',
553                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
554             },
555         },
556         # rtl.nl embed
557         {
558             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
559             'playlist_mincount': 5,
560             'info_dict': {
561                 'id': 'aanslagen-kopenhagen',
562                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
563             }
564         },
565         # Zapiks embed
566         {
567             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
568             'info_dict': {
569                 'id': '118046',
570                 'ext': 'mp4',
571                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
572             }
573         },
574         # Kaltura embed
575         {
576             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
577             'info_dict': {
578                 'id': '1_eergr3h1',
579                 'ext': 'mp4',
580                 'upload_date': '20150226',
581                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
582                 'timestamp': int,
583                 'title': 'John Carlson Postgame 2/25/15',
584             },
585         },
586         # Eagle.Platform embed (generic URL)
587         {
588             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
589             'info_dict': {
590                 'id': '227304',
591                 'ext': 'mp4',
592                 'title': 'Навальный вышел на свободу',
593                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
594                 'thumbnail': 're:^https?://.*\.jpg$',
595                 'duration': 87,
596                 'view_count': int,
597                 'age_limit': 0,
598             },
599         },
600         # ClipYou (Eagle.Platform) embed (custom URL)
601         {
602             'url': 'http://muz-tv.ru/play/7129/',
603             'info_dict': {
604                 'id': '12820',
605                 'ext': 'mp4',
606                 'title': "'O Sole Mio",
607                 'thumbnail': 're:^https?://.*\.jpg$',
608                 'duration': 216,
609                 'view_count': int,
610             },
611         },
612         # Pladform embed
613         {
614             'url': 'http://muz-tv.ru/kinozal/view/7400/',
615             'info_dict': {
616                 'id': '100183293',
617                 'ext': 'mp4',
618                 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
619                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
620                 'thumbnail': 're:^https?://.*\.jpg$',
621                 'duration': 694,
622                 'age_limit': 0,
623             },
624         },
625         # 5min embed
626         {
627             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
628             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
629             'info_dict': {
630                 'id': '518726732',
631                 'ext': 'mp4',
632                 'title': 'Facebook Creates "On This Day" | Crunch Report',
633             },
634         },
635         # RSS feed with enclosure
636         {
637             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
638             'info_dict': {
639                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
640                 'ext': 'm4v',
641                 'upload_date': '20150228',
642                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
643             }
644         },
645         # Crooks and Liars embed
646         {
647             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
648             'info_dict': {
649                 'id': '8RUoRhRi',
650                 'ext': 'mp4',
651                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
652                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
653                 'timestamp': 1428207000,
654                 'upload_date': '20150405',
655                 'uploader': 'Heather',
656             },
657         },
658         # Crooks and Liars external embed
659         {
660             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
661             'info_dict': {
662                 'id': 'MTE3MjUtMzQ2MzA',
663                 'ext': 'mp4',
664                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
665                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
666                 'timestamp': 1265032391,
667                 'upload_date': '20100201',
668                 'uploader': 'Heather',
669             },
670         },
671         # NBC Sports vplayer embed
672         {
673             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
674             'info_dict': {
675                 'id': 'ln7x1qSThw4k',
676                 'ext': 'flv',
677                 'title': "PFT Live: New leader in the 'new-look' defense",
678                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
679             },
680         },
681         # UDN embed
682         {
683             'url': 'http://www.udn.com/news/story/7314/822787',
684             'md5': 'de06b4c90b042c128395a88f0384817e',
685             'info_dict': {
686                 'id': '300040',
687                 'ext': 'mp4',
688                 'title': '生物老師男變女 全校挺"做自己"',
689                 'thumbnail': 're:^https?://.*\.jpg$',
690             }
691         }
692     ]
693
694     def report_following_redirect(self, new_url):
695         """Report information extraction."""
696         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
697
698     def _extract_rss(self, url, video_id, doc):
699         playlist_title = doc.find('./channel/title').text
700         playlist_desc_el = doc.find('./channel/description')
701         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
702
703         entries = []
704         for it in doc.findall('./channel/item'):
705             next_url = xpath_text(it, 'link', fatal=False)
706             if not next_url:
707                 enclosure_nodes = it.findall('./enclosure')
708                 for e in enclosure_nodes:
709                     next_url = e.attrib.get('url')
710                     if next_url:
711                         break
712
713             if not next_url:
714                 continue
715
716             entries.append({
717                 '_type': 'url',
718                 'url': next_url,
719                 'title': it.find('title').text,
720             })
721
722         return {
723             '_type': 'playlist',
724             'id': url,
725             'title': playlist_title,
726             'description': playlist_desc,
727             'entries': entries,
728         }
729
730     def _extract_camtasia(self, url, video_id, webpage):
731         """ Returns None if no camtasia video can be found. """
732
733         camtasia_cfg = self._search_regex(
734             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
735             webpage, 'camtasia configuration file', default=None)
736         if camtasia_cfg is None:
737             return None
738
739         title = self._html_search_meta('DC.title', webpage, fatal=True)
740
741         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
742         camtasia_cfg = self._download_xml(
743             camtasia_url, video_id,
744             note='Downloading camtasia configuration',
745             errnote='Failed to download camtasia configuration')
746         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
747
748         entries = []
749         for n in fileset_node.getchildren():
750             url_n = n.find('./uri')
751             if url_n is None:
752                 continue
753
754             entries.append({
755                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
756                 'title': '%s - %s' % (title, n.tag),
757                 'url': compat_urlparse.urljoin(url, url_n.text),
758                 'duration': float_or_none(n.find('./duration').text),
759             })
760
761         return {
762             '_type': 'playlist',
763             'entries': entries,
764             'title': title,
765         }
766
767     def _real_extract(self, url):
768         if url.startswith('//'):
769             return {
770                 '_type': 'url',
771                 'url': self.http_scheme() + url,
772             }
773
774         parsed_url = compat_urlparse.urlparse(url)
775         if not parsed_url.scheme:
776             default_search = self._downloader.params.get('default_search')
777             if default_search is None:
778                 default_search = 'fixup_error'
779
780             if default_search in ('auto', 'auto_warning', 'fixup_error'):
781                 if '/' in url:
782                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
783                     return self.url_result('http://' + url)
784                 elif default_search != 'fixup_error':
785                     if default_search == 'auto_warning':
786                         if re.match(r'^(?:url|URL)$', url):
787                             raise ExtractorError(
788                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
789                                 expected=True)
790                         else:
791                             self._downloader.report_warning(
792                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
793                     return self.url_result('ytsearch:' + url)
794
795             if default_search in ('error', 'fixup_error'):
796                 raise ExtractorError(
797                     '%r is not a valid URL. '
798                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
799                     % (url, url), expected=True)
800             else:
801                 if ':' not in default_search:
802                     default_search += ':'
803                 return self.url_result(default_search + url)
804
805         url, smuggled_data = unsmuggle_url(url)
806         force_videoid = None
807         is_intentional = smuggled_data and smuggled_data.get('to_generic')
808         if smuggled_data and 'force_videoid' in smuggled_data:
809             force_videoid = smuggled_data['force_videoid']
810             video_id = force_videoid
811         else:
812             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
813
814         self.to_screen('%s: Requesting header' % video_id)
815
816         head_req = HEADRequest(url)
817         head_response = self._request_webpage(
818             head_req, video_id,
819             note=False, errnote='Could not send HEAD request to %s' % url,
820             fatal=False)
821
822         if head_response is not False:
823             # Check for redirect
824             new_url = head_response.geturl()
825             if url != new_url:
826                 self.report_following_redirect(new_url)
827                 if force_videoid:
828                     new_url = smuggle_url(
829                         new_url, {'force_videoid': force_videoid})
830                 return self.url_result(new_url)
831
832         full_response = None
833         if head_response is False:
834             full_response = self._request_webpage(url, video_id)
835             head_response = full_response
836
837         # Check for direct link to a video
838         content_type = head_response.headers.get('Content-Type', '')
839         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
840         if m:
841             upload_date = unified_strdate(
842                 head_response.headers.get('Last-Modified'))
843             return {
844                 'id': video_id,
845                 'title': os.path.splitext(url_basename(url))[0],
846                 'direct': True,
847                 'formats': [{
848                     'format_id': m.group('format_id'),
849                     'url': url,
850                     'vcodec': 'none' if m.group('type') == 'audio' else None
851                 }],
852                 'upload_date': upload_date,
853             }
854
855         if not self._downloader.params.get('test', False) and not is_intentional:
856             self._downloader.report_warning('Falling back on generic information extractor.')
857
858         if not full_response:
859             full_response = self._request_webpage(url, video_id)
860
861         # Maybe it's a direct link to a video?
862         # Be careful not to download the whole thing!
863         first_bytes = full_response.read(512)
864         if not is_html(first_bytes):
865             self._downloader.report_warning(
866                 'URL could be a direct video link, returning it as such.')
867             upload_date = unified_strdate(
868                 head_response.headers.get('Last-Modified'))
869             return {
870                 'id': video_id,
871                 'title': os.path.splitext(url_basename(url))[0],
872                 'direct': True,
873                 'url': url,
874                 'upload_date': upload_date,
875             }
876
877         webpage = self._webpage_read_content(
878             full_response, url, video_id, prefix=first_bytes)
879
880         self.report_extraction(video_id)
881
882         # Is it an RSS feed?
883         try:
884             doc = parse_xml(webpage)
885             if doc.tag == 'rss':
886                 return self._extract_rss(url, video_id, doc)
887         except compat_xml_parse_error:
888             pass
889
890         # Is it a Camtasia project?
891         camtasia_res = self._extract_camtasia(url, video_id, webpage)
892         if camtasia_res is not None:
893             return camtasia_res
894
895         # Sometimes embedded video player is hidden behind percent encoding
896         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
897         # Unescaping the whole page allows to handle those cases in a generic way
898         webpage = compat_urllib_parse.unquote(webpage)
899
900         # it's tempting to parse this further, but you would
901         # have to take into account all the variations like
902         #   Video Title - Site Name
903         #   Site Name | Video Title
904         #   Video Title - Tagline | Site Name
905         # and so on and so forth; it's just not practical
906         video_title = self._html_search_regex(
907             r'(?s)<title>(.*?)</title>', webpage, 'video title',
908             default='video')
909
910         # Try to detect age limit automatically
911         age_limit = self._rta_search(webpage)
912         # And then there are the jokers who advertise that they use RTA,
913         # but actually don't.
914         AGE_LIMIT_MARKERS = [
915             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
916         ]
917         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
918             age_limit = 18
919
920         # video uploader is domain name
921         video_uploader = self._search_regex(
922             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
923
924         # Helper method
925         def _playlist_from_matches(matches, getter=None, ie=None):
926             urlrs = orderedSet(
927                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
928                 for m in matches)
929             return self.playlist_result(
930                 urlrs, playlist_id=video_id, playlist_title=video_title)
931
932         # Look for BrightCove:
933         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
934         if bc_urls:
935             self.to_screen('Brightcove video detected.')
936             entries = [{
937                 '_type': 'url',
938                 'url': smuggle_url(bc_url, {'Referer': url}),
939                 'ie_key': 'Brightcove'
940             } for bc_url in bc_urls]
941
942             return {
943                 '_type': 'playlist',
944                 'title': video_title,
945                 'id': video_id,
946                 'entries': entries,
947             }
948
949         # Look for embedded rtl.nl player
950         matches = re.findall(
951             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
952             webpage)
953         if matches:
954             return _playlist_from_matches(matches, ie='RtlNl')
955
956         # Look for embedded (iframe) Vimeo player
957         mobj = re.search(
958             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
959         if mobj:
960             player_url = unescapeHTML(mobj.group('url'))
961             surl = smuggle_url(player_url, {'Referer': url})
962             return self.url_result(surl)
963         # Look for embedded (swf embed) Vimeo player
964         mobj = re.search(
965             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
966         if mobj:
967             return self.url_result(mobj.group(1))
968
969         # Look for embedded YouTube player
970         matches = re.findall(r'''(?x)
971             (?:
972                 <iframe[^>]+?src=|
973                 data-video-url=|
974                 <embed[^>]+?src=|
975                 embedSWF\(?:\s*|
976                 new\s+SWFObject\(
977             )
978             (["\'])
979                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
980                 (?:embed|v|p)/.+?)
981             \1''', webpage)
982         if matches:
983             return _playlist_from_matches(
984                 matches, lambda m: unescapeHTML(m[1]))
985
986         # Look for lazyYT YouTube embed
987         matches = re.findall(
988             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
989         if matches:
990             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
991
992         # Look for embedded Dailymotion player
993         matches = re.findall(
994             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
995         if matches:
996             return _playlist_from_matches(
997                 matches, lambda m: unescapeHTML(m[1]))
998
999         # Look for embedded Dailymotion playlist player (#3822)
1000         m = re.search(
1001             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1002         if m:
1003             playlists = re.findall(
1004                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1005             if playlists:
1006                 return _playlist_from_matches(
1007                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1008
1009         # Look for embedded Wistia player
1010         match = re.search(
1011             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1012         if match:
1013             embed_url = self._proto_relative_url(
1014                 unescapeHTML(match.group('url')))
1015             return {
1016                 '_type': 'url_transparent',
1017                 'url': embed_url,
1018                 'ie_key': 'Wistia',
1019                 'uploader': video_uploader,
1020                 'title': video_title,
1021                 'id': video_id,
1022             }
1023
1024         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1025         if match:
1026             return {
1027                 '_type': 'url_transparent',
1028                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1029                 'ie_key': 'Wistia',
1030                 'uploader': video_uploader,
1031                 'title': video_title,
1032                 'id': match.group('id')
1033             }
1034
1035         # Look for embedded blip.tv player
1036         mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
1037         if mobj:
1038             return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1039         mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1040         if mobj:
1041             return self.url_result(mobj.group(1), 'BlipTV')
1042
1043         # Look for embedded condenast player
1044         matches = re.findall(
1045             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1046             webpage)
1047         if matches:
1048             return {
1049                 '_type': 'playlist',
1050                 'entries': [{
1051                     '_type': 'url',
1052                     'ie_key': 'CondeNast',
1053                     'url': ma,
1054                 } for ma in matches],
1055                 'title': video_title,
1056                 'id': video_id,
1057             }
1058
1059         # Look for Bandcamp pages with custom domain
1060         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1061         if mobj is not None:
1062             burl = unescapeHTML(mobj.group(1))
1063             # Don't set the extractor because it can be a track url or an album
1064             return self.url_result(burl)
1065
1066         # Look for embedded Vevo player
1067         mobj = re.search(
1068             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1069         if mobj is not None:
1070             return self.url_result(mobj.group('url'))
1071
1072         # Look for embedded Viddler player
1073         mobj = re.search(
1074             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1075             webpage)
1076         if mobj is not None:
1077             return self.url_result(mobj.group('url'))
1078
1079         # Look for NYTimes player
1080         mobj = re.search(
1081             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1082             webpage)
1083         if mobj is not None:
1084             return self.url_result(mobj.group('url'))
1085
1086         # Look for Libsyn player
1087         mobj = re.search(
1088             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1089         if mobj is not None:
1090             return self.url_result(mobj.group('url'))
1091
1092         # Look for Ooyala videos
1093         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1094                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1095                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
1096         if mobj is not None:
1097             return OoyalaIE._build_url_result(mobj.group('ec'))
1098
1099         # Look for multiple Ooyala embeds on SBN network websites
1100         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1101         if mobj is not None:
1102             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1103             if embeds:
1104                 return _playlist_from_matches(
1105                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1106
1107         # Look for Aparat videos
1108         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1109         if mobj is not None:
1110             return self.url_result(mobj.group(1), 'Aparat')
1111
1112         # Look for MPORA videos
1113         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1114         if mobj is not None:
1115             return self.url_result(mobj.group(1), 'Mpora')
1116
1117         # Look for embedded NovaMov-based player
1118         mobj = re.search(
1119             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1120                     (?P<url>http://(?:(?:embed|www)\.)?
1121                         (?:novamov\.com|
1122                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1123                            videoweed\.(?:es|com)|
1124                            movshare\.(?:net|sx|ag)|
1125                            divxstage\.(?:eu|net|ch|co|at|ag))
1126                         /embed\.php.+?)\1''', webpage)
1127         if mobj is not None:
1128             return self.url_result(mobj.group('url'))
1129
1130         # Look for embedded Facebook player
1131         mobj = re.search(
1132             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1133         if mobj is not None:
1134             return self.url_result(mobj.group('url'), 'Facebook')
1135
1136         # Look for embedded VK player
1137         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1138         if mobj is not None:
1139             return self.url_result(mobj.group('url'), 'VK')
1140
1141         # Look for embedded ivi player
1142         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1143         if mobj is not None:
1144             return self.url_result(mobj.group('url'), 'Ivi')
1145
1146         # Look for embedded Huffington Post player
1147         mobj = re.search(
1148             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1149         if mobj is not None:
1150             return self.url_result(mobj.group('url'), 'HuffPost')
1151
1152         # Look for embed.ly
1153         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1154         if mobj is not None:
1155             return self.url_result(mobj.group('url'))
1156         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1157         if mobj is not None:
1158             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1159
1160         # Look for funnyordie embed
1161         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1162         if matches:
1163             return _playlist_from_matches(
1164                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1165
1166         # Look for BBC iPlayer embed
1167         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1168         if matches:
1169             return _playlist_from_matches(matches, ie='BBCCoUk')
1170
1171         # Look for embedded RUTV player
1172         rutv_url = RUTVIE._extract_url(webpage)
1173         if rutv_url:
1174             return self.url_result(rutv_url, 'RUTV')
1175
1176         # Look for embedded TED player
1177         mobj = re.search(
1178             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1179         if mobj is not None:
1180             return self.url_result(mobj.group('url'), 'TED')
1181
1182         # Look for embedded Ustream videos
1183         mobj = re.search(
1184             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1185         if mobj is not None:
1186             return self.url_result(mobj.group('url'), 'Ustream')
1187
1188         # Look for embedded arte.tv player
1189         mobj = re.search(
1190             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1191             webpage)
1192         if mobj is not None:
1193             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1194
1195         # Look for embedded smotri.com player
1196         smotri_url = SmotriIE._extract_url(webpage)
1197         if smotri_url:
1198             return self.url_result(smotri_url, 'Smotri')
1199
1200         # Look for embeded soundcloud player
1201         mobj = re.search(
1202             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1203             webpage)
1204         if mobj is not None:
1205             url = unescapeHTML(mobj.group('url'))
1206             return self.url_result(url)
1207
1208         # Look for embedded vulture.com player
1209         mobj = re.search(
1210             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1211             webpage)
1212         if mobj is not None:
1213             url = unescapeHTML(mobj.group('url'))
1214             return self.url_result(url, ie='Vulture')
1215
1216         # Look for embedded mtvservices player
1217         mobj = re.search(
1218             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1219             webpage)
1220         if mobj is not None:
1221             url = unescapeHTML(mobj.group('url'))
1222             return self.url_result(url, ie='MTVServicesEmbedded')
1223
1224         # Look for embedded yahoo player
1225         mobj = re.search(
1226             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1227             webpage)
1228         if mobj is not None:
1229             return self.url_result(mobj.group('url'), 'Yahoo')
1230
1231         # Look for embedded sbs.com.au player
1232         mobj = re.search(
1233             r'''(?x)
1234             (?:
1235                 <meta\s+property="og:video"\s+content=|
1236                 <iframe[^>]+?src=
1237             )
1238             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1239             webpage)
1240         if mobj is not None:
1241             return self.url_result(mobj.group('url'), 'SBS')
1242
1243         # Look for embedded Cinchcast player
1244         mobj = re.search(
1245             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1246             webpage)
1247         if mobj is not None:
1248             return self.url_result(mobj.group('url'), 'Cinchcast')
1249
1250         mobj = re.search(
1251             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1252             webpage)
1253         if mobj is not None:
1254             return self.url_result(mobj.group('url'), 'MLB')
1255
1256         mobj = re.search(
1257             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1258             webpage)
1259         if mobj is not None:
1260             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1261
1262         mobj = re.search(
1263             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1264             webpage)
1265         if mobj is not None:
1266             return self.url_result(mobj.group('url'), 'Livestream')
1267
1268         # Look for Zapiks embed
1269         mobj = re.search(
1270             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1271         if mobj is not None:
1272             return self.url_result(mobj.group('url'), 'Zapiks')
1273
1274         # Look for Kaltura embeds
1275         mobj = re.search(
1276             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1277         if mobj is not None:
1278             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1279
1280         # Look for Eagle.Platform embeds
1281         mobj = re.search(
1282             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1283         if mobj is not None:
1284             return self.url_result(mobj.group('url'), 'EaglePlatform')
1285
1286         # Look for ClipYou (uses Eagle.Platform) embeds
1287         mobj = re.search(
1288             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1289         if mobj is not None:
1290             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1291
1292         # Look for Pladform embeds
1293         mobj = re.search(
1294             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1295         if mobj is not None:
1296             return self.url_result(mobj.group('url'), 'Pladform')
1297
1298         # Look for 5min embeds
1299         mobj = re.search(
1300             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1301         if mobj is not None:
1302             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1303
1304         # Look for Crooks and Liars embeds
1305         mobj = re.search(
1306             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1307         if mobj is not None:
1308             return self.url_result(mobj.group('url'))
1309
1310         # Look for NBC Sports VPlayer embeds
1311         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1312         if nbc_sports_url:
1313             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1314
1315         # Look for UDN embeds
1316         mobj = re.search(
1317             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1318         if mobj is not None:
1319             return self.url_result(
1320                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1321
1322         def check_video(vurl):
1323             if YoutubeIE.suitable(vurl):
1324                 return True
1325             vpath = compat_urlparse.urlparse(vurl).path
1326             vext = determine_ext(vpath)
1327             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1328
1329         def filter_video(urls):
1330             return list(filter(check_video, urls))
1331
1332         # Start with something easy: JW Player in SWFObject
1333         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1334         if not found:
1335             # Look for gorilla-vid style embedding
1336             found = filter_video(re.findall(r'''(?sx)
1337                 (?:
1338                     jw_plugins|
1339                     JWPlayerOptions|
1340                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1341                 )
1342                 .*?
1343                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1344         if not found:
1345             # Broaden the search a little bit
1346             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1347         if not found:
1348             # Broaden the findall a little bit: JWPlayer JS loader
1349             found = filter_video(re.findall(
1350                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1351         if not found:
1352             # Flow player
1353             found = filter_video(re.findall(r'''(?xs)
1354                 flowplayer\("[^"]+",\s*
1355                     \{[^}]+?\}\s*,
1356                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1357                         ["']?url["']?\s*:\s*["']([^"']+)["']
1358             ''', webpage))
1359         if not found:
1360             # Cinerama player
1361             found = re.findall(
1362                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1363         if not found:
1364             # Try to find twitter cards info
1365             found = filter_video(re.findall(
1366                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1367         if not found:
1368             # We look for Open Graph info:
1369             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1370             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1371             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1372             if m_video_type is not None:
1373                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1374         if not found:
1375             # HTML5 video
1376             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1377         if not found:
1378             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1379             found = re.search(
1380                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1381                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1382                 webpage)
1383             if not found:
1384                 # Look also in Refresh HTTP header
1385                 refresh_header = head_response.headers.get('Refresh')
1386                 if refresh_header:
1387                     found = re.search(REDIRECT_REGEX, refresh_header)
1388             if found:
1389                 new_url = found.group(1)
1390                 self.report_following_redirect(new_url)
1391                 return {
1392                     '_type': 'url',
1393                     'url': new_url,
1394                 }
1395         if not found:
1396             raise UnsupportedError(url)
1397
1398         entries = []
1399         for video_url in found:
1400             video_url = compat_urlparse.urljoin(url, video_url)
1401             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1402
1403             # Sometimes, jwplayer extraction will result in a YouTube URL
1404             if YoutubeIE.suitable(video_url):
1405                 entries.append(self.url_result(video_url, 'Youtube'))
1406                 continue
1407
1408             # here's a fun little line of code for you:
1409             video_id = os.path.splitext(video_id)[0]
1410
1411             entries.append({
1412                 'id': video_id,
1413                 'url': video_url,
1414                 'uploader': video_uploader,
1415                 'title': video_title,
1416                 'age_limit': age_limit,
1417             })
1418
1419         if len(entries) == 1:
1420             return entries[0]
1421         else:
1422             for num, e in enumerate(entries, start=1):
1423                 # 'url' results don't have a title
1424                 if e.get('title') is not None:
1425                     e['title'] = '%s (%d)' % (e['title'], num)
1426             return {
1427                 '_type': 'playlist',
1428                 'entries': entries,
1429             }