Merge branch 'gfycat' of https://github.com/julianrichen/youtube-dl into julianrichen...
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urlparse,
13     compat_xml_parse_error,
14 )
15 from ..utils import (
16     determine_ext,
17     ExtractorError,
18     float_or_none,
19     HEADRequest,
20     is_html,
21     orderedSet,
22     parse_xml,
23     smuggle_url,
24     unescapeHTML,
25     unified_strdate,
26     unsmuggle_url,
27     UnsupportedError,
28     url_basename,
29     xpath_text,
30 )
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
37 from .udn import UDNEmbedIE
38
39
40 class GenericIE(InfoExtractor):
41     IE_DESC = 'Generic downloader that works on some sites'
42     _VALID_URL = r'.*'
43     IE_NAME = 'generic'
44     _TESTS = [
45         {
46             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
47             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
48             'info_dict': {
49                 'id': '13601338388002',
50                 'ext': 'mp4',
51                 'uploader': 'www.hodiho.fr',
52                 'title': 'R\u00e9gis plante sa Jeep',
53             }
54         },
55         # bandcamp page with custom domain
56         {
57             'add_ie': ['Bandcamp'],
58             'url': 'http://bronyrock.com/track/the-pony-mash',
59             'info_dict': {
60                 'id': '3235767654',
61                 'ext': 'mp3',
62                 'title': 'The Pony Mash',
63                 'uploader': 'M_Pallante',
64             },
65             'skip': 'There is a limit of 200 free downloads / month for the test song',
66         },
67         # embedded brightcove video
68         # it also tests brightcove videos that need to set the 'Referer' in the
69         # http requests
70         {
71             'add_ie': ['Brightcove'],
72             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
73             'info_dict': {
74                 'id': '2765128793001',
75                 'ext': 'mp4',
76                 'title': 'Le cours de bourse : l’analyse technique',
77                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
78                 'uploader': 'BFM BUSINESS',
79             },
80             'params': {
81                 'skip_download': True,
82             },
83         },
84         {
85             # https://github.com/rg3/youtube-dl/issues/2253
86             'url': 'http://bcove.me/i6nfkrc3',
87             'md5': '0ba9446db037002366bab3b3eb30c88c',
88             'info_dict': {
89                 'id': '3101154703001',
90                 'ext': 'mp4',
91                 'title': 'Still no power',
92                 'uploader': 'thestar.com',
93                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
94             },
95             'add_ie': ['Brightcove'],
96         },
97         {
98             'url': 'http://www.championat.com/video/football/v/87/87499.html',
99             'md5': 'fb973ecf6e4a78a67453647444222983',
100             'info_dict': {
101                 'id': '3414141473001',
102                 'ext': 'mp4',
103                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
104                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
105                 'uploader': 'Championat',
106             },
107         },
108         {
109             # https://github.com/rg3/youtube-dl/issues/3541
110             'add_ie': ['Brightcove'],
111             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
112             'info_dict': {
113                 'id': '3866516442001',
114                 'ext': 'mp4',
115                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
116                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
117                 'uploader': 'SBS Broadcasting',
118             },
119             'skip': 'Restricted to Netherlands',
120             'params': {
121                 'skip_download': True,  # m3u8 download
122             },
123         },
124         # Direct link to a video
125         {
126             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
127             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
128             'info_dict': {
129                 'id': 'trailer',
130                 'ext': 'mp4',
131                 'title': 'trailer',
132                 'upload_date': '20100513',
133             }
134         },
135         # ooyala video
136         {
137             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
138             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
139             'info_dict': {
140                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
141                 'ext': 'mp4',
142                 'title': '2cc213299525360.mov',  # that's what we get
143             },
144             'add_ie': ['Ooyala'],
145         },
146         # multiple ooyala embeds on SBN network websites
147         {
148             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
149             'info_dict': {
150                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
151                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
152             },
153             'playlist_mincount': 3,
154             'params': {
155                 'skip_download': True,
156             },
157             'add_ie': ['Ooyala'],
158         },
159         # google redirect
160         {
161             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
162             'info_dict': {
163                 'id': 'cmQHVoWB5FY',
164                 'ext': 'mp4',
165                 'upload_date': '20130224',
166                 'uploader_id': 'TheVerge',
167                 'description': 're:^Chris Ziegler takes a look at the\.*',
168                 'uploader': 'The Verge',
169                 'title': 'First Firefox OS phones side-by-side',
170             },
171             'params': {
172                 'skip_download': False,
173             }
174         },
175         # embed.ly video
176         {
177             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
178             'info_dict': {
179                 'id': '9ODmcdjQcHQ',
180                 'ext': 'mp4',
181                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
182                 'upload_date': '20140225',
183                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
184                 'uploader': 'Tested',
185                 'uploader_id': 'testedcom',
186             },
187             # No need to test YoutubeIE here
188             'params': {
189                 'skip_download': True,
190             },
191         },
192         # funnyordie embed
193         {
194             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
195             'info_dict': {
196                 'id': '18e820ec3f',
197                 'ext': 'mp4',
198                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
199                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
200             },
201         },
202         # BBC iPlayer embeds
203         {
204             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
205             'info_dict': {
206                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
207             },
208             'playlist_mincount': 18,
209         },
210         # RUTV embed
211         {
212             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
213             'info_dict': {
214                 'id': '776940',
215                 'ext': 'mp4',
216                 'title': 'Охотское море стало целиком российским',
217                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
218             },
219             'params': {
220                 # m3u8 download
221                 'skip_download': True,
222             },
223         },
224         # Embedded TED video
225         {
226             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
227             'md5': '65fdff94098e4a607385a60c5177c638',
228             'info_dict': {
229                 'id': '1969',
230                 'ext': 'mp4',
231                 'title': 'Hidden miracles of the natural world',
232                 'uploader': 'Louie Schwartzberg',
233                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
234             }
235         },
236         # Embedded Ustream video
237         {
238             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
239             'md5': '27b99cdb639c9b12a79bca876a073417',
240             'info_dict': {
241                 'id': '45734260',
242                 'ext': 'flv',
243                 'uploader': 'AU SPA:  The NSA and Privacy',
244                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
245             }
246         },
247         # nowvideo embed hidden behind percent encoding
248         {
249             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
250             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
251             'info_dict': {
252                 'id': '06e53103ca9aa',
253                 'ext': 'flv',
254                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
255                 'description': 'No description',
256             },
257         },
258         # arte embed
259         {
260             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
261             'md5': '7653032cbb25bf6c80d80f217055fa43',
262             'info_dict': {
263                 'id': '048195-004_PLUS7-F',
264                 'ext': 'flv',
265                 'title': 'X:enius',
266                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
267                 'upload_date': '20140320',
268             },
269             'params': {
270                 'skip_download': 'Requires rtmpdump'
271             }
272         },
273         # Condé Nast embed
274         {
275             'url': 'http://www.wired.com/2014/04/honda-asimo/',
276             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
277             'info_dict': {
278                 'id': '53501be369702d3275860000',
279                 'ext': 'mp4',
280                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
281             }
282         },
283         # Dailymotion embed
284         {
285             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
286             'md5': '441aeeb82eb72c422c7f14ec533999cd',
287             'info_dict': {
288                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
289                 'ext': 'mp4',
290                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
291                 'uploader': 'Spi0n',
292             },
293             'add_ie': ['Dailymotion'],
294         },
295         # YouTube embed
296         {
297             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
298             'info_dict': {
299                 'id': 'FXRb4ykk4S0',
300                 'ext': 'mp4',
301                 'title': 'The NBL Auction 2014',
302                 'uploader': 'BADMINTON England',
303                 'uploader_id': 'BADMINTONEvents',
304                 'upload_date': '20140603',
305                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
306             },
307             'add_ie': ['Youtube'],
308             'params': {
309                 'skip_download': True,
310             }
311         },
312         # MTVServices embed
313         {
314             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
315             'md5': '35727f82f58c76d996fc188f9755b0d5',
316             'info_dict': {
317                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
318                 'ext': 'mp4',
319                 'title': 'Review',
320                 'description': 'Mario\'s life in the fast lane has never looked so good.',
321             },
322         },
323         # YouTube embed via <data-embed-url="">
324         {
325             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
326             'info_dict': {
327                 'id': '4vAffPZIT44',
328                 'ext': 'mp4',
329                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
330                 'uploader': 'Gameloft',
331                 'uploader_id': 'gameloft',
332                 'upload_date': '20140828',
333                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
334             },
335             'params': {
336                 'skip_download': True,
337             }
338         },
339         # Camtasia studio
340         {
341             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
342             'playlist': [{
343                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
344                 'info_dict': {
345                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
346                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
347                     'ext': 'flv',
348                     'duration': 2235.90,
349                 }
350             }, {
351                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
352                 'info_dict': {
353                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
354                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
355                     'ext': 'flv',
356                     'duration': 2235.93,
357                 }
358             }],
359             'info_dict': {
360                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
361             }
362         },
363         # Flowplayer
364         {
365             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
366             'md5': '9d65602bf31c6e20014319c7d07fba27',
367             'info_dict': {
368                 'id': '5123ea6d5e5a7',
369                 'ext': 'mp4',
370                 'age_limit': 18,
371                 'uploader': 'www.handjobhub.com',
372                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
373             }
374         },
375         # RSS feed
376         {
377             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
378             'info_dict': {
379                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
380                 'title': 'Zero Punctuation',
381                 'description': 're:.*groundbreaking video review series.*'
382             },
383             'playlist_mincount': 11,
384         },
385         # Multiple brightcove videos
386         # https://github.com/rg3/youtube-dl/issues/2283
387         {
388             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
389             'info_dict': {
390                 'id': 'always-never',
391                 'title': 'Always / Never - The New Yorker',
392             },
393             'playlist_count': 3,
394             'params': {
395                 'extract_flat': False,
396                 'skip_download': True,
397             }
398         },
399         # MLB embed
400         {
401             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
402             'md5': '96f09a37e44da40dd083e12d9a683327',
403             'info_dict': {
404                 'id': '33322633',
405                 'ext': 'mp4',
406                 'title': 'Ump changes call to ball',
407                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
408                 'duration': 48,
409                 'timestamp': 1401537900,
410                 'upload_date': '20140531',
411                 'thumbnail': 're:^https?://.*\.jpg$',
412             },
413         },
414         # Wistia embed
415         {
416             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
417             'md5': '8788b683c777a5cf25621eaf286d0c23',
418             'info_dict': {
419                 'id': '1cfaf6b7ea',
420                 'ext': 'mov',
421                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
422                 'duration': 643.0,
423                 'filesize': 182808282,
424                 'uploader': 'education-portal.com',
425             },
426         },
427         {
428             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
429             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
430             'info_dict': {
431                 'id': 'uxjb0lwrcz',
432                 'ext': 'mp4',
433                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
434                 'duration': 1715.0,
435                 'uploader': 'thoughtworks.wistia.com',
436             },
437         },
438         # Direct download with broken HEAD
439         {
440             'url': 'http://ai-radio.org:8000/radio.opus',
441             'info_dict': {
442                 'id': 'radio',
443                 'ext': 'opus',
444                 'title': 'radio',
445             },
446             'params': {
447                 'skip_download': True,  # infinite live stream
448             },
449             'expected_warnings': [
450                 r'501.*Not Implemented'
451             ],
452         },
453         # Soundcloud embed
454         {
455             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
456             'info_dict': {
457                 'id': '174391317',
458                 'ext': 'mp3',
459                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
460                 'uploader': 'Sophos Security',
461                 'title': 'Chet Chat 171 - Oct 29, 2014',
462                 'upload_date': '20141029',
463             }
464         },
465         # Livestream embed
466         {
467             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
468             'info_dict': {
469                 'id': '67864563',
470                 'ext': 'flv',
471                 'upload_date': '20141112',
472                 'title': 'Rosetta #CometLanding webcast HL 10',
473             }
474         },
475         # LazyYT
476         {
477             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
478             'info_dict': {
479                 'id': '1986',
480                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
481             },
482             'playlist_mincount': 2,
483         },
484         # Direct link with incorrect MIME type
485         {
486             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
487             'md5': '4ccbebe5f36706d85221f204d7eb5913',
488             'info_dict': {
489                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
490                 'id': '5_Lennart_Poettering_-_Systemd',
491                 'ext': 'webm',
492                 'title': '5_Lennart_Poettering_-_Systemd',
493                 'upload_date': '20141120',
494             },
495             'expected_warnings': [
496                 'URL could be a direct video link, returning it as such.'
497             ]
498         },
499         # Cinchcast embed
500         {
501             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
502             'info_dict': {
503                 'id': '7141703',
504                 'ext': 'mp3',
505                 'upload_date': '20141126',
506                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
507             }
508         },
509         # Cinerama player
510         {
511             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
512             'info_dict': {
513                 'id': '730m_DandD_1901_512k',
514                 'ext': 'mp4',
515                 'uploader': 'www.abc.net.au',
516                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
517             }
518         },
519         # embedded viddler video
520         {
521             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
522             'info_dict': {
523                 'id': '4d03aad9',
524                 'ext': 'mp4',
525                 'uploader': 'deadspin',
526                 'title': 'WALL-TO-GORTAT',
527                 'timestamp': 1422285291,
528                 'upload_date': '20150126',
529             },
530             'add_ie': ['Viddler'],
531         },
532         # Libsyn embed
533         {
534             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
535             'info_dict': {
536                 'id': '3377616',
537                 'ext': 'mp3',
538                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
539                 'description': 'md5:601cb790edd05908957dae8aaa866465',
540                 'upload_date': '20150220',
541             },
542         },
543         # jwplayer YouTube
544         {
545             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
546             'info_dict': {
547                 'id': 'Mrj4DVp2zeA',
548                 'ext': 'mp4',
549                 'upload_date': '20150212',
550                 'uploader': 'The National Archives UK',
551                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
552                 'uploader_id': 'NationalArchives08',
553                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
554             },
555         },
556         # rtl.nl embed
557         {
558             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
559             'playlist_mincount': 5,
560             'info_dict': {
561                 'id': 'aanslagen-kopenhagen',
562                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
563             }
564         },
565         # Zapiks embed
566         {
567             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
568             'info_dict': {
569                 'id': '118046',
570                 'ext': 'mp4',
571                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
572             }
573         },
574         # Kaltura embed
575         {
576             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
577             'info_dict': {
578                 'id': '1_eergr3h1',
579                 'ext': 'mp4',
580                 'upload_date': '20150226',
581                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
582                 'timestamp': int,
583                 'title': 'John Carlson Postgame 2/25/15',
584             },
585         },
586         # Eagle.Platform embed (generic URL)
587         {
588             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
589             'info_dict': {
590                 'id': '227304',
591                 'ext': 'mp4',
592                 'title': 'Навальный вышел на свободу',
593                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
594                 'thumbnail': 're:^https?://.*\.jpg$',
595                 'duration': 87,
596                 'view_count': int,
597                 'age_limit': 0,
598             },
599         },
600         # ClipYou (Eagle.Platform) embed (custom URL)
601         {
602             'url': 'http://muz-tv.ru/play/7129/',
603             'info_dict': {
604                 'id': '12820',
605                 'ext': 'mp4',
606                 'title': "'O Sole Mio",
607                 'thumbnail': 're:^https?://.*\.jpg$',
608                 'duration': 216,
609                 'view_count': int,
610             },
611         },
612         # Pladform embed
613         {
614             'url': 'http://muz-tv.ru/kinozal/view/7400/',
615             'info_dict': {
616                 'id': '100183293',
617                 'ext': 'mp4',
618                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
619                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
620                 'thumbnail': 're:^https?://.*\.jpg$',
621                 'duration': 694,
622                 'age_limit': 0,
623             },
624         },
625         # Playwire embed
626         {
627             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
628             'info_dict': {
629                 'id': '3519514',
630                 'ext': 'mp4',
631                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
632                 'thumbnail': 're:^https?://.*\.png$',
633                 'duration': 45.115,
634             },
635         },
636         # 5min embed
637         {
638             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
639             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
640             'info_dict': {
641                 'id': '518726732',
642                 'ext': 'mp4',
643                 'title': 'Facebook Creates "On This Day" | Crunch Report',
644             },
645         },
646         # RSS feed with enclosure
647         {
648             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
649             'info_dict': {
650                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
651                 'ext': 'm4v',
652                 'upload_date': '20150228',
653                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
654             }
655         },
656         # Crooks and Liars embed
657         {
658             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
659             'info_dict': {
660                 'id': '8RUoRhRi',
661                 'ext': 'mp4',
662                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
663                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
664                 'timestamp': 1428207000,
665                 'upload_date': '20150405',
666                 'uploader': 'Heather',
667             },
668         },
669         # Crooks and Liars external embed
670         {
671             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
672             'info_dict': {
673                 'id': 'MTE3MjUtMzQ2MzA',
674                 'ext': 'mp4',
675                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
676                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
677                 'timestamp': 1265032391,
678                 'upload_date': '20100201',
679                 'uploader': 'Heather',
680             },
681         },
682         # NBC Sports vplayer embed
683         {
684             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
685             'info_dict': {
686                 'id': 'ln7x1qSThw4k',
687                 'ext': 'flv',
688                 'title': "PFT Live: New leader in the 'new-look' defense",
689                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
690             },
691         },
692         # UDN embed
693         {
694             'url': 'http://www.udn.com/news/story/7314/822787',
695             'md5': 'fd2060e988c326991037b9aff9df21a6',
696             'info_dict': {
697                 'id': '300346',
698                 'ext': 'mp4',
699                 'title': '中一中男師變性 全校師生力挺',
700                 'thumbnail': 're:^https?://.*\.jpg$',
701             }
702         },
703         # Ooyala embed
704         {
705             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
706             'info_dict': {
707                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
708                 'ext': 'mp4',
709                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
710                 'title': 'This is what separates the Excel masters from the wannabes',
711             },
712             'params': {
713                 # m3u8 downloads
714                 'skip_download': True,
715             }
716         },
717         # Contains a SMIL manifest
718         {
719             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
720             'info_dict': {
721                 'id': 'file',
722                 'ext': 'flv',
723                 'title': '+ Football: Lottery Champions League Europe',
724                 'uploader': 'www.telewebion.com',
725             },
726             'params': {
727                 # rtmpe downloads
728                 'skip_download': True,
729             }
730         }
731     ]
732
733     def report_following_redirect(self, new_url):
734         """Report information extraction."""
735         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
736
737     def _extract_rss(self, url, video_id, doc):
738         playlist_title = doc.find('./channel/title').text
739         playlist_desc_el = doc.find('./channel/description')
740         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
741
742         entries = []
743         for it in doc.findall('./channel/item'):
744             next_url = xpath_text(it, 'link', fatal=False)
745             if not next_url:
746                 enclosure_nodes = it.findall('./enclosure')
747                 for e in enclosure_nodes:
748                     next_url = e.attrib.get('url')
749                     if next_url:
750                         break
751
752             if not next_url:
753                 continue
754
755             entries.append({
756                 '_type': 'url',
757                 'url': next_url,
758                 'title': it.find('title').text,
759             })
760
761         return {
762             '_type': 'playlist',
763             'id': url,
764             'title': playlist_title,
765             'description': playlist_desc,
766             'entries': entries,
767         }
768
769     def _extract_camtasia(self, url, video_id, webpage):
770         """ Returns None if no camtasia video can be found. """
771
772         camtasia_cfg = self._search_regex(
773             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
774             webpage, 'camtasia configuration file', default=None)
775         if camtasia_cfg is None:
776             return None
777
778         title = self._html_search_meta('DC.title', webpage, fatal=True)
779
780         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
781         camtasia_cfg = self._download_xml(
782             camtasia_url, video_id,
783             note='Downloading camtasia configuration',
784             errnote='Failed to download camtasia configuration')
785         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
786
787         entries = []
788         for n in fileset_node.getchildren():
789             url_n = n.find('./uri')
790             if url_n is None:
791                 continue
792
793             entries.append({
794                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
795                 'title': '%s - %s' % (title, n.tag),
796                 'url': compat_urlparse.urljoin(url, url_n.text),
797                 'duration': float_or_none(n.find('./duration').text),
798             })
799
800         return {
801             '_type': 'playlist',
802             'entries': entries,
803             'title': title,
804         }
805
806     def _real_extract(self, url):
807         if url.startswith('//'):
808             return {
809                 '_type': 'url',
810                 'url': self.http_scheme() + url,
811             }
812
813         parsed_url = compat_urlparse.urlparse(url)
814         if not parsed_url.scheme:
815             default_search = self._downloader.params.get('default_search')
816             if default_search is None:
817                 default_search = 'fixup_error'
818
819             if default_search in ('auto', 'auto_warning', 'fixup_error'):
820                 if '/' in url:
821                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
822                     return self.url_result('http://' + url)
823                 elif default_search != 'fixup_error':
824                     if default_search == 'auto_warning':
825                         if re.match(r'^(?:url|URL)$', url):
826                             raise ExtractorError(
827                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
828                                 expected=True)
829                         else:
830                             self._downloader.report_warning(
831                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
832                     return self.url_result('ytsearch:' + url)
833
834             if default_search in ('error', 'fixup_error'):
835                 raise ExtractorError(
836                     '%r is not a valid URL. '
837                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
838                     % (url, url), expected=True)
839             else:
840                 if ':' not in default_search:
841                     default_search += ':'
842                 return self.url_result(default_search + url)
843
844         url, smuggled_data = unsmuggle_url(url)
845         force_videoid = None
846         is_intentional = smuggled_data and smuggled_data.get('to_generic')
847         if smuggled_data and 'force_videoid' in smuggled_data:
848             force_videoid = smuggled_data['force_videoid']
849             video_id = force_videoid
850         else:
851             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
852
853         self.to_screen('%s: Requesting header' % video_id)
854
855         head_req = HEADRequest(url)
856         head_response = self._request_webpage(
857             head_req, video_id,
858             note=False, errnote='Could not send HEAD request to %s' % url,
859             fatal=False)
860
861         if head_response is not False:
862             # Check for redirect
863             new_url = head_response.geturl()
864             if url != new_url:
865                 self.report_following_redirect(new_url)
866                 if force_videoid:
867                     new_url = smuggle_url(
868                         new_url, {'force_videoid': force_videoid})
869                 return self.url_result(new_url)
870
871         full_response = None
872         if head_response is False:
873             full_response = self._request_webpage(url, video_id)
874             head_response = full_response
875
876         # Check for direct link to a video
877         content_type = head_response.headers.get('Content-Type', '')
878         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
879         if m:
880             upload_date = unified_strdate(
881                 head_response.headers.get('Last-Modified'))
882             return {
883                 'id': video_id,
884                 'title': os.path.splitext(url_basename(url))[0],
885                 'direct': True,
886                 'formats': [{
887                     'format_id': m.group('format_id'),
888                     'url': url,
889                     'vcodec': 'none' if m.group('type') == 'audio' else None
890                 }],
891                 'upload_date': upload_date,
892             }
893
894         if not self._downloader.params.get('test', False) and not is_intentional:
895             self._downloader.report_warning('Falling back on generic information extractor.')
896
897         if not full_response:
898             full_response = self._request_webpage(url, video_id)
899
900         # Maybe it's a direct link to a video?
901         # Be careful not to download the whole thing!
902         first_bytes = full_response.read(512)
903         if not is_html(first_bytes):
904             self._downloader.report_warning(
905                 'URL could be a direct video link, returning it as such.')
906             upload_date = unified_strdate(
907                 head_response.headers.get('Last-Modified'))
908             return {
909                 'id': video_id,
910                 'title': os.path.splitext(url_basename(url))[0],
911                 'direct': True,
912                 'url': url,
913                 'upload_date': upload_date,
914             }
915
916         webpage = self._webpage_read_content(
917             full_response, url, video_id, prefix=first_bytes)
918
919         self.report_extraction(video_id)
920
921         # Is it an RSS feed?
922         try:
923             doc = parse_xml(webpage)
924             if doc.tag == 'rss':
925                 return self._extract_rss(url, video_id, doc)
926         except compat_xml_parse_error:
927             pass
928
929         # Is it a Camtasia project?
930         camtasia_res = self._extract_camtasia(url, video_id, webpage)
931         if camtasia_res is not None:
932             return camtasia_res
933
934         # Sometimes embedded video player is hidden behind percent encoding
935         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
936         # Unescaping the whole page allows to handle those cases in a generic way
937         webpage = compat_urllib_parse.unquote(webpage)
938
939         # it's tempting to parse this further, but you would
940         # have to take into account all the variations like
941         #   Video Title - Site Name
942         #   Site Name | Video Title
943         #   Video Title - Tagline | Site Name
944         # and so on and so forth; it's just not practical
945         video_title = self._html_search_regex(
946             r'(?s)<title>(.*?)</title>', webpage, 'video title',
947             default='video')
948
949         # Try to detect age limit automatically
950         age_limit = self._rta_search(webpage)
951         # And then there are the jokers who advertise that they use RTA,
952         # but actually don't.
953         AGE_LIMIT_MARKERS = [
954             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
955         ]
956         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
957             age_limit = 18
958
959         # video uploader is domain name
960         video_uploader = self._search_regex(
961             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
962
963         # Helper method
964         def _playlist_from_matches(matches, getter=None, ie=None):
965             urlrs = orderedSet(
966                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
967                 for m in matches)
968             return self.playlist_result(
969                 urlrs, playlist_id=video_id, playlist_title=video_title)
970
971         # Look for BrightCove:
972         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
973         if bc_urls:
974             self.to_screen('Brightcove video detected.')
975             entries = [{
976                 '_type': 'url',
977                 'url': smuggle_url(bc_url, {'Referer': url}),
978                 'ie_key': 'Brightcove'
979             } for bc_url in bc_urls]
980
981             return {
982                 '_type': 'playlist',
983                 'title': video_title,
984                 'id': video_id,
985                 'entries': entries,
986             }
987
988         # Look for embedded rtl.nl player
989         matches = re.findall(
990             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
991             webpage)
992         if matches:
993             return _playlist_from_matches(matches, ie='RtlNl')
994
995         # Look for embedded (iframe) Vimeo player
996         mobj = re.search(
997             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
998         if mobj:
999             player_url = unescapeHTML(mobj.group('url'))
1000             surl = smuggle_url(player_url, {'Referer': url})
1001             return self.url_result(surl)
1002         # Look for embedded (swf embed) Vimeo player
1003         mobj = re.search(
1004             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1005         if mobj:
1006             return self.url_result(mobj.group(1))
1007
1008         # Look for embedded YouTube player
1009         matches = re.findall(r'''(?x)
1010             (?:
1011                 <iframe[^>]+?src=|
1012                 data-video-url=|
1013                 <embed[^>]+?src=|
1014                 embedSWF\(?:\s*|
1015                 new\s+SWFObject\(
1016             )
1017             (["\'])
1018                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1019                 (?:embed|v|p)/.+?)
1020             \1''', webpage)
1021         if matches:
1022             return _playlist_from_matches(
1023                 matches, lambda m: unescapeHTML(m[1]))
1024
1025         # Look for lazyYT YouTube embed
1026         matches = re.findall(
1027             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1028         if matches:
1029             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1030
1031         # Look for embedded Dailymotion player
1032         matches = re.findall(
1033             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1034         if matches:
1035             return _playlist_from_matches(
1036                 matches, lambda m: unescapeHTML(m[1]))
1037
1038         # Look for embedded Dailymotion playlist player (#3822)
1039         m = re.search(
1040             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1041         if m:
1042             playlists = re.findall(
1043                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1044             if playlists:
1045                 return _playlist_from_matches(
1046                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1047
1048         # Look for embedded Wistia player
1049         match = re.search(
1050             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1051         if match:
1052             embed_url = self._proto_relative_url(
1053                 unescapeHTML(match.group('url')))
1054             return {
1055                 '_type': 'url_transparent',
1056                 'url': embed_url,
1057                 'ie_key': 'Wistia',
1058                 'uploader': video_uploader,
1059                 'title': video_title,
1060                 'id': video_id,
1061             }
1062
1063         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1064         if match:
1065             return {
1066                 '_type': 'url_transparent',
1067                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1068                 'ie_key': 'Wistia',
1069                 'uploader': video_uploader,
1070                 'title': video_title,
1071                 'id': match.group('id')
1072             }
1073
1074         # Look for embedded blip.tv player
1075         mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
1076         if mobj:
1077             return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1078         mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1079         if mobj:
1080             return self.url_result(mobj.group(1), 'BlipTV')
1081
1082         # Look for embedded condenast player
1083         matches = re.findall(
1084             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1085             webpage)
1086         if matches:
1087             return {
1088                 '_type': 'playlist',
1089                 'entries': [{
1090                     '_type': 'url',
1091                     'ie_key': 'CondeNast',
1092                     'url': ma,
1093                 } for ma in matches],
1094                 'title': video_title,
1095                 'id': video_id,
1096             }
1097
1098         # Look for Bandcamp pages with custom domain
1099         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1100         if mobj is not None:
1101             burl = unescapeHTML(mobj.group(1))
1102             # Don't set the extractor because it can be a track url or an album
1103             return self.url_result(burl)
1104
1105         # Look for embedded Vevo player
1106         mobj = re.search(
1107             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1108         if mobj is not None:
1109             return self.url_result(mobj.group('url'))
1110
1111         # Look for embedded Viddler player
1112         mobj = re.search(
1113             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1114             webpage)
1115         if mobj is not None:
1116             return self.url_result(mobj.group('url'))
1117
1118         # Look for NYTimes player
1119         mobj = re.search(
1120             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1121             webpage)
1122         if mobj is not None:
1123             return self.url_result(mobj.group('url'))
1124
1125         # Look for Libsyn player
1126         mobj = re.search(
1127             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1128         if mobj is not None:
1129             return self.url_result(mobj.group('url'))
1130
1131         # Look for Ooyala videos
1132         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1133                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1134                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1135                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1136         if mobj is not None:
1137             return OoyalaIE._build_url_result(mobj.group('ec'))
1138
1139         # Look for multiple Ooyala embeds on SBN network websites
1140         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1141         if mobj is not None:
1142             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1143             if embeds:
1144                 return _playlist_from_matches(
1145                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1146
1147         # Look for Aparat videos
1148         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1149         if mobj is not None:
1150             return self.url_result(mobj.group(1), 'Aparat')
1151
1152         # Look for MPORA videos
1153         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1154         if mobj is not None:
1155             return self.url_result(mobj.group(1), 'Mpora')
1156
1157         # Look for embedded NovaMov-based player
1158         mobj = re.search(
1159             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1160                     (?P<url>http://(?:(?:embed|www)\.)?
1161                         (?:novamov\.com|
1162                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1163                            videoweed\.(?:es|com)|
1164                            movshare\.(?:net|sx|ag)|
1165                            divxstage\.(?:eu|net|ch|co|at|ag))
1166                         /embed\.php.+?)\1''', webpage)
1167         if mobj is not None:
1168             return self.url_result(mobj.group('url'))
1169
1170         # Look for embedded Facebook player
1171         mobj = re.search(
1172             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1173         if mobj is not None:
1174             return self.url_result(mobj.group('url'), 'Facebook')
1175
1176         # Look for embedded VK player
1177         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1178         if mobj is not None:
1179             return self.url_result(mobj.group('url'), 'VK')
1180
1181         # Look for embedded ivi player
1182         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1183         if mobj is not None:
1184             return self.url_result(mobj.group('url'), 'Ivi')
1185
1186         # Look for embedded Huffington Post player
1187         mobj = re.search(
1188             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1189         if mobj is not None:
1190             return self.url_result(mobj.group('url'), 'HuffPost')
1191
1192         # Look for embed.ly
1193         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1194         if mobj is not None:
1195             return self.url_result(mobj.group('url'))
1196         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1197         if mobj is not None:
1198             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1199
1200         # Look for funnyordie embed
1201         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1202         if matches:
1203             return _playlist_from_matches(
1204                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1205
1206         # Look for BBC iPlayer embed
1207         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1208         if matches:
1209             return _playlist_from_matches(matches, ie='BBCCoUk')
1210
1211         # Look for embedded RUTV player
1212         rutv_url = RUTVIE._extract_url(webpage)
1213         if rutv_url:
1214             return self.url_result(rutv_url, 'RUTV')
1215
1216         # Look for embedded TED player
1217         mobj = re.search(
1218             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1219         if mobj is not None:
1220             return self.url_result(mobj.group('url'), 'TED')
1221
1222         # Look for embedded Ustream videos
1223         mobj = re.search(
1224             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1225         if mobj is not None:
1226             return self.url_result(mobj.group('url'), 'Ustream')
1227
1228         # Look for embedded arte.tv player
1229         mobj = re.search(
1230             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1231             webpage)
1232         if mobj is not None:
1233             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1234
1235         # Look for embedded smotri.com player
1236         smotri_url = SmotriIE._extract_url(webpage)
1237         if smotri_url:
1238             return self.url_result(smotri_url, 'Smotri')
1239
1240         # Look for embeded soundcloud player
1241         mobj = re.search(
1242             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1243             webpage)
1244         if mobj is not None:
1245             url = unescapeHTML(mobj.group('url'))
1246             return self.url_result(url)
1247
1248         # Look for embedded vulture.com player
1249         mobj = re.search(
1250             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1251             webpage)
1252         if mobj is not None:
1253             url = unescapeHTML(mobj.group('url'))
1254             return self.url_result(url, ie='Vulture')
1255
1256         # Look for embedded mtvservices player
1257         mobj = re.search(
1258             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1259             webpage)
1260         if mobj is not None:
1261             url = unescapeHTML(mobj.group('url'))
1262             return self.url_result(url, ie='MTVServicesEmbedded')
1263
1264         # Look for embedded yahoo player
1265         mobj = re.search(
1266             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1267             webpage)
1268         if mobj is not None:
1269             return self.url_result(mobj.group('url'), 'Yahoo')
1270
1271         # Look for embedded sbs.com.au player
1272         mobj = re.search(
1273             r'''(?x)
1274             (?:
1275                 <meta\s+property="og:video"\s+content=|
1276                 <iframe[^>]+?src=
1277             )
1278             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1279             webpage)
1280         if mobj is not None:
1281             return self.url_result(mobj.group('url'), 'SBS')
1282
1283         # Look for embedded Cinchcast player
1284         mobj = re.search(
1285             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1286             webpage)
1287         if mobj is not None:
1288             return self.url_result(mobj.group('url'), 'Cinchcast')
1289
1290         mobj = re.search(
1291             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1292             webpage)
1293         if mobj is not None:
1294             return self.url_result(mobj.group('url'), 'MLB')
1295
1296         mobj = re.search(
1297             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1298             webpage)
1299         if mobj is not None:
1300             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1301
1302         mobj = re.search(
1303             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1304             webpage)
1305         if mobj is not None:
1306             return self.url_result(mobj.group('url'), 'Livestream')
1307
1308         # Look for Zapiks embed
1309         mobj = re.search(
1310             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1311         if mobj is not None:
1312             return self.url_result(mobj.group('url'), 'Zapiks')
1313
1314         # Look for Kaltura embeds
1315         mobj = re.search(
1316             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1317         if mobj is not None:
1318             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1319
1320         # Look for Eagle.Platform embeds
1321         mobj = re.search(
1322             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1323         if mobj is not None:
1324             return self.url_result(mobj.group('url'), 'EaglePlatform')
1325
1326         # Look for ClipYou (uses Eagle.Platform) embeds
1327         mobj = re.search(
1328             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1329         if mobj is not None:
1330             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1331
1332         # Look for Pladform embeds
1333         mobj = re.search(
1334             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1335         if mobj is not None:
1336             return self.url_result(mobj.group('url'), 'Pladform')
1337
1338         # Look for Playwire embeds
1339         mobj = re.search(
1340             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1341         if mobj is not None:
1342             return self.url_result(mobj.group('url'))
1343
1344         # Look for 5min embeds
1345         mobj = re.search(
1346             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1347         if mobj is not None:
1348             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1349
1350         # Look for Crooks and Liars embeds
1351         mobj = re.search(
1352             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1353         if mobj is not None:
1354             return self.url_result(mobj.group('url'))
1355
1356         # Look for NBC Sports VPlayer embeds
1357         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1358         if nbc_sports_url:
1359             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1360
1361         # Look for UDN embeds
1362         mobj = re.search(
1363             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1364         if mobj is not None:
1365             return self.url_result(
1366                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1367
1368         def check_video(vurl):
1369             if YoutubeIE.suitable(vurl):
1370                 return True
1371             vpath = compat_urlparse.urlparse(vurl).path
1372             vext = determine_ext(vpath)
1373             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1374
1375         def filter_video(urls):
1376             return list(filter(check_video, urls))
1377
1378         # Start with something easy: JW Player in SWFObject
1379         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1380         if not found:
1381             # Look for gorilla-vid style embedding
1382             found = filter_video(re.findall(r'''(?sx)
1383                 (?:
1384                     jw_plugins|
1385                     JWPlayerOptions|
1386                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1387                 )
1388                 .*?
1389                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1390         if not found:
1391             # Broaden the search a little bit
1392             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1393         if not found:
1394             # Broaden the findall a little bit: JWPlayer JS loader
1395             found = filter_video(re.findall(
1396                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1397         if not found:
1398             # Flow player
1399             found = filter_video(re.findall(r'''(?xs)
1400                 flowplayer\("[^"]+",\s*
1401                     \{[^}]+?\}\s*,
1402                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1403                         ["']?url["']?\s*:\s*["']([^"']+)["']
1404             ''', webpage))
1405         if not found:
1406             # Cinerama player
1407             found = re.findall(
1408                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1409         if not found:
1410             # Try to find twitter cards info
1411             found = filter_video(re.findall(
1412                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1413         if not found:
1414             # We look for Open Graph info:
1415             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1416             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1417             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1418             if m_video_type is not None:
1419                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1420         if not found:
1421             # HTML5 video
1422             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1423         if not found:
1424             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1425             found = re.search(
1426                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1427                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1428                 webpage)
1429             if not found:
1430                 # Look also in Refresh HTTP header
1431                 refresh_header = head_response.headers.get('Refresh')
1432                 if refresh_header:
1433                     found = re.search(REDIRECT_REGEX, refresh_header)
1434             if found:
1435                 new_url = found.group(1)
1436                 self.report_following_redirect(new_url)
1437                 return {
1438                     '_type': 'url',
1439                     'url': new_url,
1440                 }
1441         if not found:
1442             raise UnsupportedError(url)
1443
1444         entries = []
1445         for video_url in found:
1446             video_url = compat_urlparse.urljoin(url, video_url)
1447             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1448
1449             # Sometimes, jwplayer extraction will result in a YouTube URL
1450             if YoutubeIE.suitable(video_url):
1451                 entries.append(self.url_result(video_url, 'Youtube'))
1452                 continue
1453
1454             # Strip the file extension so the id is the bare file name:
1455             video_id = os.path.splitext(video_id)[0]
1456
1457             if determine_ext(video_url) == 'smil':
1458                 entries.append({
1459                     'id': video_id,
1460                     'formats': self._extract_smil_formats(video_url, video_id),
1461                     'uploader': video_uploader,
1462                     'title': video_title,
1463                     'age_limit': age_limit,
1464                 })
1465             else:
1466                 entries.append({
1467                     'id': video_id,
1468                     'url': video_url,
1469                     'uploader': video_uploader,
1470                     'title': video_title,
1471                     'age_limit': age_limit,
1472                 })
1473
1474         if len(entries) == 1:
1475             return entries[0]
1476         else:
1477             for num, e in enumerate(entries, start=1):
1478                 # 'url' results don't have a title
1479                 if e.get('title') is not None:
1480                     e['title'] = '%s (%d)' % (e['title'], num)
1481             return {
1482                 '_type': 'playlist',
1483                 'entries': entries,
1484             }