[extractor/generic] Force Accept-Encoding to any for extraction pass
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urlparse,
13     compat_xml_parse_error,
14     compat_urllib_request,
15 )
16 from ..utils import (
17     determine_ext,
18     ExtractorError,
19     float_or_none,
20     HEADRequest,
21     is_html,
22     orderedSet,
23     parse_xml,
24     smuggle_url,
25     unescapeHTML,
26     unified_strdate,
27     unsmuggle_url,
28     UnsupportedError,
29     url_basename,
30     xpath_text,
31 )
32 from .brightcove import BrightcoveIE
33 from .nbc import NBCSportsVPlayerIE
34 from .ooyala import OoyalaIE
35 from .rutv import RUTVIE
36 from .sportbox import SportBoxEmbedIE
37 from .smotri import SmotriIE
38 from .condenast import CondeNastIE
39 from .udn import UDNEmbedIE
40 from .senateisvp import SenateISVPIE
41 from .bliptv import BlipTVIE
42 from .svt import SVTIE
43
44
45 class GenericIE(InfoExtractor):
46     IE_DESC = 'Generic downloader that works on some sites'
47     _VALID_URL = r'.*'
48     IE_NAME = 'generic'
49     _TESTS = [
50         {
51             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
52             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
53             'info_dict': {
54                 'id': '13601338388002',
55                 'ext': 'mp4',
56                 'uploader': 'www.hodiho.fr',
57                 'title': 'R\u00e9gis plante sa Jeep',
58             }
59         },
60         # bandcamp page with custom domain
61         {
62             'add_ie': ['Bandcamp'],
63             'url': 'http://bronyrock.com/track/the-pony-mash',
64             'info_dict': {
65                 'id': '3235767654',
66                 'ext': 'mp3',
67                 'title': 'The Pony Mash',
68                 'uploader': 'M_Pallante',
69             },
70             'skip': 'There is a limit of 200 free downloads / month for the test song',
71         },
72         # embedded brightcove video
73         # it also tests brightcove videos that need to set the 'Referer' in the
74         # http requests
75         {
76             'add_ie': ['Brightcove'],
77             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
78             'info_dict': {
79                 'id': '2765128793001',
80                 'ext': 'mp4',
81                 'title': 'Le cours de bourse : l’analyse technique',
82                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
83                 'uploader': 'BFM BUSINESS',
84             },
85             'params': {
86                 'skip_download': True,
87             },
88         },
89         {
90             # https://github.com/rg3/youtube-dl/issues/2253
91             'url': 'http://bcove.me/i6nfkrc3',
92             'md5': '0ba9446db037002366bab3b3eb30c88c',
93             'info_dict': {
94                 'id': '3101154703001',
95                 'ext': 'mp4',
96                 'title': 'Still no power',
97                 'uploader': 'thestar.com',
98                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
99             },
100             'add_ie': ['Brightcove'],
101         },
102         {
103             'url': 'http://www.championat.com/video/football/v/87/87499.html',
104             'md5': 'fb973ecf6e4a78a67453647444222983',
105             'info_dict': {
106                 'id': '3414141473001',
107                 'ext': 'mp4',
108                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
109                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
110                 'uploader': 'Championat',
111             },
112         },
113         {
114             # https://github.com/rg3/youtube-dl/issues/3541
115             'add_ie': ['Brightcove'],
116             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
117             'info_dict': {
118                 'id': '3866516442001',
119                 'ext': 'mp4',
120                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
121                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
122                 'uploader': 'SBS Broadcasting',
123             },
124             'skip': 'Restricted to Netherlands',
125             'params': {
126                 'skip_download': True,  # m3u8 download
127             },
128         },
129         # Direct link to a video
130         {
131             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
132             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
133             'info_dict': {
134                 'id': 'trailer',
135                 'ext': 'mp4',
136                 'title': 'trailer',
137                 'upload_date': '20100513',
138             }
139         },
140         # ooyala video
141         {
142             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
143             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
144             'info_dict': {
145                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
146                 'ext': 'mp4',
147                 'title': '2cc213299525360.mov',  # that's what we get
148             },
149             'add_ie': ['Ooyala'],
150         },
151         # multiple ooyala embeds on SBN network websites
152         {
153             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
154             'info_dict': {
155                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
156                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
157             },
158             'playlist_mincount': 3,
159             'params': {
160                 'skip_download': True,
161             },
162             'add_ie': ['Ooyala'],
163         },
164         # google redirect
165         {
166             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
167             'info_dict': {
168                 'id': 'cmQHVoWB5FY',
169                 'ext': 'mp4',
170                 'upload_date': '20130224',
171                 'uploader_id': 'TheVerge',
172                 'description': 're:^Chris Ziegler takes a look at the\.*',
173                 'uploader': 'The Verge',
174                 'title': 'First Firefox OS phones side-by-side',
175             },
176             'params': {
177                 'skip_download': False,
178             }
179         },
180         # embed.ly video
181         {
182             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
183             'info_dict': {
184                 'id': '9ODmcdjQcHQ',
185                 'ext': 'mp4',
186                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
187                 'upload_date': '20140225',
188                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
189                 'uploader': 'Tested',
190                 'uploader_id': 'testedcom',
191             },
192             # No need to test YoutubeIE here
193             'params': {
194                 'skip_download': True,
195             },
196         },
197         # funnyordie embed
198         {
199             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
200             'info_dict': {
201                 'id': '18e820ec3f',
202                 'ext': 'mp4',
203                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
204                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
205             },
206         },
207         # BBC iPlayer embeds
208         {
209             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
210             'info_dict': {
211                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
212             },
213             'playlist_mincount': 18,
214         },
215         # RUTV embed
216         {
217             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
218             'info_dict': {
219                 'id': '776940',
220                 'ext': 'mp4',
221                 'title': 'Охотское море стало целиком российским',
222                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
223             },
224             'params': {
225                 # m3u8 download
226                 'skip_download': True,
227             },
228         },
229         # SportBox embed
230         {
231             'url': 'http://www.vestifinance.ru/articles/25753',
232             'info_dict': {
233                 'id': '25753',
234                 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
235             },
236             'playlist': [{
237                 'info_dict': {
238                     'id': '370908',
239                     'title': 'Госзаказ. День 3',
240                     'ext': 'mp4',
241                 }
242             }, {
243                 'info_dict': {
244                     'id': '370905',
245                     'title': 'Госзаказ. День 2',
246                     'ext': 'mp4',
247                 }
248             }, {
249                 'info_dict': {
250                     'id': '370902',
251                     'title': 'Госзаказ. День 1',
252                     'ext': 'mp4',
253                 }
254             }],
255             'params': {
256                 # m3u8 download
257                 'skip_download': True,
258             },
259         },
260         # Embedded TED video
261         {
262             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
263             'md5': '65fdff94098e4a607385a60c5177c638',
264             'info_dict': {
265                 'id': '1969',
266                 'ext': 'mp4',
267                 'title': 'Hidden miracles of the natural world',
268                 'uploader': 'Louie Schwartzberg',
269                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
270             }
271         },
272         # Embedded Ustream video
273         {
274             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
275             'md5': '27b99cdb639c9b12a79bca876a073417',
276             'info_dict': {
277                 'id': '45734260',
278                 'ext': 'flv',
279                 'uploader': 'AU SPA:  The NSA and Privacy',
280                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
281             }
282         },
283         # nowvideo embed hidden behind percent encoding
284         {
285             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
286             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
287             'info_dict': {
288                 'id': '06e53103ca9aa',
289                 'ext': 'flv',
290                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
291                 'description': 'No description',
292             },
293         },
294         # arte embed
295         {
296             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
297             'md5': '7653032cbb25bf6c80d80f217055fa43',
298             'info_dict': {
299                 'id': '048195-004_PLUS7-F',
300                 'ext': 'flv',
301                 'title': 'X:enius',
302                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
303                 'upload_date': '20140320',
304             },
305             'params': {
306                 'skip_download': 'Requires rtmpdump'
307             }
308         },
309         # Condé Nast embed
310         {
311             'url': 'http://www.wired.com/2014/04/honda-asimo/',
312             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
313             'info_dict': {
314                 'id': '53501be369702d3275860000',
315                 'ext': 'mp4',
316                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
317             }
318         },
319         # Dailymotion embed
320         {
321             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
322             'md5': '441aeeb82eb72c422c7f14ec533999cd',
323             'info_dict': {
324                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
325                 'ext': 'mp4',
326                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
327                 'uploader': 'Spi0n',
328             },
329             'add_ie': ['Dailymotion'],
330         },
331         # YouTube embed
332         {
333             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
334             'info_dict': {
335                 'id': 'FXRb4ykk4S0',
336                 'ext': 'mp4',
337                 'title': 'The NBL Auction 2014',
338                 'uploader': 'BADMINTON England',
339                 'uploader_id': 'BADMINTONEvents',
340                 'upload_date': '20140603',
341                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
342             },
343             'add_ie': ['Youtube'],
344             'params': {
345                 'skip_download': True,
346             }
347         },
348         # MTVServices embed
349         {
350             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
351             'md5': '35727f82f58c76d996fc188f9755b0d5',
352             'info_dict': {
353                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
354                 'ext': 'mp4',
355                 'title': 'Review',
356                 'description': 'Mario\'s life in the fast lane has never looked so good.',
357             },
358         },
359         # YouTube embed via <data-embed-url="">
360         {
361             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
362             'info_dict': {
363                 'id': '4vAffPZIT44',
364                 'ext': 'mp4',
365                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
366                 'uploader': 'Gameloft',
367                 'uploader_id': 'gameloft',
368                 'upload_date': '20140828',
369                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
370             },
371             'params': {
372                 'skip_download': True,
373             }
374         },
375         # Camtasia studio
376         {
377             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
378             'playlist': [{
379                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
380                 'info_dict': {
381                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
382                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
383                     'ext': 'flv',
384                     'duration': 2235.90,
385                 }
386             }, {
387                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
388                 'info_dict': {
389                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
390                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
391                     'ext': 'flv',
392                     'duration': 2235.93,
393                 }
394             }],
395             'info_dict': {
396                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
397             }
398         },
399         # Flowplayer
400         {
401             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
402             'md5': '9d65602bf31c6e20014319c7d07fba27',
403             'info_dict': {
404                 'id': '5123ea6d5e5a7',
405                 'ext': 'mp4',
406                 'age_limit': 18,
407                 'uploader': 'www.handjobhub.com',
408                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
409             }
410         },
411         # RSS feed
412         {
413             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
414             'info_dict': {
415                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
416                 'title': 'Zero Punctuation',
417                 'description': 're:.*groundbreaking video review series.*'
418             },
419             'playlist_mincount': 11,
420         },
421         # Multiple brightcove videos
422         # https://github.com/rg3/youtube-dl/issues/2283
423         {
424             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
425             'info_dict': {
426                 'id': 'always-never',
427                 'title': 'Always / Never - The New Yorker',
428             },
429             'playlist_count': 3,
430             'params': {
431                 'extract_flat': False,
432                 'skip_download': True,
433             }
434         },
435         # MLB embed
436         {
437             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
438             'md5': '96f09a37e44da40dd083e12d9a683327',
439             'info_dict': {
440                 'id': '33322633',
441                 'ext': 'mp4',
442                 'title': 'Ump changes call to ball',
443                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
444                 'duration': 48,
445                 'timestamp': 1401537900,
446                 'upload_date': '20140531',
447                 'thumbnail': 're:^https?://.*\.jpg$',
448             },
449         },
450         # Wistia embed
451         {
452             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
453             'md5': '8788b683c777a5cf25621eaf286d0c23',
454             'info_dict': {
455                 'id': '1cfaf6b7ea',
456                 'ext': 'mov',
457                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
458                 'duration': 643.0,
459                 'filesize': 182808282,
460                 'uploader': 'education-portal.com',
461             },
462         },
463         {
464             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
465             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
466             'info_dict': {
467                 'id': 'uxjb0lwrcz',
468                 'ext': 'mp4',
469                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
470                 'duration': 1715.0,
471                 'uploader': 'thoughtworks.wistia.com',
472             },
473         },
474         # Direct download with broken HEAD
475         {
476             'url': 'http://ai-radio.org:8000/radio.opus',
477             'info_dict': {
478                 'id': 'radio',
479                 'ext': 'opus',
480                 'title': 'radio',
481             },
482             'params': {
483                 'skip_download': True,  # infinite live stream
484             },
485             'expected_warnings': [
486                 r'501.*Not Implemented'
487             ],
488         },
489         # Soundcloud embed
490         {
491             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
492             'info_dict': {
493                 'id': '174391317',
494                 'ext': 'mp3',
495                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
496                 'uploader': 'Sophos Security',
497                 'title': 'Chet Chat 171 - Oct 29, 2014',
498                 'upload_date': '20141029',
499             }
500         },
501         # Livestream embed
502         {
503             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
504             'info_dict': {
505                 'id': '67864563',
506                 'ext': 'flv',
507                 'upload_date': '20141112',
508                 'title': 'Rosetta #CometLanding webcast HL 10',
509             }
510         },
511         # LazyYT
512         {
513             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
514             'info_dict': {
515                 'id': '1986',
516                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
517             },
518             'playlist_mincount': 2,
519         },
520         # Direct link with incorrect MIME type
521         {
522             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
523             'md5': '4ccbebe5f36706d85221f204d7eb5913',
524             'info_dict': {
525                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
526                 'id': '5_Lennart_Poettering_-_Systemd',
527                 'ext': 'webm',
528                 'title': '5_Lennart_Poettering_-_Systemd',
529                 'upload_date': '20141120',
530             },
531             'expected_warnings': [
532                 'URL could be a direct video link, returning it as such.'
533             ]
534         },
535         # Cinchcast embed
536         {
537             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
538             'info_dict': {
539                 'id': '7141703',
540                 'ext': 'mp3',
541                 'upload_date': '20141126',
542                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
543             }
544         },
545         # Cinerama player
546         {
547             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
548             'info_dict': {
549                 'id': '730m_DandD_1901_512k',
550                 'ext': 'mp4',
551                 'uploader': 'www.abc.net.au',
552                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
553             }
554         },
555         # embedded viddler video
556         {
557             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
558             'info_dict': {
559                 'id': '4d03aad9',
560                 'ext': 'mp4',
561                 'uploader': 'deadspin',
562                 'title': 'WALL-TO-GORTAT',
563                 'timestamp': 1422285291,
564                 'upload_date': '20150126',
565             },
566             'add_ie': ['Viddler'],
567         },
568         # Libsyn embed
569         {
570             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
571             'info_dict': {
572                 'id': '3377616',
573                 'ext': 'mp3',
574                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
575                 'description': 'md5:601cb790edd05908957dae8aaa866465',
576                 'upload_date': '20150220',
577             },
578         },
579         # jwplayer YouTube
580         {
581             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
582             'info_dict': {
583                 'id': 'Mrj4DVp2zeA',
584                 'ext': 'mp4',
585                 'upload_date': '20150212',
586                 'uploader': 'The National Archives UK',
587                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
588                 'uploader_id': 'NationalArchives08',
589                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
590             },
591         },
592         # rtl.nl embed
593         {
594             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
595             'playlist_mincount': 5,
596             'info_dict': {
597                 'id': 'aanslagen-kopenhagen',
598                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
599             }
600         },
601         # Zapiks embed
602         {
603             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
604             'info_dict': {
605                 'id': '118046',
606                 'ext': 'mp4',
607                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
608             }
609         },
610         # Kaltura embed
611         {
612             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
613             'info_dict': {
614                 'id': '1_eergr3h1',
615                 'ext': 'mp4',
616                 'upload_date': '20150226',
617                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
618                 'timestamp': int,
619                 'title': 'John Carlson Postgame 2/25/15',
620             },
621         },
622         # Eagle.Platform embed (generic URL)
623         {
624             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
625             'info_dict': {
626                 'id': '227304',
627                 'ext': 'mp4',
628                 'title': 'Навальный вышел на свободу',
629                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
630                 'thumbnail': 're:^https?://.*\.jpg$',
631                 'duration': 87,
632                 'view_count': int,
633                 'age_limit': 0,
634             },
635         },
636         # ClipYou (Eagle.Platform) embed (custom URL)
637         {
638             'url': 'http://muz-tv.ru/play/7129/',
639             'info_dict': {
640                 'id': '12820',
641                 'ext': 'mp4',
642                 'title': "'O Sole Mio",
643                 'thumbnail': 're:^https?://.*\.jpg$',
644                 'duration': 216,
645                 'view_count': int,
646             },
647         },
648         # Pladform embed
649         {
650             'url': 'http://muz-tv.ru/kinozal/view/7400/',
651             'info_dict': {
652                 'id': '100183293',
653                 'ext': 'mp4',
654                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
655                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
656                 'thumbnail': 're:^https?://.*\.jpg$',
657                 'duration': 694,
658                 'age_limit': 0,
659             },
660         },
661         # Playwire embed
662         {
663             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
664             'info_dict': {
665                 'id': '3519514',
666                 'ext': 'mp4',
667                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
668                 'thumbnail': 're:^https?://.*\.png$',
669                 'duration': 45.115,
670             },
671         },
672         # 5min embed
673         {
674             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
675             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
676             'info_dict': {
677                 'id': '518726732',
678                 'ext': 'mp4',
679                 'title': 'Facebook Creates "On This Day" | Crunch Report',
680             },
681         },
682         # SVT embed
683         {
684             'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
685             'info_dict': {
686                 'id': '2900353',
687                 'ext': 'flv',
688                 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
689                 'duration': 27,
690                 'age_limit': 0,
691             },
692         },
693         # RSS feed with enclosure
694         {
695             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
696             'info_dict': {
697                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
698                 'ext': 'm4v',
699                 'upload_date': '20150228',
700                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
701             }
702         },
703         # Crooks and Liars embed
704         {
705             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
706             'info_dict': {
707                 'id': '8RUoRhRi',
708                 'ext': 'mp4',
709                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
710                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
711                 'timestamp': 1428207000,
712                 'upload_date': '20150405',
713                 'uploader': 'Heather',
714             },
715         },
716         # Crooks and Liars external embed
717         {
718             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
719             'info_dict': {
720                 'id': 'MTE3MjUtMzQ2MzA',
721                 'ext': 'mp4',
722                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
723                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
724                 'timestamp': 1265032391,
725                 'upload_date': '20100201',
726                 'uploader': 'Heather',
727             },
728         },
729         # NBC Sports vplayer embed
730         {
731             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
732             'info_dict': {
733                 'id': 'ln7x1qSThw4k',
734                 'ext': 'flv',
735                 'title': "PFT Live: New leader in the 'new-look' defense",
736                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
737             },
738         },
739         # UDN embed
740         {
741             'url': 'http://www.udn.com/news/story/7314/822787',
742             'md5': 'fd2060e988c326991037b9aff9df21a6',
743             'info_dict': {
744                 'id': '300346',
745                 'ext': 'mp4',
746                 'title': '中一中男師變性 全校師生力挺',
747                 'thumbnail': 're:^https?://.*\.jpg$',
748             }
749         },
750         # Ooyala embed
751         {
752             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
753             'info_dict': {
754                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
755                 'ext': 'mp4',
756                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
757                 'title': 'This is what separates the Excel masters from the wannabes',
758             },
759             'params': {
760                 # m3u8 downloads
761                 'skip_download': True,
762             }
763         },
764         # Contains a SMIL manifest
765         {
766             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
767             'info_dict': {
768                 'id': 'file',
769                 'ext': 'flv',
770                 'title': '+ Football: Lottery Champions League Europe',
771                 'uploader': 'www.telewebion.com',
772             },
773             'params': {
774                 # rtmpe downloads
775                 'skip_download': True,
776             }
777         }
778     ]
779
780     def report_following_redirect(self, new_url):
781         """Report information extraction."""
782         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
783
784     def _extract_rss(self, url, video_id, doc):
785         playlist_title = doc.find('./channel/title').text
786         playlist_desc_el = doc.find('./channel/description')
787         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
788
789         entries = []
790         for it in doc.findall('./channel/item'):
791             next_url = xpath_text(it, 'link', fatal=False)
792             if not next_url:
793                 enclosure_nodes = it.findall('./enclosure')
794                 for e in enclosure_nodes:
795                     next_url = e.attrib.get('url')
796                     if next_url:
797                         break
798
799             if not next_url:
800                 continue
801
802             entries.append({
803                 '_type': 'url',
804                 'url': next_url,
805                 'title': it.find('title').text,
806             })
807
808         return {
809             '_type': 'playlist',
810             'id': url,
811             'title': playlist_title,
812             'description': playlist_desc,
813             'entries': entries,
814         }
815
816     def _extract_camtasia(self, url, video_id, webpage):
817         """ Returns None if no camtasia video can be found. """
818
819         camtasia_cfg = self._search_regex(
820             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
821             webpage, 'camtasia configuration file', default=None)
822         if camtasia_cfg is None:
823             return None
824
825         title = self._html_search_meta('DC.title', webpage, fatal=True)
826
827         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
828         camtasia_cfg = self._download_xml(
829             camtasia_url, video_id,
830             note='Downloading camtasia configuration',
831             errnote='Failed to download camtasia configuration')
832         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
833
834         entries = []
835         for n in fileset_node.getchildren():
836             url_n = n.find('./uri')
837             if url_n is None:
838                 continue
839
840             entries.append({
841                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
842                 'title': '%s - %s' % (title, n.tag),
843                 'url': compat_urlparse.urljoin(url, url_n.text),
844                 'duration': float_or_none(n.find('./duration').text),
845             })
846
847         return {
848             '_type': 'playlist',
849             'entries': entries,
850             'title': title,
851         }
852
853     def _real_extract(self, url):
854         if url.startswith('//'):
855             return {
856                 '_type': 'url',
857                 'url': self.http_scheme() + url,
858             }
859
860         parsed_url = compat_urlparse.urlparse(url)
861         if not parsed_url.scheme:
862             default_search = self._downloader.params.get('default_search')
863             if default_search is None:
864                 default_search = 'fixup_error'
865
866             if default_search in ('auto', 'auto_warning', 'fixup_error'):
867                 if '/' in url:
868                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
869                     return self.url_result('http://' + url)
870                 elif default_search != 'fixup_error':
871                     if default_search == 'auto_warning':
872                         if re.match(r'^(?:url|URL)$', url):
873                             raise ExtractorError(
874                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
875                                 expected=True)
876                         else:
877                             self._downloader.report_warning(
878                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
879                     return self.url_result('ytsearch:' + url)
880
881             if default_search in ('error', 'fixup_error'):
882                 raise ExtractorError(
883                     '%r is not a valid URL. '
884                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
885                     % (url, url), expected=True)
886             else:
887                 if ':' not in default_search:
888                     default_search += ':'
889                 return self.url_result(default_search + url)
890
891         url, smuggled_data = unsmuggle_url(url)
892         force_videoid = None
893         is_intentional = smuggled_data and smuggled_data.get('to_generic')
894         if smuggled_data and 'force_videoid' in smuggled_data:
895             force_videoid = smuggled_data['force_videoid']
896             video_id = force_videoid
897         else:
898             video_id = os.path.splitext(compat_urllib_parse.unquote(url.rstrip('/').split('/')[-1]))[0]
899
900         self.to_screen('%s: Requesting header' % video_id)
901
902         head_req = HEADRequest(url)
903         head_response = self._request_webpage(
904             head_req, video_id,
905             note=False, errnote='Could not send HEAD request to %s' % url,
906             fatal=False)
907
908         if head_response is not False:
909             # Check for redirect
910             new_url = head_response.geturl()
911             if url != new_url:
912                 self.report_following_redirect(new_url)
913                 if force_videoid:
914                     new_url = smuggle_url(
915                         new_url, {'force_videoid': force_videoid})
916                 return self.url_result(new_url)
917
918         full_response = None
919         if head_response is False:
920             request = compat_urllib_request.Request(url)
921             request.add_header('Accept-Encoding', '*')
922             full_response = self._request_webpage(request, video_id)
923             head_response = full_response
924
925         # Check for direct link to a video
926         content_type = head_response.headers.get('Content-Type', '')
927         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
928         if m:
929             upload_date = unified_strdate(
930                 head_response.headers.get('Last-Modified'))
931             return {
932                 'id': video_id,
933                 'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0],
934                 'direct': True,
935                 'formats': [{
936                     'format_id': m.group('format_id'),
937                     'url': url,
938                     'vcodec': 'none' if m.group('type') == 'audio' else None
939                 }],
940                 'upload_date': upload_date,
941             }
942
943         if not self._downloader.params.get('test', False) and not is_intentional:
944             self._downloader.report_warning('Falling back on generic information extractor.')
945
946         if not full_response:
947             request = compat_urllib_request.Request(url)
948             # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
949             # making it impossible to download only chunk of the file (yet we need only 512kB to
950             # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
951             # that will always result in downloading the whole file that is not desirable.
952             # Therefore for extraction pass we have to override Accept-Encoding to any in order
953             # to accept raw bytes and being able to download only a chunk.
954             # It may probably better to solve this by checking Content-Type for application/octet-stream
955             # after HEAD request finishes, but not sure if we can rely on this.
956             request.add_header('Accept-Encoding', '*')
957             full_response = self._request_webpage(request, video_id)
958
959         # Maybe it's a direct link to a video?
960         # Be careful not to download the whole thing!
961         first_bytes = full_response.read(512)
962         if not is_html(first_bytes):
963             self._downloader.report_warning(
964                 'URL could be a direct video link, returning it as such.')
965             upload_date = unified_strdate(
966                 head_response.headers.get('Last-Modified'))
967             return {
968                 'id': video_id,
969                 'title': os.path.splitext(compat_urllib_parse.unquote(url_basename(url)))[0],
970                 'direct': True,
971                 'url': url,
972                 'upload_date': upload_date,
973             }
974
975         webpage = self._webpage_read_content(
976             full_response, url, video_id, prefix=first_bytes)
977
978         self.report_extraction(video_id)
979
980         # Is it an RSS feed?
981         try:
982             doc = parse_xml(webpage)
983             if doc.tag == 'rss':
984                 return self._extract_rss(url, video_id, doc)
985         except compat_xml_parse_error:
986             pass
987
988         # Is it a Camtasia project?
989         camtasia_res = self._extract_camtasia(url, video_id, webpage)
990         if camtasia_res is not None:
991             return camtasia_res
992
993         # Sometimes embedded video player is hidden behind percent encoding
994         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
995         # Unescaping the whole page allows to handle those cases in a generic way
996         webpage = compat_urllib_parse.unquote(webpage)
997
998         # it's tempting to parse this further, but you would
999         # have to take into account all the variations like
1000         #   Video Title - Site Name
1001         #   Site Name | Video Title
1002         #   Video Title - Tagline | Site Name
1003         # and so on and so forth; it's just not practical
1004         video_title = self._html_search_regex(
1005             r'(?s)<title>(.*?)</title>', webpage, 'video title',
1006             default='video')
1007
1008         # Try to detect age limit automatically
1009         age_limit = self._rta_search(webpage)
1010         # And then there are the jokers who advertise that they use RTA,
1011         # but actually don't.
1012         AGE_LIMIT_MARKERS = [
1013             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1014         ]
1015         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1016             age_limit = 18
1017
1018         # video uploader is domain name
1019         video_uploader = self._search_regex(
1020             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1021
1022         # Helper method
1023         def _playlist_from_matches(matches, getter=None, ie=None):
1024             urlrs = orderedSet(
1025                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1026                 for m in matches)
1027             return self.playlist_result(
1028                 urlrs, playlist_id=video_id, playlist_title=video_title)
1029
1030         # Look for BrightCove:
1031         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1032         if bc_urls:
1033             self.to_screen('Brightcove video detected.')
1034             entries = [{
1035                 '_type': 'url',
1036                 'url': smuggle_url(bc_url, {'Referer': url}),
1037                 'ie_key': 'Brightcove'
1038             } for bc_url in bc_urls]
1039
1040             return {
1041                 '_type': 'playlist',
1042                 'title': video_title,
1043                 'id': video_id,
1044                 'entries': entries,
1045             }
1046
1047         # Look for embedded rtl.nl player
1048         matches = re.findall(
1049             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
1050             webpage)
1051         if matches:
1052             return _playlist_from_matches(matches, ie='RtlNl')
1053
1054         # Look for embedded (iframe) Vimeo player
1055         mobj = re.search(
1056             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1057         if mobj:
1058             player_url = unescapeHTML(mobj.group('url'))
1059             surl = smuggle_url(player_url, {'Referer': url})
1060             return self.url_result(surl)
1061         # Look for embedded (swf embed) Vimeo player
1062         mobj = re.search(
1063             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1064         if mobj:
1065             return self.url_result(mobj.group(1))
1066
1067         # Look for embedded YouTube player
1068         matches = re.findall(r'''(?x)
1069             (?:
1070                 <iframe[^>]+?src=|
1071                 data-video-url=|
1072                 <embed[^>]+?src=|
1073                 embedSWF\(?:\s*|
1074                 new\s+SWFObject\(
1075             )
1076             (["\'])
1077                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1078                 (?:embed|v|p)/.+?)
1079             \1''', webpage)
1080         if matches:
1081             return _playlist_from_matches(
1082                 matches, lambda m: unescapeHTML(m[1]))
1083
1084         # Look for lazyYT YouTube embed
1085         matches = re.findall(
1086             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1087         if matches:
1088             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1089
1090         # Look for embedded Dailymotion player
1091         matches = re.findall(
1092             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1093         if matches:
1094             return _playlist_from_matches(
1095                 matches, lambda m: unescapeHTML(m[1]))
1096
1097         # Look for embedded Dailymotion playlist player (#3822)
1098         m = re.search(
1099             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1100         if m:
1101             playlists = re.findall(
1102                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1103             if playlists:
1104                 return _playlist_from_matches(
1105                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1106
1107         # Look for embedded Wistia player
1108         match = re.search(
1109             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1110         if match:
1111             embed_url = self._proto_relative_url(
1112                 unescapeHTML(match.group('url')))
1113             return {
1114                 '_type': 'url_transparent',
1115                 'url': embed_url,
1116                 'ie_key': 'Wistia',
1117                 'uploader': video_uploader,
1118                 'title': video_title,
1119                 'id': video_id,
1120             }
1121
1122         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1123         if match:
1124             return {
1125                 '_type': 'url_transparent',
1126                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1127                 'ie_key': 'Wistia',
1128                 'uploader': video_uploader,
1129                 'title': video_title,
1130                 'id': match.group('id')
1131             }
1132
1133         # Look for embedded blip.tv player
1134         bliptv_url = BlipTVIE._extract_url(webpage)
1135         if bliptv_url:
1136             return self.url_result(bliptv_url, 'BlipTV')
1137
1138         # Look for SVT player
1139         svt_url = SVTIE._extract_url(webpage)
1140         if svt_url:
1141             return self.url_result(svt_url, 'SVT')
1142
1143         # Look for embedded condenast player
1144         matches = re.findall(
1145             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1146             webpage)
1147         if matches:
1148             return {
1149                 '_type': 'playlist',
1150                 'entries': [{
1151                     '_type': 'url',
1152                     'ie_key': 'CondeNast',
1153                     'url': ma,
1154                 } for ma in matches],
1155                 'title': video_title,
1156                 'id': video_id,
1157             }
1158
1159         # Look for Bandcamp pages with custom domain
1160         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1161         if mobj is not None:
1162             burl = unescapeHTML(mobj.group(1))
1163             # Don't set the extractor because it can be a track url or an album
1164             return self.url_result(burl)
1165
1166         # Look for embedded Vevo player
1167         mobj = re.search(
1168             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1169         if mobj is not None:
1170             return self.url_result(mobj.group('url'))
1171
1172         # Look for embedded Viddler player
1173         mobj = re.search(
1174             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1175             webpage)
1176         if mobj is not None:
1177             return self.url_result(mobj.group('url'))
1178
1179         # Look for NYTimes player
1180         mobj = re.search(
1181             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1182             webpage)
1183         if mobj is not None:
1184             return self.url_result(mobj.group('url'))
1185
1186         # Look for Libsyn player
1187         mobj = re.search(
1188             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1189         if mobj is not None:
1190             return self.url_result(mobj.group('url'))
1191
1192         # Look for Ooyala videos
1193         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1194                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1195                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1196                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1197         if mobj is not None:
1198             return OoyalaIE._build_url_result(mobj.group('ec'))
1199
1200         # Look for multiple Ooyala embeds on SBN network websites
1201         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1202         if mobj is not None:
1203             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1204             if embeds:
1205                 return _playlist_from_matches(
1206                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1207
1208         # Look for Aparat videos
1209         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1210         if mobj is not None:
1211             return self.url_result(mobj.group(1), 'Aparat')
1212
1213         # Look for MPORA videos
1214         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1215         if mobj is not None:
1216             return self.url_result(mobj.group(1), 'Mpora')
1217
1218         # Look for embedded NovaMov-based player
1219         mobj = re.search(
1220             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1221                     (?P<url>http://(?:(?:embed|www)\.)?
1222                         (?:novamov\.com|
1223                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1224                            videoweed\.(?:es|com)|
1225                            movshare\.(?:net|sx|ag)|
1226                            divxstage\.(?:eu|net|ch|co|at|ag))
1227                         /embed\.php.+?)\1''', webpage)
1228         if mobj is not None:
1229             return self.url_result(mobj.group('url'))
1230
1231         # Look for embedded Facebook player
1232         mobj = re.search(
1233             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1234         if mobj is not None:
1235             return self.url_result(mobj.group('url'), 'Facebook')
1236
1237         # Look for embedded VK player
1238         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1239         if mobj is not None:
1240             return self.url_result(mobj.group('url'), 'VK')
1241
1242         # Look for embedded ivi player
1243         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1244         if mobj is not None:
1245             return self.url_result(mobj.group('url'), 'Ivi')
1246
1247         # Look for embedded Huffington Post player
1248         mobj = re.search(
1249             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1250         if mobj is not None:
1251             return self.url_result(mobj.group('url'), 'HuffPost')
1252
1253         # Look for embed.ly
1254         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1255         if mobj is not None:
1256             return self.url_result(mobj.group('url'))
1257         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1258         if mobj is not None:
1259             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1260
1261         # Look for funnyordie embed
1262         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1263         if matches:
1264             return _playlist_from_matches(
1265                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1266
1267         # Look for BBC iPlayer embed
1268         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1269         if matches:
1270             return _playlist_from_matches(matches, ie='BBCCoUk')
1271
1272         # Look for embedded RUTV player
1273         rutv_url = RUTVIE._extract_url(webpage)
1274         if rutv_url:
1275             return self.url_result(rutv_url, 'RUTV')
1276
1277         # Look for embedded SportBox player
1278         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1279         if sportbox_urls:
1280             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1281
1282         # Look for embedded TED player
1283         mobj = re.search(
1284             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1285         if mobj is not None:
1286             return self.url_result(mobj.group('url'), 'TED')
1287
1288         # Look for embedded Ustream videos
1289         mobj = re.search(
1290             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1291         if mobj is not None:
1292             return self.url_result(mobj.group('url'), 'Ustream')
1293
1294         # Look for embedded arte.tv player
1295         mobj = re.search(
1296             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1297             webpage)
1298         if mobj is not None:
1299             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1300
1301         # Look for embedded smotri.com player
1302         smotri_url = SmotriIE._extract_url(webpage)
1303         if smotri_url:
1304             return self.url_result(smotri_url, 'Smotri')
1305
1306         # Look for embeded soundcloud player
1307         mobj = re.search(
1308             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1309             webpage)
1310         if mobj is not None:
1311             url = unescapeHTML(mobj.group('url'))
1312             return self.url_result(url)
1313
1314         # Look for embedded vulture.com player
1315         mobj = re.search(
1316             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1317             webpage)
1318         if mobj is not None:
1319             url = unescapeHTML(mobj.group('url'))
1320             return self.url_result(url, ie='Vulture')
1321
1322         # Look for embedded mtvservices player
1323         mobj = re.search(
1324             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1325             webpage)
1326         if mobj is not None:
1327             url = unescapeHTML(mobj.group('url'))
1328             return self.url_result(url, ie='MTVServicesEmbedded')
1329
1330         # Look for embedded yahoo player
1331         mobj = re.search(
1332             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1333             webpage)
1334         if mobj is not None:
1335             return self.url_result(mobj.group('url'), 'Yahoo')
1336
1337         # Look for embedded sbs.com.au player
1338         mobj = re.search(
1339             r'''(?x)
1340             (?:
1341                 <meta\s+property="og:video"\s+content=|
1342                 <iframe[^>]+?src=
1343             )
1344             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1345             webpage)
1346         if mobj is not None:
1347             return self.url_result(mobj.group('url'), 'SBS')
1348
1349         # Look for embedded Cinchcast player
1350         mobj = re.search(
1351             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1352             webpage)
1353         if mobj is not None:
1354             return self.url_result(mobj.group('url'), 'Cinchcast')
1355
1356         mobj = re.search(
1357             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1358             webpage)
1359         if not mobj:
1360             mobj = re.search(
1361                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1362                 webpage)
1363         if mobj is not None:
1364             return self.url_result(mobj.group('url'), 'MLB')
1365
1366         mobj = re.search(
1367             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1368             webpage)
1369         if mobj is not None:
1370             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1371
1372         mobj = re.search(
1373             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1374             webpage)
1375         if mobj is not None:
1376             return self.url_result(mobj.group('url'), 'Livestream')
1377
1378         # Look for Zapiks embed
1379         mobj = re.search(
1380             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1381         if mobj is not None:
1382             return self.url_result(mobj.group('url'), 'Zapiks')
1383
1384         # Look for Kaltura embeds
1385         mobj = re.search(
1386             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1387         if mobj is not None:
1388             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1389
1390         # Look for Eagle.Platform embeds
1391         mobj = re.search(
1392             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1393         if mobj is not None:
1394             return self.url_result(mobj.group('url'), 'EaglePlatform')
1395
1396         # Look for ClipYou (uses Eagle.Platform) embeds
1397         mobj = re.search(
1398             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1399         if mobj is not None:
1400             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1401
1402         # Look for Pladform embeds
1403         mobj = re.search(
1404             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1405         if mobj is not None:
1406             return self.url_result(mobj.group('url'), 'Pladform')
1407
1408         # Look for Playwire embeds
1409         mobj = re.search(
1410             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1411         if mobj is not None:
1412             return self.url_result(mobj.group('url'))
1413
1414         # Look for 5min embeds
1415         mobj = re.search(
1416             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1417         if mobj is not None:
1418             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1419
1420         # Look for Crooks and Liars embeds
1421         mobj = re.search(
1422             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1423         if mobj is not None:
1424             return self.url_result(mobj.group('url'))
1425
1426         # Look for NBC Sports VPlayer embeds
1427         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1428         if nbc_sports_url:
1429             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1430
1431         # Look for UDN embeds
1432         mobj = re.search(
1433             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1434         if mobj is not None:
1435             return self.url_result(
1436                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1437
1438         # Look for Senate ISVP iframe
1439         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1440         if senate_isvp_url:
1441             return self.url_result(senate_isvp_url, 'SenateISVP')
1442
def check_video(vurl):
    """Tell whether vurl should be kept as a candidate video URL.

    A URL that YoutubeIE declares suitable is always kept. Any other URL
    is kept only if its path component contains a dot and the extension
    determined from that path is not in the rejected list below
    (Flash, image and subtitle extensions).
    """
    if not YoutubeIE.suitable(vurl):
        path = compat_urlparse.urlparse(vurl).path
        extension = determine_ext(path)
        # A path without any dot carries no extension to judge by.
        if '.' not in path:
            return False
        rejected_exts = (
            'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
        return extension not in rejected_exts
    return True
1449
def filter_video(urls):
    """Return a list of the items of urls that check_video accepts, in order."""
    return [candidate for candidate in urls if check_video(candidate)]
1452
1453         # Start with something easy: JW Player in SWFObject
1454         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1455         if not found:
1456             # Look for gorilla-vid style embedding
1457             found = filter_video(re.findall(r'''(?sx)
1458                 (?:
1459                     jw_plugins|
1460                     JWPlayerOptions|
1461                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1462                 )
1463                 .*?
1464                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1465         if not found:
1466             # Broaden the search a little bit
1467             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1468         if not found:
1469             # Broaden the findall a little bit: JWPlayer JS loader
1470             found = filter_video(re.findall(
1471                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1472         if not found:
1473             # Flow player
1474             found = filter_video(re.findall(r'''(?xs)
1475                 flowplayer\("[^"]+",\s*
1476                     \{[^}]+?\}\s*,
1477                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1478                         ["']?url["']?\s*:\s*["']([^"']+)["']
1479             ''', webpage))
1480         if not found:
1481             # Cinerama player
1482             found = re.findall(
1483                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1484         if not found:
1485             # Try to find twitter cards info
1486             found = filter_video(re.findall(
1487                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1488         if not found:
1489             # We look for Open Graph info:
1490             # We have to match any number of spaces between elements, since some sites try to align them (e.g. statigr.am)
1491             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1492             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1493             if m_video_type is not None:
1494                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1495         if not found:
1496             # HTML5 video
1497             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1498         if not found:
1499             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1500             found = re.search(
1501                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1502                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1503                 webpage)
1504             if not found:
1505                 # Look also in Refresh HTTP header
1506                 refresh_header = head_response.headers.get('Refresh')
1507                 if refresh_header:
1508                     found = re.search(REDIRECT_REGEX, refresh_header)
1509             if found:
1510                 new_url = compat_urlparse.urljoin(url, found.group(1))
1511                 self.report_following_redirect(new_url)
1512                 return {
1513                     '_type': 'url',
1514                     'url': new_url,
1515                 }
1516         if not found:
1517             raise UnsupportedError(url)
1518
1519         entries = []
1520         for video_url in found:
1521             video_url = compat_urlparse.urljoin(url, video_url)
1522             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1523
1524             # Sometimes, jwplayer extraction will result in a YouTube URL
1525             if YoutubeIE.suitable(video_url):
1526                 entries.append(self.url_result(video_url, 'Youtube'))
1527                 continue
1528
1529             # here's a fun little line of code for you:
1530             video_id = os.path.splitext(video_id)[0]
1531
1532             if determine_ext(video_url) == 'smil':
1533                 entries.append({
1534                     'id': video_id,
1535                     'formats': self._extract_smil_formats(video_url, video_id),
1536                     'uploader': video_uploader,
1537                     'title': video_title,
1538                     'age_limit': age_limit,
1539                 })
1540             else:
1541                 entries.append({
1542                     'id': video_id,
1543                     'url': video_url,
1544                     'uploader': video_uploader,
1545                     'title': video_title,
1546                     'age_limit': age_limit,
1547                 })
1548
1549         if len(entries) == 1:
1550             return entries[0]
1551         else:
1552             for num, e in enumerate(entries, start=1):
1553                 # 'url' results don't have a title
1554                 if e.get('title') is not None:
1555                     e['title'] = '%s (%d)' % (e['title'], num)
1556             return {
1557                 '_type': 'playlist',
1558                 'entries': entries,
1559             }