[extractor/generic] Fix following incomplete redirects (#5640)
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urlparse,
13     compat_xml_parse_error,
14 )
15 from ..utils import (
16     determine_ext,
17     ExtractorError,
18     float_or_none,
19     HEADRequest,
20     is_html,
21     orderedSet,
22     parse_xml,
23     smuggle_url,
24     unescapeHTML,
25     unified_strdate,
26     unsmuggle_url,
27     UnsupportedError,
28     url_basename,
29     xpath_text,
30 )
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
37 from .udn import UDNEmbedIE
38 from .senateisvp import SenateISVPIE
39 from .bliptv import BlipTVIE
40
41
42 class GenericIE(InfoExtractor):
43     IE_DESC = 'Generic downloader that works on some sites'
44     _VALID_URL = r'.*'
45     IE_NAME = 'generic'
46     _TESTS = [
47         {
48             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
49             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
50             'info_dict': {
51                 'id': '13601338388002',
52                 'ext': 'mp4',
53                 'uploader': 'www.hodiho.fr',
54                 'title': 'R\u00e9gis plante sa Jeep',
55             }
56         },
57         # bandcamp page with custom domain
58         {
59             'add_ie': ['Bandcamp'],
60             'url': 'http://bronyrock.com/track/the-pony-mash',
61             'info_dict': {
62                 'id': '3235767654',
63                 'ext': 'mp3',
64                 'title': 'The Pony Mash',
65                 'uploader': 'M_Pallante',
66             },
67             'skip': 'There is a limit of 200 free downloads / month for the test song',
68         },
69         # embedded brightcove video
70         # it also tests brightcove videos that need to set the 'Referer' in the
71         # http requests
72         {
73             'add_ie': ['Brightcove'],
74             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
75             'info_dict': {
76                 'id': '2765128793001',
77                 'ext': 'mp4',
78                 'title': 'Le cours de bourse : l’analyse technique',
79                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
80                 'uploader': 'BFM BUSINESS',
81             },
82             'params': {
83                 'skip_download': True,
84             },
85         },
86         {
87             # https://github.com/rg3/youtube-dl/issues/2253
88             'url': 'http://bcove.me/i6nfkrc3',
89             'md5': '0ba9446db037002366bab3b3eb30c88c',
90             'info_dict': {
91                 'id': '3101154703001',
92                 'ext': 'mp4',
93                 'title': 'Still no power',
94                 'uploader': 'thestar.com',
95                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
96             },
97             'add_ie': ['Brightcove'],
98         },
99         {
100             'url': 'http://www.championat.com/video/football/v/87/87499.html',
101             'md5': 'fb973ecf6e4a78a67453647444222983',
102             'info_dict': {
103                 'id': '3414141473001',
104                 'ext': 'mp4',
105                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
106                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
107                 'uploader': 'Championat',
108             },
109         },
110         {
111             # https://github.com/rg3/youtube-dl/issues/3541
112             'add_ie': ['Brightcove'],
113             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
114             'info_dict': {
115                 'id': '3866516442001',
116                 'ext': 'mp4',
117                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
118                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
119                 'uploader': 'SBS Broadcasting',
120             },
121             'skip': 'Restricted to Netherlands',
122             'params': {
123                 'skip_download': True,  # m3u8 download
124             },
125         },
126         # Direct link to a video
127         {
128             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
129             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
130             'info_dict': {
131                 'id': 'trailer',
132                 'ext': 'mp4',
133                 'title': 'trailer',
134                 'upload_date': '20100513',
135             }
136         },
137         # ooyala video
138         {
139             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
140             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
141             'info_dict': {
142                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
143                 'ext': 'mp4',
144                 'title': '2cc213299525360.mov',  # that's what we get
145             },
146             'add_ie': ['Ooyala'],
147         },
148         # multiple ooyala embeds on SBN network websites
149         {
150             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
151             'info_dict': {
152                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
153                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
154             },
155             'playlist_mincount': 3,
156             'params': {
157                 'skip_download': True,
158             },
159             'add_ie': ['Ooyala'],
160         },
161         # google redirect
162         {
163             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
164             'info_dict': {
165                 'id': 'cmQHVoWB5FY',
166                 'ext': 'mp4',
167                 'upload_date': '20130224',
168                 'uploader_id': 'TheVerge',
169                 'description': 're:^Chris Ziegler takes a look at the\.*',
170                 'uploader': 'The Verge',
171                 'title': 'First Firefox OS phones side-by-side',
172             },
173             'params': {
174                 'skip_download': False,
175             }
176         },
177         # embed.ly video
178         {
179             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
180             'info_dict': {
181                 'id': '9ODmcdjQcHQ',
182                 'ext': 'mp4',
183                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
184                 'upload_date': '20140225',
185                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
186                 'uploader': 'Tested',
187                 'uploader_id': 'testedcom',
188             },
189             # No need to test YoutubeIE here
190             'params': {
191                 'skip_download': True,
192             },
193         },
194         # funnyordie embed
195         {
196             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
197             'info_dict': {
198                 'id': '18e820ec3f',
199                 'ext': 'mp4',
200                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
201                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
202             },
203         },
204         # BBC iPlayer embeds
205         {
206             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
207             'info_dict': {
208                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
209             },
210             'playlist_mincount': 18,
211         },
212         # RUTV embed
213         {
214             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
215             'info_dict': {
216                 'id': '776940',
217                 'ext': 'mp4',
218                 'title': 'Охотское море стало целиком российским',
219                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
220             },
221             'params': {
222                 # m3u8 download
223                 'skip_download': True,
224             },
225         },
226         # Embedded TED video
227         {
228             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
229             'md5': '65fdff94098e4a607385a60c5177c638',
230             'info_dict': {
231                 'id': '1969',
232                 'ext': 'mp4',
233                 'title': 'Hidden miracles of the natural world',
234                 'uploader': 'Louie Schwartzberg',
235                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
236             }
237         },
238         # Embedded Ustream video
239         {
240             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
241             'md5': '27b99cdb639c9b12a79bca876a073417',
242             'info_dict': {
243                 'id': '45734260',
244                 'ext': 'flv',
245                 'uploader': 'AU SPA:  The NSA and Privacy',
246                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
247             }
248         },
249         # nowvideo embed hidden behind percent encoding
250         {
251             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
252             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
253             'info_dict': {
254                 'id': '06e53103ca9aa',
255                 'ext': 'flv',
256                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
257                 'description': 'No description',
258             },
259         },
260         # arte embed
261         {
262             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
263             'md5': '7653032cbb25bf6c80d80f217055fa43',
264             'info_dict': {
265                 'id': '048195-004_PLUS7-F',
266                 'ext': 'flv',
267                 'title': 'X:enius',
268                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
269                 'upload_date': '20140320',
270             },
271             'params': {
272                 'skip_download': 'Requires rtmpdump'
273             }
274         },
275         # Condé Nast embed
276         {
277             'url': 'http://www.wired.com/2014/04/honda-asimo/',
278             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
279             'info_dict': {
280                 'id': '53501be369702d3275860000',
281                 'ext': 'mp4',
282                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
283             }
284         },
285         # Dailymotion embed
286         {
287             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
288             'md5': '441aeeb82eb72c422c7f14ec533999cd',
289             'info_dict': {
290                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
291                 'ext': 'mp4',
292                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
293                 'uploader': 'Spi0n',
294             },
295             'add_ie': ['Dailymotion'],
296         },
297         # YouTube embed
298         {
299             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
300             'info_dict': {
301                 'id': 'FXRb4ykk4S0',
302                 'ext': 'mp4',
303                 'title': 'The NBL Auction 2014',
304                 'uploader': 'BADMINTON England',
305                 'uploader_id': 'BADMINTONEvents',
306                 'upload_date': '20140603',
307                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
308             },
309             'add_ie': ['Youtube'],
310             'params': {
311                 'skip_download': True,
312             }
313         },
314         # MTVServices embed
315         {
316             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
317             'md5': '35727f82f58c76d996fc188f9755b0d5',
318             'info_dict': {
319                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
320                 'ext': 'mp4',
321                 'title': 'Review',
322                 'description': 'Mario\'s life in the fast lane has never looked so good.',
323             },
324         },
325         # YouTube embed via <data-embed-url="">
326         {
327             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
328             'info_dict': {
329                 'id': '4vAffPZIT44',
330                 'ext': 'mp4',
331                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
332                 'uploader': 'Gameloft',
333                 'uploader_id': 'gameloft',
334                 'upload_date': '20140828',
335                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
336             },
337             'params': {
338                 'skip_download': True,
339             }
340         },
341         # Camtasia studio
342         {
343             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
344             'playlist': [{
345                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
346                 'info_dict': {
347                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
348                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
349                     'ext': 'flv',
350                     'duration': 2235.90,
351                 }
352             }, {
353                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
354                 'info_dict': {
355                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
356                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
357                     'ext': 'flv',
358                     'duration': 2235.93,
359                 }
360             }],
361             'info_dict': {
362                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
363             }
364         },
365         # Flowplayer
366         {
367             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
368             'md5': '9d65602bf31c6e20014319c7d07fba27',
369             'info_dict': {
370                 'id': '5123ea6d5e5a7',
371                 'ext': 'mp4',
372                 'age_limit': 18,
373                 'uploader': 'www.handjobhub.com',
374                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
375             }
376         },
377         # RSS feed
378         {
379             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
380             'info_dict': {
381                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
382                 'title': 'Zero Punctuation',
383                 'description': 're:.*groundbreaking video review series.*'
384             },
385             'playlist_mincount': 11,
386         },
387         # Multiple brightcove videos
388         # https://github.com/rg3/youtube-dl/issues/2283
389         {
390             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
391             'info_dict': {
392                 'id': 'always-never',
393                 'title': 'Always / Never - The New Yorker',
394             },
395             'playlist_count': 3,
396             'params': {
397                 'extract_flat': False,
398                 'skip_download': True,
399             }
400         },
401         # MLB embed
402         {
403             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
404             'md5': '96f09a37e44da40dd083e12d9a683327',
405             'info_dict': {
406                 'id': '33322633',
407                 'ext': 'mp4',
408                 'title': 'Ump changes call to ball',
409                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
410                 'duration': 48,
411                 'timestamp': 1401537900,
412                 'upload_date': '20140531',
413                 'thumbnail': 're:^https?://.*\.jpg$',
414             },
415         },
416         # MLB articles
417         {
418             'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
419             'md5': 'b190e70141fb9a1552a85426b4da1b5d',
420             'info_dict': {
421                 'id': '75609783',
422                 'ext': 'mp4',
423                 'title': 'Must C: Pillar climbs for catch',
424                 'description': '4/15/15: Blue Jays outfielder Kevin Pillar continues his defensive dominance by climbing the wall in left to rob Tim Beckham of a home run',
425                 'timestamp': 1429124820,
426                 'upload_date': '20150415',
427             }
428         },
429         # Wistia embed
430         {
431             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
432             'md5': '8788b683c777a5cf25621eaf286d0c23',
433             'info_dict': {
434                 'id': '1cfaf6b7ea',
435                 'ext': 'mov',
436                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
437                 'duration': 643.0,
438                 'filesize': 182808282,
439                 'uploader': 'education-portal.com',
440             },
441         },
442         {
443             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
444             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
445             'info_dict': {
446                 'id': 'uxjb0lwrcz',
447                 'ext': 'mp4',
448                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
449                 'duration': 1715.0,
450                 'uploader': 'thoughtworks.wistia.com',
451             },
452         },
453         # Direct download with broken HEAD
454         {
455             'url': 'http://ai-radio.org:8000/radio.opus',
456             'info_dict': {
457                 'id': 'radio',
458                 'ext': 'opus',
459                 'title': 'radio',
460             },
461             'params': {
462                 'skip_download': True,  # infinite live stream
463             },
464             'expected_warnings': [
465                 r'501.*Not Implemented'
466             ],
467         },
468         # Soundcloud embed
469         {
470             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
471             'info_dict': {
472                 'id': '174391317',
473                 'ext': 'mp3',
474                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
475                 'uploader': 'Sophos Security',
476                 'title': 'Chet Chat 171 - Oct 29, 2014',
477                 'upload_date': '20141029',
478             }
479         },
480         # Livestream embed
481         {
482             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
483             'info_dict': {
484                 'id': '67864563',
485                 'ext': 'flv',
486                 'upload_date': '20141112',
487                 'title': 'Rosetta #CometLanding webcast HL 10',
488             }
489         },
490         # LazyYT
491         {
492             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
493             'info_dict': {
494                 'id': '1986',
495                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
496             },
497             'playlist_mincount': 2,
498         },
499         # Direct link with incorrect MIME type
500         {
501             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
502             'md5': '4ccbebe5f36706d85221f204d7eb5913',
503             'info_dict': {
504                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
505                 'id': '5_Lennart_Poettering_-_Systemd',
506                 'ext': 'webm',
507                 'title': '5_Lennart_Poettering_-_Systemd',
508                 'upload_date': '20141120',
509             },
510             'expected_warnings': [
511                 'URL could be a direct video link, returning it as such.'
512             ]
513         },
514         # Cinchcast embed
515         {
516             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
517             'info_dict': {
518                 'id': '7141703',
519                 'ext': 'mp3',
520                 'upload_date': '20141126',
521                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
522             }
523         },
524         # Cinerama player
525         {
526             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
527             'info_dict': {
528                 'id': '730m_DandD_1901_512k',
529                 'ext': 'mp4',
530                 'uploader': 'www.abc.net.au',
531                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
532             }
533         },
534         # embedded viddler video
535         {
536             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
537             'info_dict': {
538                 'id': '4d03aad9',
539                 'ext': 'mp4',
540                 'uploader': 'deadspin',
541                 'title': 'WALL-TO-GORTAT',
542                 'timestamp': 1422285291,
543                 'upload_date': '20150126',
544             },
545             'add_ie': ['Viddler'],
546         },
547         # Libsyn embed
548         {
549             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
550             'info_dict': {
551                 'id': '3377616',
552                 'ext': 'mp3',
553                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
554                 'description': 'md5:601cb790edd05908957dae8aaa866465',
555                 'upload_date': '20150220',
556             },
557         },
558         # jwplayer YouTube
559         {
560             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
561             'info_dict': {
562                 'id': 'Mrj4DVp2zeA',
563                 'ext': 'mp4',
564                 'upload_date': '20150212',
565                 'uploader': 'The National Archives UK',
566                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
567                 'uploader_id': 'NationalArchives08',
568                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
569             },
570         },
571         # rtl.nl embed
572         {
573             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
574             'playlist_mincount': 5,
575             'info_dict': {
576                 'id': 'aanslagen-kopenhagen',
577                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
578             }
579         },
580         # Zapiks embed
581         {
582             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
583             'info_dict': {
584                 'id': '118046',
585                 'ext': 'mp4',
586                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
587             }
588         },
589         # Kaltura embed
590         {
591             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
592             'info_dict': {
593                 'id': '1_eergr3h1',
594                 'ext': 'mp4',
595                 'upload_date': '20150226',
596                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
597                 'timestamp': int,
598                 'title': 'John Carlson Postgame 2/25/15',
599             },
600         },
601         # Eagle.Platform embed (generic URL)
602         {
603             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
604             'info_dict': {
605                 'id': '227304',
606                 'ext': 'mp4',
607                 'title': 'Навальный вышел на свободу',
608                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
609                 'thumbnail': 're:^https?://.*\.jpg$',
610                 'duration': 87,
611                 'view_count': int,
612                 'age_limit': 0,
613             },
614         },
615         # ClipYou (Eagle.Platform) embed (custom URL)
616         {
617             'url': 'http://muz-tv.ru/play/7129/',
618             'info_dict': {
619                 'id': '12820',
620                 'ext': 'mp4',
621                 'title': "'O Sole Mio",
622                 'thumbnail': 're:^https?://.*\.jpg$',
623                 'duration': 216,
624                 'view_count': int,
625             },
626         },
627         # Pladform embed
628         {
629             'url': 'http://muz-tv.ru/kinozal/view/7400/',
630             'info_dict': {
631                 'id': '100183293',
632                 'ext': 'mp4',
633                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
634                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
635                 'thumbnail': 're:^https?://.*\.jpg$',
636                 'duration': 694,
637                 'age_limit': 0,
638             },
639         },
640         # Playwire embed
641         {
642             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
643             'info_dict': {
644                 'id': '3519514',
645                 'ext': 'mp4',
646                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
647                 'thumbnail': 're:^https?://.*\.png$',
648                 'duration': 45.115,
649             },
650         },
651         # 5min embed
652         {
653             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
654             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
655             'info_dict': {
656                 'id': '518726732',
657                 'ext': 'mp4',
658                 'title': 'Facebook Creates "On This Day" | Crunch Report',
659             },
660         },
661         # RSS feed with enclosure
662         {
663             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
664             'info_dict': {
665                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
666                 'ext': 'm4v',
667                 'upload_date': '20150228',
668                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
669             }
670         },
671         # Crooks and Liars embed
672         {
673             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
674             'info_dict': {
675                 'id': '8RUoRhRi',
676                 'ext': 'mp4',
677                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
678                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
679                 'timestamp': 1428207000,
680                 'upload_date': '20150405',
681                 'uploader': 'Heather',
682             },
683         },
684         # Crooks and Liars external embed
685         {
686             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
687             'info_dict': {
688                 'id': 'MTE3MjUtMzQ2MzA',
689                 'ext': 'mp4',
690                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
691                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
692                 'timestamp': 1265032391,
693                 'upload_date': '20100201',
694                 'uploader': 'Heather',
695             },
696         },
697         # NBC Sports vplayer embed
698         {
699             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
700             'info_dict': {
701                 'id': 'ln7x1qSThw4k',
702                 'ext': 'flv',
703                 'title': "PFT Live: New leader in the 'new-look' defense",
704                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
705             },
706         },
707         # UDN embed
708         {
709             'url': 'http://www.udn.com/news/story/7314/822787',
710             'md5': 'fd2060e988c326991037b9aff9df21a6',
711             'info_dict': {
712                 'id': '300346',
713                 'ext': 'mp4',
714                 'title': '中一中男師變性 全校師生力挺',
715                 'thumbnail': 're:^https?://.*\.jpg$',
716             }
717         },
718         # Ooyala embed
719         {
720             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
721             'info_dict': {
722                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
723                 'ext': 'mp4',
724                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
725                 'title': 'This is what separates the Excel masters from the wannabes',
726             },
727             'params': {
728                 # m3u8 downloads
729                 'skip_download': True,
730             }
731         },
732         # Contains a SMIL manifest
733         {
734             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
735             'info_dict': {
736                 'id': 'file',
737                 'ext': 'flv',
738                 'title': '+ Football: Lottery Champions League Europe',
739                 'uploader': 'www.telewebion.com',
740             },
741             'params': {
742                 # rtmpe downloads
743                 'skip_download': True,
744             }
745         }
746     ]
747
748     def report_following_redirect(self, new_url):
749         """Report information extraction."""
750         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
751
752     def _extract_rss(self, url, video_id, doc):
753         playlist_title = doc.find('./channel/title').text
754         playlist_desc_el = doc.find('./channel/description')
755         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
756
757         entries = []
758         for it in doc.findall('./channel/item'):
759             next_url = xpath_text(it, 'link', fatal=False)
760             if not next_url:
761                 enclosure_nodes = it.findall('./enclosure')
762                 for e in enclosure_nodes:
763                     next_url = e.attrib.get('url')
764                     if next_url:
765                         break
766
767             if not next_url:
768                 continue
769
770             entries.append({
771                 '_type': 'url',
772                 'url': next_url,
773                 'title': it.find('title').text,
774             })
775
776         return {
777             '_type': 'playlist',
778             'id': url,
779             'title': playlist_title,
780             'description': playlist_desc,
781             'entries': entries,
782         }
783
784     def _extract_camtasia(self, url, video_id, webpage):
785         """ Returns None if no camtasia video can be found. """
786
787         camtasia_cfg = self._search_regex(
788             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
789             webpage, 'camtasia configuration file', default=None)
790         if camtasia_cfg is None:
791             return None
792
793         title = self._html_search_meta('DC.title', webpage, fatal=True)
794
795         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
796         camtasia_cfg = self._download_xml(
797             camtasia_url, video_id,
798             note='Downloading camtasia configuration',
799             errnote='Failed to download camtasia configuration')
800         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
801
802         entries = []
803         for n in fileset_node.getchildren():
804             url_n = n.find('./uri')
805             if url_n is None:
806                 continue
807
808             entries.append({
809                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
810                 'title': '%s - %s' % (title, n.tag),
811                 'url': compat_urlparse.urljoin(url, url_n.text),
812                 'duration': float_or_none(n.find('./duration').text),
813             })
814
815         return {
816             '_type': 'playlist',
817             'entries': entries,
818             'title': title,
819         }
820
821     def _real_extract(self, url):
822         if url.startswith('//'):
823             return {
824                 '_type': 'url',
825                 'url': self.http_scheme() + url,
826             }
827
828         parsed_url = compat_urlparse.urlparse(url)
829         if not parsed_url.scheme:
830             default_search = self._downloader.params.get('default_search')
831             if default_search is None:
832                 default_search = 'fixup_error'
833
834             if default_search in ('auto', 'auto_warning', 'fixup_error'):
835                 if '/' in url:
836                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
837                     return self.url_result('http://' + url)
838                 elif default_search != 'fixup_error':
839                     if default_search == 'auto_warning':
840                         if re.match(r'^(?:url|URL)$', url):
841                             raise ExtractorError(
842                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
843                                 expected=True)
844                         else:
845                             self._downloader.report_warning(
846                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
847                     return self.url_result('ytsearch:' + url)
848
849             if default_search in ('error', 'fixup_error'):
850                 raise ExtractorError(
851                     '%r is not a valid URL. '
852                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
853                     % (url, url), expected=True)
854             else:
855                 if ':' not in default_search:
856                     default_search += ':'
857                 return self.url_result(default_search + url)
858
859         url, smuggled_data = unsmuggle_url(url)
860         force_videoid = None
861         is_intentional = smuggled_data and smuggled_data.get('to_generic')
862         if smuggled_data and 'force_videoid' in smuggled_data:
863             force_videoid = smuggled_data['force_videoid']
864             video_id = force_videoid
865         else:
866             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
867
868         self.to_screen('%s: Requesting header' % video_id)
869
870         head_req = HEADRequest(url)
871         head_response = self._request_webpage(
872             head_req, video_id,
873             note=False, errnote='Could not send HEAD request to %s' % url,
874             fatal=False)
875
876         if head_response is not False:
877             # Check for redirect
878             new_url = head_response.geturl()
879             if url != new_url:
880                 self.report_following_redirect(new_url)
881                 if force_videoid:
882                     new_url = smuggle_url(
883                         new_url, {'force_videoid': force_videoid})
884                 return self.url_result(new_url)
885
886         full_response = None
887         if head_response is False:
888             full_response = self._request_webpage(url, video_id)
889             head_response = full_response
890
891         # Check for direct link to a video
892         content_type = head_response.headers.get('Content-Type', '')
893         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
894         if m:
895             upload_date = unified_strdate(
896                 head_response.headers.get('Last-Modified'))
897             return {
898                 'id': video_id,
899                 'title': os.path.splitext(url_basename(url))[0],
900                 'direct': True,
901                 'formats': [{
902                     'format_id': m.group('format_id'),
903                     'url': url,
904                     'vcodec': 'none' if m.group('type') == 'audio' else None
905                 }],
906                 'upload_date': upload_date,
907             }
908
909         if not self._downloader.params.get('test', False) and not is_intentional:
910             self._downloader.report_warning('Falling back on generic information extractor.')
911
912         if not full_response:
913             full_response = self._request_webpage(url, video_id)
914
915         # Maybe it's a direct link to a video?
916         # Be careful not to download the whole thing!
917         first_bytes = full_response.read(512)
918         if not is_html(first_bytes):
919             self._downloader.report_warning(
920                 'URL could be a direct video link, returning it as such.')
921             upload_date = unified_strdate(
922                 head_response.headers.get('Last-Modified'))
923             return {
924                 'id': video_id,
925                 'title': os.path.splitext(url_basename(url))[0],
926                 'direct': True,
927                 'url': url,
928                 'upload_date': upload_date,
929             }
930
931         webpage = self._webpage_read_content(
932             full_response, url, video_id, prefix=first_bytes)
933
934         self.report_extraction(video_id)
935
936         # Is it an RSS feed?
937         try:
938             doc = parse_xml(webpage)
939             if doc.tag == 'rss':
940                 return self._extract_rss(url, video_id, doc)
941         except compat_xml_parse_error:
942             pass
943
944         # Is it a Camtasia project?
945         camtasia_res = self._extract_camtasia(url, video_id, webpage)
946         if camtasia_res is not None:
947             return camtasia_res
948
949         # Sometimes embedded video player is hidden behind percent encoding
950         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
951         # Unescaping the whole page allows to handle those cases in a generic way
952         webpage = compat_urllib_parse.unquote(webpage)
953
954         # it's tempting to parse this further, but you would
955         # have to take into account all the variations like
956         #   Video Title - Site Name
957         #   Site Name | Video Title
958         #   Video Title - Tagline | Site Name
959         # and so on and so forth; it's just not practical
960         video_title = self._html_search_regex(
961             r'(?s)<title>(.*?)</title>', webpage, 'video title',
962             default='video')
963
964         # Try to detect age limit automatically
965         age_limit = self._rta_search(webpage)
966         # And then there are the jokers who advertise that they use RTA,
967         # but actually don't.
968         AGE_LIMIT_MARKERS = [
969             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
970         ]
971         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
972             age_limit = 18
973
974         # video uploader is domain name
975         video_uploader = self._search_regex(
976             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
977
978         # Helper method
979         def _playlist_from_matches(matches, getter=None, ie=None):
980             urlrs = orderedSet(
981                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
982                 for m in matches)
983             return self.playlist_result(
984                 urlrs, playlist_id=video_id, playlist_title=video_title)
985
986         # Look for BrightCove:
987         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
988         if bc_urls:
989             self.to_screen('Brightcove video detected.')
990             entries = [{
991                 '_type': 'url',
992                 'url': smuggle_url(bc_url, {'Referer': url}),
993                 'ie_key': 'Brightcove'
994             } for bc_url in bc_urls]
995
996             return {
997                 '_type': 'playlist',
998                 'title': video_title,
999                 'id': video_id,
1000                 'entries': entries,
1001             }
1002
1003         # Look for embedded rtl.nl player
1004         matches = re.findall(
1005             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
1006             webpage)
1007         if matches:
1008             return _playlist_from_matches(matches, ie='RtlNl')
1009
1010         # Look for embedded (iframe) Vimeo player
1011         mobj = re.search(
1012             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1013         if mobj:
1014             player_url = unescapeHTML(mobj.group('url'))
1015             surl = smuggle_url(player_url, {'Referer': url})
1016             return self.url_result(surl)
1017         # Look for embedded (swf embed) Vimeo player
1018         mobj = re.search(
1019             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1020         if mobj:
1021             return self.url_result(mobj.group(1))
1022
1023         # Look for embedded YouTube player
1024         matches = re.findall(r'''(?x)
1025             (?:
1026                 <iframe[^>]+?src=|
1027                 data-video-url=|
1028                 <embed[^>]+?src=|
1029                 embedSWF\(?:\s*|
1030                 new\s+SWFObject\(
1031             )
1032             (["\'])
1033                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1034                 (?:embed|v|p)/.+?)
1035             \1''', webpage)
1036         if matches:
1037             return _playlist_from_matches(
1038                 matches, lambda m: unescapeHTML(m[1]))
1039
1040         # Look for lazyYT YouTube embed
1041         matches = re.findall(
1042             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1043         if matches:
1044             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1045
1046         # Look for embedded Dailymotion player
1047         matches = re.findall(
1048             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1049         if matches:
1050             return _playlist_from_matches(
1051                 matches, lambda m: unescapeHTML(m[1]))
1052
1053         # Look for embedded Dailymotion playlist player (#3822)
1054         m = re.search(
1055             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1056         if m:
1057             playlists = re.findall(
1058                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1059             if playlists:
1060                 return _playlist_from_matches(
1061                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1062
1063         # Look for embedded Wistia player
1064         match = re.search(
1065             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1066         if match:
1067             embed_url = self._proto_relative_url(
1068                 unescapeHTML(match.group('url')))
1069             return {
1070                 '_type': 'url_transparent',
1071                 'url': embed_url,
1072                 'ie_key': 'Wistia',
1073                 'uploader': video_uploader,
1074                 'title': video_title,
1075                 'id': video_id,
1076             }
1077
1078         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1079         if match:
1080             return {
1081                 '_type': 'url_transparent',
1082                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1083                 'ie_key': 'Wistia',
1084                 'uploader': video_uploader,
1085                 'title': video_title,
1086                 'id': match.group('id')
1087             }
1088
1089         # Look for embedded blip.tv player
1090         bliptv_url = BlipTVIE._extract_url(webpage)
1091         if bliptv_url:
1092             return self.url_result(bliptv_url, 'BlipTV')
1093
1094         # Look for embedded condenast player
1095         matches = re.findall(
1096             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1097             webpage)
1098         if matches:
1099             return {
1100                 '_type': 'playlist',
1101                 'entries': [{
1102                     '_type': 'url',
1103                     'ie_key': 'CondeNast',
1104                     'url': ma,
1105                 } for ma in matches],
1106                 'title': video_title,
1107                 'id': video_id,
1108             }
1109
1110         # Look for Bandcamp pages with custom domain
1111         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1112         if mobj is not None:
1113             burl = unescapeHTML(mobj.group(1))
1114             # Don't set the extractor because it can be a track url or an album
1115             return self.url_result(burl)
1116
1117         # Look for embedded Vevo player
1118         mobj = re.search(
1119             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1120         if mobj is not None:
1121             return self.url_result(mobj.group('url'))
1122
1123         # Look for embedded Viddler player
1124         mobj = re.search(
1125             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1126             webpage)
1127         if mobj is not None:
1128             return self.url_result(mobj.group('url'))
1129
1130         # Look for NYTimes player
1131         mobj = re.search(
1132             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1133             webpage)
1134         if mobj is not None:
1135             return self.url_result(mobj.group('url'))
1136
1137         # Look for Libsyn player
1138         mobj = re.search(
1139             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1140         if mobj is not None:
1141             return self.url_result(mobj.group('url'))
1142
1143         # Look for Ooyala videos
1144         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1145                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1146                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1147                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1148         if mobj is not None:
1149             return OoyalaIE._build_url_result(mobj.group('ec'))
1150
1151         # Look for multiple Ooyala embeds on SBN network websites
1152         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1153         if mobj is not None:
1154             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1155             if embeds:
1156                 return _playlist_from_matches(
1157                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1158
1159         # Look for Aparat videos
1160         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1161         if mobj is not None:
1162             return self.url_result(mobj.group(1), 'Aparat')
1163
1164         # Look for MPORA videos
1165         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1166         if mobj is not None:
1167             return self.url_result(mobj.group(1), 'Mpora')
1168
1169         # Look for embedded NovaMov-based player
1170         mobj = re.search(
1171             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1172                     (?P<url>http://(?:(?:embed|www)\.)?
1173                         (?:novamov\.com|
1174                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1175                            videoweed\.(?:es|com)|
1176                            movshare\.(?:net|sx|ag)|
1177                            divxstage\.(?:eu|net|ch|co|at|ag))
1178                         /embed\.php.+?)\1''', webpage)
1179         if mobj is not None:
1180             return self.url_result(mobj.group('url'))
1181
1182         # Look for embedded Facebook player
1183         mobj = re.search(
1184             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1185         if mobj is not None:
1186             return self.url_result(mobj.group('url'), 'Facebook')
1187
1188         # Look for embedded VK player
1189         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1190         if mobj is not None:
1191             return self.url_result(mobj.group('url'), 'VK')
1192
1193         # Look for embedded ivi player
1194         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1195         if mobj is not None:
1196             return self.url_result(mobj.group('url'), 'Ivi')
1197
1198         # Look for embedded Huffington Post player
1199         mobj = re.search(
1200             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1201         if mobj is not None:
1202             return self.url_result(mobj.group('url'), 'HuffPost')
1203
1204         # Look for embed.ly
1205         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1206         if mobj is not None:
1207             return self.url_result(mobj.group('url'))
1208         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1209         if mobj is not None:
1210             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1211
1212         # Look for funnyordie embed
1213         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1214         if matches:
1215             return _playlist_from_matches(
1216                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1217
1218         # Look for BBC iPlayer embed
1219         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1220         if matches:
1221             return _playlist_from_matches(matches, ie='BBCCoUk')
1222
1223         # Look for embedded RUTV player
1224         rutv_url = RUTVIE._extract_url(webpage)
1225         if rutv_url:
1226             return self.url_result(rutv_url, 'RUTV')
1227
1228         # Look for embedded TED player
1229         mobj = re.search(
1230             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1231         if mobj is not None:
1232             return self.url_result(mobj.group('url'), 'TED')
1233
1234         # Look for embedded Ustream videos
1235         mobj = re.search(
1236             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1237         if mobj is not None:
1238             return self.url_result(mobj.group('url'), 'Ustream')
1239
1240         # Look for embedded arte.tv player
1241         mobj = re.search(
1242             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1243             webpage)
1244         if mobj is not None:
1245             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1246
1247         # Look for embedded smotri.com player
1248         smotri_url = SmotriIE._extract_url(webpage)
1249         if smotri_url:
1250             return self.url_result(smotri_url, 'Smotri')
1251
1252         # Look for embeded soundcloud player
1253         mobj = re.search(
1254             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1255             webpage)
1256         if mobj is not None:
1257             url = unescapeHTML(mobj.group('url'))
1258             return self.url_result(url)
1259
1260         # Look for embedded vulture.com player
1261         mobj = re.search(
1262             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1263             webpage)
1264         if mobj is not None:
1265             url = unescapeHTML(mobj.group('url'))
1266             return self.url_result(url, ie='Vulture')
1267
1268         # Look for embedded mtvservices player
1269         mobj = re.search(
1270             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1271             webpage)
1272         if mobj is not None:
1273             url = unescapeHTML(mobj.group('url'))
1274             return self.url_result(url, ie='MTVServicesEmbedded')
1275
1276         # Look for embedded yahoo player
1277         mobj = re.search(
1278             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1279             webpage)
1280         if mobj is not None:
1281             return self.url_result(mobj.group('url'), 'Yahoo')
1282
1283         # Look for embedded sbs.com.au player
1284         mobj = re.search(
1285             r'''(?x)
1286             (?:
1287                 <meta\s+property="og:video"\s+content=|
1288                 <iframe[^>]+?src=
1289             )
1290             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1291             webpage)
1292         if mobj is not None:
1293             return self.url_result(mobj.group('url'), 'SBS')
1294
1295         # Look for embedded Cinchcast player
1296         mobj = re.search(
1297             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1298             webpage)
1299         if mobj is not None:
1300             return self.url_result(mobj.group('url'), 'Cinchcast')
1301
1302         mobj = re.search(
1303             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1304             webpage)
1305         if not mobj:
1306             mobj = re.search(
1307                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1308                 webpage)
1309         if mobj is not None:
1310             return self.url_result(mobj.group('url'), 'MLB')
1311
1312         mobj = re.search(
1313             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1314             webpage)
1315         if mobj is not None:
1316             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1317
1318         mobj = re.search(
1319             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1320             webpage)
1321         if mobj is not None:
1322             return self.url_result(mobj.group('url'), 'Livestream')
1323
1324         # Look for Zapiks embed
1325         mobj = re.search(
1326             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1327         if mobj is not None:
1328             return self.url_result(mobj.group('url'), 'Zapiks')
1329
1330         # Look for Kaltura embeds
1331         mobj = re.search(
1332             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1333         if mobj is not None:
1334             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1335
1336         # Look for Eagle.Platform embeds
1337         mobj = re.search(
1338             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1339         if mobj is not None:
1340             return self.url_result(mobj.group('url'), 'EaglePlatform')
1341
1342         # Look for ClipYou (uses Eagle.Platform) embeds
1343         mobj = re.search(
1344             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1345         if mobj is not None:
1346             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1347
1348         # Look for Pladform embeds
1349         mobj = re.search(
1350             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1351         if mobj is not None:
1352             return self.url_result(mobj.group('url'), 'Pladform')
1353
1354         # Look for Playwire embeds
1355         mobj = re.search(
1356             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1357         if mobj is not None:
1358             return self.url_result(mobj.group('url'))
1359
1360         # Look for 5min embeds
1361         mobj = re.search(
1362             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1363         if mobj is not None:
1364             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1365
1366         # Look for Crooks and Liars embeds
1367         mobj = re.search(
1368             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1369         if mobj is not None:
1370             return self.url_result(mobj.group('url'))
1371
1372         # Look for NBC Sports VPlayer embeds
1373         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1374         if nbc_sports_url:
1375             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1376
1377         # Look for UDN embeds
1378         mobj = re.search(
1379             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1380         if mobj is not None:
1381             return self.url_result(
1382                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1383
1384         # Look for Senate ISVP iframe
1385         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1386         if senate_isvp_url:
1387             return self.url_result(surl, 'SenateISVP')
1388
1389         def check_video(vurl):
1390             if YoutubeIE.suitable(vurl):
1391                 return True
1392             vpath = compat_urlparse.urlparse(vurl).path
1393             vext = determine_ext(vpath)
1394             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1395
1396         def filter_video(urls):
1397             return list(filter(check_video, urls))
1398
1399         # Start with something easy: JW Player in SWFObject
1400         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1401         if not found:
1402             # Look for gorilla-vid style embedding
1403             found = filter_video(re.findall(r'''(?sx)
1404                 (?:
1405                     jw_plugins|
1406                     JWPlayerOptions|
1407                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1408                 )
1409                 .*?
1410                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1411         if not found:
1412             # Broaden the search a little bit
1413             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1414         if not found:
1415             # Broaden the findall a little bit: JWPlayer JS loader
1416             found = filter_video(re.findall(
1417                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1418         if not found:
1419             # Flow player
1420             found = filter_video(re.findall(r'''(?xs)
1421                 flowplayer\("[^"]+",\s*
1422                     \{[^}]+?\}\s*,
1423                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1424                         ["']?url["']?\s*:\s*["']([^"']+)["']
1425             ''', webpage))
1426         if not found:
1427             # Cinerama player
1428             found = re.findall(
1429                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1430         if not found:
1431             # Try to find twitter cards info
1432             found = filter_video(re.findall(
1433                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1434         if not found:
1435             # We look for Open Graph info:
1436             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1437             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1438             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1439             if m_video_type is not None:
1440                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1441         if not found:
1442             # HTML5 video
1443             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1444         if not found:
1445             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1446             found = re.search(
1447                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1448                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1449                 webpage)
1450             if not found:
1451                 # Look also in Refresh HTTP header
1452                 refresh_header = head_response.headers.get('Refresh')
1453                 if refresh_header:
1454                     found = re.search(REDIRECT_REGEX, refresh_header)
1455             if found:
1456                 new_url = compat_urlparse.urljoin(url, found.group(1))
1457                 self.report_following_redirect(new_url)
1458                 return {
1459                     '_type': 'url',
1460                     'url': new_url,
1461                 }
1462         if not found:
1463             raise UnsupportedError(url)
1464
1465         entries = []
1466         for video_url in found:
1467             video_url = compat_urlparse.urljoin(url, video_url)
1468             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1469
1470             # Sometimes, jwplayer extraction will result in a YouTube URL
1471             if YoutubeIE.suitable(video_url):
1472                 entries.append(self.url_result(video_url, 'Youtube'))
1473                 continue
1474
1475             # here's a fun little line of code for you:
1476             video_id = os.path.splitext(video_id)[0]
1477
1478             if determine_ext(video_url) == 'smil':
1479                 entries.append({
1480                     'id': video_id,
1481                     'formats': self._extract_smil_formats(video_url, video_id),
1482                     'uploader': video_uploader,
1483                     'title': video_title,
1484                     'age_limit': age_limit,
1485                 })
1486             else:
1487                 entries.append({
1488                     'id': video_id,
1489                     'url': video_url,
1490                     'uploader': video_uploader,
1491                     'title': video_title,
1492                     'age_limit': age_limit,
1493                 })
1494
1495         if len(entries) == 1:
1496             return entries[0]
1497         else:
1498             for num, e in enumerate(entries, start=1):
1499                 # 'url' results don't have a title
1500                 if e.get('title') is not None:
1501                     e['title'] = '%s (%d)' % (e['title'], num)
1502             return {
1503                 '_type': 'playlist',
1504                 'entries': entries,
1505             }