[NBC/ThePlatform/Generic] Add a generic detector for NBCSportsVPlayer and enhance...
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urlparse,
13     compat_xml_parse_error,
14 )
15 from ..utils import (
16     determine_ext,
17     ExtractorError,
18     float_or_none,
19     HEADRequest,
20     is_html,
21     orderedSet,
22     parse_xml,
23     smuggle_url,
24     unescapeHTML,
25     unified_strdate,
26     unsmuggle_url,
27     UnsupportedError,
28     url_basename,
29     xpath_text,
30 )
31 from .brightcove import BrightcoveIE
32 from .nbc import NBCSportsVPlayerIE
33 from .ooyala import OoyalaIE
34 from .rutv import RUTVIE
35 from .smotri import SmotriIE
36 from .condenast import CondeNastIE
37
38
39 class GenericIE(InfoExtractor):
40     IE_DESC = 'Generic downloader that works on some sites'
41     _VALID_URL = r'.*'
42     IE_NAME = 'generic'
43     _TESTS = [
44         {
45             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
46             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
47             'info_dict': {
48                 'id': '13601338388002',
49                 'ext': 'mp4',
50                 'uploader': 'www.hodiho.fr',
51                 'title': 'R\u00e9gis plante sa Jeep',
52             }
53         },
54         # bandcamp page with custom domain
55         {
56             'add_ie': ['Bandcamp'],
57             'url': 'http://bronyrock.com/track/the-pony-mash',
58             'info_dict': {
59                 'id': '3235767654',
60                 'ext': 'mp3',
61                 'title': 'The Pony Mash',
62                 'uploader': 'M_Pallante',
63             },
64             'skip': 'There is a limit of 200 free downloads / month for the test song',
65         },
66         # embedded brightcove video
67         # it also tests brightcove videos that need to set the 'Referer' in the
68         # http requests
69         {
70             'add_ie': ['Brightcove'],
71             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
72             'info_dict': {
73                 'id': '2765128793001',
74                 'ext': 'mp4',
75                 'title': 'Le cours de bourse : l’analyse technique',
76                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
77                 'uploader': 'BFM BUSINESS',
78             },
79             'params': {
80                 'skip_download': True,
81             },
82         },
83         {
84             # https://github.com/rg3/youtube-dl/issues/2253
85             'url': 'http://bcove.me/i6nfkrc3',
86             'md5': '0ba9446db037002366bab3b3eb30c88c',
87             'info_dict': {
88                 'id': '3101154703001',
89                 'ext': 'mp4',
90                 'title': 'Still no power',
91                 'uploader': 'thestar.com',
92                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
93             },
94             'add_ie': ['Brightcove'],
95         },
96         {
97             'url': 'http://www.championat.com/video/football/v/87/87499.html',
98             'md5': 'fb973ecf6e4a78a67453647444222983',
99             'info_dict': {
100                 'id': '3414141473001',
101                 'ext': 'mp4',
102                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
103                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
104                 'uploader': 'Championat',
105             },
106         },
107         {
108             # https://github.com/rg3/youtube-dl/issues/3541
109             'add_ie': ['Brightcove'],
110             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
111             'info_dict': {
112                 'id': '3866516442001',
113                 'ext': 'mp4',
114                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
115                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
116                 'uploader': 'SBS Broadcasting',
117             },
118             'skip': 'Restricted to Netherlands',
119             'params': {
120                 'skip_download': True,  # m3u8 download
121             },
122         },
123         # Direct link to a video
124         {
125             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
126             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
127             'info_dict': {
128                 'id': 'trailer',
129                 'ext': 'mp4',
130                 'title': 'trailer',
131                 'upload_date': '20100513',
132             }
133         },
134         # ooyala video
135         {
136             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
137             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
138             'info_dict': {
139                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
140                 'ext': 'mp4',
141                 'title': '2cc213299525360.mov',  # that's what we get
142             },
143             'add_ie': ['Ooyala'],
144         },
145         # multiple ooyala embeds on SBN network websites
146         {
147             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
148             'info_dict': {
149                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
150                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
151             },
152             'playlist_mincount': 3,
153             'params': {
154                 'skip_download': True,
155             },
156             'add_ie': ['Ooyala'],
157         },
158         # google redirect
159         {
160             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
161             'info_dict': {
162                 'id': 'cmQHVoWB5FY',
163                 'ext': 'mp4',
164                 'upload_date': '20130224',
165                 'uploader_id': 'TheVerge',
166                 'description': 're:^Chris Ziegler takes a look at the\.*',
167                 'uploader': 'The Verge',
168                 'title': 'First Firefox OS phones side-by-side',
169             },
170             'params': {
171                 'skip_download': False,
172             }
173         },
174         # embed.ly video
175         {
176             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
177             'info_dict': {
178                 'id': '9ODmcdjQcHQ',
179                 'ext': 'mp4',
180                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
181                 'upload_date': '20140225',
182                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
183                 'uploader': 'Tested',
184                 'uploader_id': 'testedcom',
185             },
186             # No need to test YoutubeIE here
187             'params': {
188                 'skip_download': True,
189             },
190         },
191         # funnyordie embed
192         {
193             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
194             'info_dict': {
195                 'id': '18e820ec3f',
196                 'ext': 'mp4',
197                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
198                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
199             },
200         },
201         # BBC iPlayer embeds
202         {
203             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
204             'info_dict': {
205                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
206             },
207             'playlist_mincount': 18,
208         },
209         # RUTV embed
210         {
211             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
212             'info_dict': {
213                 'id': '776940',
214                 'ext': 'mp4',
215                 'title': 'Охотское море стало целиком российским',
216                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
217             },
218             'params': {
219                 # m3u8 download
220                 'skip_download': True,
221             },
222         },
223         # Embedded TED video
224         {
225             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
226             'md5': '65fdff94098e4a607385a60c5177c638',
227             'info_dict': {
228                 'id': '1969',
229                 'ext': 'mp4',
230                 'title': 'Hidden miracles of the natural world',
231                 'uploader': 'Louie Schwartzberg',
232                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
233             }
234         },
235         # Embedded Ustream video
236         {
237             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
238             'md5': '27b99cdb639c9b12a79bca876a073417',
239             'info_dict': {
240                 'id': '45734260',
241                 'ext': 'flv',
242                 'uploader': 'AU SPA:  The NSA and Privacy',
243                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
244             }
245         },
246         # nowvideo embed hidden behind percent encoding
247         {
248             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
249             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
250             'info_dict': {
251                 'id': '06e53103ca9aa',
252                 'ext': 'flv',
253                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
254                 'description': 'No description',
255             },
256         },
257         # arte embed
258         {
259             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
260             'md5': '7653032cbb25bf6c80d80f217055fa43',
261             'info_dict': {
262                 'id': '048195-004_PLUS7-F',
263                 'ext': 'flv',
264                 'title': 'X:enius',
265                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
266                 'upload_date': '20140320',
267             },
268             'params': {
269                 'skip_download': 'Requires rtmpdump'
270             }
271         },
272         # Condé Nast embed
273         {
274             'url': 'http://www.wired.com/2014/04/honda-asimo/',
275             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
276             'info_dict': {
277                 'id': '53501be369702d3275860000',
278                 'ext': 'mp4',
279                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
280             }
281         },
282         # Dailymotion embed
283         {
284             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
285             'md5': '441aeeb82eb72c422c7f14ec533999cd',
286             'info_dict': {
287                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
288                 'ext': 'mp4',
289                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
290                 'uploader': 'Spi0n',
291             },
292             'add_ie': ['Dailymotion'],
293         },
294         # YouTube embed
295         {
296             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
297             'info_dict': {
298                 'id': 'FXRb4ykk4S0',
299                 'ext': 'mp4',
300                 'title': 'The NBL Auction 2014',
301                 'uploader': 'BADMINTON England',
302                 'uploader_id': 'BADMINTONEvents',
303                 'upload_date': '20140603',
304                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
305             },
306             'add_ie': ['Youtube'],
307             'params': {
308                 'skip_download': True,
309             }
310         },
311         # MTVServices embed
312         {
313             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
314             'md5': '35727f82f58c76d996fc188f9755b0d5',
315             'info_dict': {
316                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
317                 'ext': 'mp4',
318                 'title': 'Review',
319                 'description': 'Mario\'s life in the fast lane has never looked so good.',
320             },
321         },
322         # YouTube embed via <data-embed-url="">
323         {
324             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
325             'info_dict': {
326                 'id': '4vAffPZIT44',
327                 'ext': 'mp4',
328                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
329                 'uploader': 'Gameloft',
330                 'uploader_id': 'gameloft',
331                 'upload_date': '20140828',
332                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
333             },
334             'params': {
335                 'skip_download': True,
336             }
337         },
338         # Camtasia studio
339         {
340             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
341             'playlist': [{
342                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
343                 'info_dict': {
344                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
345                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
346                     'ext': 'flv',
347                     'duration': 2235.90,
348                 }
349             }, {
350                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
351                 'info_dict': {
352                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
353                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
354                     'ext': 'flv',
355                     'duration': 2235.93,
356                 }
357             }],
358             'info_dict': {
359                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
360             }
361         },
362         # Flowplayer
363         {
364             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
365             'md5': '9d65602bf31c6e20014319c7d07fba27',
366             'info_dict': {
367                 'id': '5123ea6d5e5a7',
368                 'ext': 'mp4',
369                 'age_limit': 18,
370                 'uploader': 'www.handjobhub.com',
371                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
372             }
373         },
374         # RSS feed
375         {
376             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
377             'info_dict': {
378                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
379                 'title': 'Zero Punctuation',
380                 'description': 're:.*groundbreaking video review series.*'
381             },
382             'playlist_mincount': 11,
383         },
384         # Multiple brightcove videos
385         # https://github.com/rg3/youtube-dl/issues/2283
386         {
387             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
388             'info_dict': {
389                 'id': 'always-never',
390                 'title': 'Always / Never - The New Yorker',
391             },
392             'playlist_count': 3,
393             'params': {
394                 'extract_flat': False,
395                 'skip_download': True,
396             }
397         },
398         # MLB embed
399         {
400             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
401             'md5': '96f09a37e44da40dd083e12d9a683327',
402             'info_dict': {
403                 'id': '33322633',
404                 'ext': 'mp4',
405                 'title': 'Ump changes call to ball',
406                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
407                 'duration': 48,
408                 'timestamp': 1401537900,
409                 'upload_date': '20140531',
410                 'thumbnail': 're:^https?://.*\.jpg$',
411             },
412         },
413         # Wistia embed
414         {
415             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
416             'md5': '8788b683c777a5cf25621eaf286d0c23',
417             'info_dict': {
418                 'id': '1cfaf6b7ea',
419                 'ext': 'mov',
420                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
421                 'duration': 643.0,
422                 'filesize': 182808282,
423                 'uploader': 'education-portal.com',
424             },
425         },
426         {
427             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
428             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
429             'info_dict': {
430                 'id': 'uxjb0lwrcz',
431                 'ext': 'mp4',
432                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
433                 'duration': 1715.0,
434                 'uploader': 'thoughtworks.wistia.com',
435             },
436         },
437         # Direct download with broken HEAD
438         {
439             'url': 'http://ai-radio.org:8000/radio.opus',
440             'info_dict': {
441                 'id': 'radio',
442                 'ext': 'opus',
443                 'title': 'radio',
444             },
445             'params': {
446                 'skip_download': True,  # infinite live stream
447             },
448             'expected_warnings': [
449                 r'501.*Not Implemented'
450             ],
451         },
452         # Soundcloud embed
453         {
454             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
455             'info_dict': {
456                 'id': '174391317',
457                 'ext': 'mp3',
458                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
459                 'uploader': 'Sophos Security',
460                 'title': 'Chet Chat 171 - Oct 29, 2014',
461                 'upload_date': '20141029',
462             }
463         },
464         # Livestream embed
465         {
466             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
467             'info_dict': {
468                 'id': '67864563',
469                 'ext': 'flv',
470                 'upload_date': '20141112',
471                 'title': 'Rosetta #CometLanding webcast HL 10',
472             }
473         },
474         # LazyYT
475         {
476             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
477             'info_dict': {
478                 'id': '1986',
479                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
480             },
481             'playlist_mincount': 2,
482         },
483         # Direct link with incorrect MIME type
484         {
485             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
486             'md5': '4ccbebe5f36706d85221f204d7eb5913',
487             'info_dict': {
488                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
489                 'id': '5_Lennart_Poettering_-_Systemd',
490                 'ext': 'webm',
491                 'title': '5_Lennart_Poettering_-_Systemd',
492                 'upload_date': '20141120',
493             },
494             'expected_warnings': [
495                 'URL could be a direct video link, returning it as such.'
496             ]
497         },
498         # Cinchcast embed
499         {
500             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
501             'info_dict': {
502                 'id': '7141703',
503                 'ext': 'mp3',
504                 'upload_date': '20141126',
505                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
506             }
507         },
508         # Cinerama player
509         {
510             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
511             'info_dict': {
512                 'id': '730m_DandD_1901_512k',
513                 'ext': 'mp4',
514                 'uploader': 'www.abc.net.au',
515                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
516             }
517         },
518         # embedded viddler video
519         {
520             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
521             'info_dict': {
522                 'id': '4d03aad9',
523                 'ext': 'mp4',
524                 'uploader': 'deadspin',
525                 'title': 'WALL-TO-GORTAT',
526                 'timestamp': 1422285291,
527                 'upload_date': '20150126',
528             },
529             'add_ie': ['Viddler'],
530         },
531         # Libsyn embed
532         {
533             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
534             'info_dict': {
535                 'id': '3377616',
536                 'ext': 'mp3',
537                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
538                 'description': 'md5:601cb790edd05908957dae8aaa866465',
539                 'upload_date': '20150220',
540             },
541         },
542         # jwplayer YouTube
543         {
544             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
545             'info_dict': {
546                 'id': 'Mrj4DVp2zeA',
547                 'ext': 'mp4',
548                 'upload_date': '20150212',
549                 'uploader': 'The National Archives UK',
550                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
551                 'uploader_id': 'NationalArchives08',
552                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
553             },
554         },
555         # rtl.nl embed
556         {
557             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
558             'playlist_mincount': 5,
559             'info_dict': {
560                 'id': 'aanslagen-kopenhagen',
561                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
562             }
563         },
564         # Zapiks embed
565         {
566             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
567             'info_dict': {
568                 'id': '118046',
569                 'ext': 'mp4',
570                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
571             }
572         },
573         # Kaltura embed
574         {
575             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
576             'info_dict': {
577                 'id': '1_eergr3h1',
578                 'ext': 'mp4',
579                 'upload_date': '20150226',
580                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
581                 'timestamp': int,
582                 'title': 'John Carlson Postgame 2/25/15',
583             },
584         },
585         # Eagle.Platform embed (generic URL)
586         {
587             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
588             'info_dict': {
589                 'id': '227304',
590                 'ext': 'mp4',
591                 'title': 'Навальный вышел на свободу',
592                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
593                 'thumbnail': 're:^https?://.*\.jpg$',
594                 'duration': 87,
595                 'view_count': int,
596                 'age_limit': 0,
597             },
598         },
599         # ClipYou (Eagle.Platform) embed (custom URL)
600         {
601             'url': 'http://muz-tv.ru/play/7129/',
602             'info_dict': {
603                 'id': '12820',
604                 'ext': 'mp4',
605                 'title': "'O Sole Mio",
606                 'thumbnail': 're:^https?://.*\.jpg$',
607                 'duration': 216,
608                 'view_count': int,
609             },
610         },
611         # Pladform embed
612         {
613             'url': 'http://muz-tv.ru/kinozal/view/7400/',
614             'info_dict': {
615                 'id': '100183293',
616                 'ext': 'mp4',
617                 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
618                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
619                 'thumbnail': 're:^https?://.*\.jpg$',
620                 'duration': 694,
621                 'age_limit': 0,
622             },
623         },
624         # 5min embed
625         {
626             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
627             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
628             'info_dict': {
629                 'id': '518726732',
630                 'ext': 'mp4',
631                 'title': 'Facebook Creates "On This Day" | Crunch Report',
632             },
633         },
634         # RSS feed with enclosure
635         {
636             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
637             'info_dict': {
638                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
639                 'ext': 'm4v',
640                 'upload_date': '20150228',
641                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
642             }
643         },
644         # NBC Sports vplayer embeds
645         {
646             'url': 'http://bbs.clutchfans.net/showthread.php?t=244180',
647             'info_dict': {
648                 'id': '_hqLjQ95yx8Z',
649                 'ext': 'flv'
650             },
651             'skip': 'This content expired on 9/17/14 12:23 PM',
652         }
653     ]
654
655     def report_following_redirect(self, new_url):
656         """Report information extraction."""
657         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
658
659     def _extract_rss(self, url, video_id, doc):
660         playlist_title = doc.find('./channel/title').text
661         playlist_desc_el = doc.find('./channel/description')
662         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
663
664         entries = []
665         for it in doc.findall('./channel/item'):
666             next_url = xpath_text(it, 'link', fatal=False)
667             if not next_url:
668                 enclosure_nodes = it.findall('./enclosure')
669                 for e in enclosure_nodes:
670                     next_url = e.attrib.get('url')
671                     if next_url:
672                         break
673
674             if not next_url:
675                 continue
676
677             entries.append({
678                 '_type': 'url',
679                 'url': next_url,
680                 'title': it.find('title').text,
681             })
682
683         return {
684             '_type': 'playlist',
685             'id': url,
686             'title': playlist_title,
687             'description': playlist_desc,
688             'entries': entries,
689         }
690
691     def _extract_camtasia(self, url, video_id, webpage):
692         """ Returns None if no camtasia video can be found. """
693
694         camtasia_cfg = self._search_regex(
695             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
696             webpage, 'camtasia configuration file', default=None)
697         if camtasia_cfg is None:
698             return None
699
700         title = self._html_search_meta('DC.title', webpage, fatal=True)
701
702         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
703         camtasia_cfg = self._download_xml(
704             camtasia_url, video_id,
705             note='Downloading camtasia configuration',
706             errnote='Failed to download camtasia configuration')
707         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
708
709         entries = []
710         for n in fileset_node.getchildren():
711             url_n = n.find('./uri')
712             if url_n is None:
713                 continue
714
715             entries.append({
716                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
717                 'title': '%s - %s' % (title, n.tag),
718                 'url': compat_urlparse.urljoin(url, url_n.text),
719                 'duration': float_or_none(n.find('./duration').text),
720             })
721
722         return {
723             '_type': 'playlist',
724             'entries': entries,
725             'title': title,
726         }
727
728     def _real_extract(self, url):
729         if url.startswith('//'):
730             return {
731                 '_type': 'url',
732                 'url': self.http_scheme() + url,
733             }
734
735         parsed_url = compat_urlparse.urlparse(url)
736         if not parsed_url.scheme:
737             default_search = self._downloader.params.get('default_search')
738             if default_search is None:
739                 default_search = 'fixup_error'
740
741             if default_search in ('auto', 'auto_warning', 'fixup_error'):
742                 if '/' in url:
743                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
744                     return self.url_result('http://' + url)
745                 elif default_search != 'fixup_error':
746                     if default_search == 'auto_warning':
747                         if re.match(r'^(?:url|URL)$', url):
748                             raise ExtractorError(
749                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
750                                 expected=True)
751                         else:
752                             self._downloader.report_warning(
753                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
754                     return self.url_result('ytsearch:' + url)
755
756             if default_search in ('error', 'fixup_error'):
757                 raise ExtractorError(
758                     '%r is not a valid URL. '
759                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
760                     % (url, url), expected=True)
761             else:
762                 if ':' not in default_search:
763                     default_search += ':'
764                 return self.url_result(default_search + url)
765
766         url, smuggled_data = unsmuggle_url(url)
767         force_videoid = None
768         is_intentional = smuggled_data and smuggled_data.get('to_generic')
769         if smuggled_data and 'force_videoid' in smuggled_data:
770             force_videoid = smuggled_data['force_videoid']
771             video_id = force_videoid
772         else:
773             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
774
775         self.to_screen('%s: Requesting header' % video_id)
776
777         head_req = HEADRequest(url)
778         head_response = self._request_webpage(
779             head_req, video_id,
780             note=False, errnote='Could not send HEAD request to %s' % url,
781             fatal=False)
782
783         if head_response is not False:
784             # Check for redirect
785             new_url = head_response.geturl()
786             if url != new_url:
787                 self.report_following_redirect(new_url)
788                 if force_videoid:
789                     new_url = smuggle_url(
790                         new_url, {'force_videoid': force_videoid})
791                 return self.url_result(new_url)
792
793         full_response = None
794         if head_response is False:
795             full_response = self._request_webpage(url, video_id)
796             head_response = full_response
797
798         # Check for direct link to a video
799         content_type = head_response.headers.get('Content-Type', '')
800         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
801         if m:
802             upload_date = unified_strdate(
803                 head_response.headers.get('Last-Modified'))
804             return {
805                 'id': video_id,
806                 'title': os.path.splitext(url_basename(url))[0],
807                 'direct': True,
808                 'formats': [{
809                     'format_id': m.group('format_id'),
810                     'url': url,
811                     'vcodec': 'none' if m.group('type') == 'audio' else None
812                 }],
813                 'upload_date': upload_date,
814             }
815
816         if not self._downloader.params.get('test', False) and not is_intentional:
817             self._downloader.report_warning('Falling back on generic information extractor.')
818
819         if not full_response:
820             full_response = self._request_webpage(url, video_id)
821
822         # Maybe it's a direct link to a video?
823         # Be careful not to download the whole thing!
824         first_bytes = full_response.read(512)
825         if not is_html(first_bytes):
826             self._downloader.report_warning(
827                 'URL could be a direct video link, returning it as such.')
828             upload_date = unified_strdate(
829                 head_response.headers.get('Last-Modified'))
830             return {
831                 'id': video_id,
832                 'title': os.path.splitext(url_basename(url))[0],
833                 'direct': True,
834                 'url': url,
835                 'upload_date': upload_date,
836             }
837
838         webpage = self._webpage_read_content(
839             full_response, url, video_id, prefix=first_bytes)
840
841         self.report_extraction(video_id)
842
843         # Is it an RSS feed?
844         try:
845             doc = parse_xml(webpage)
846             if doc.tag == 'rss':
847                 return self._extract_rss(url, video_id, doc)
848         except compat_xml_parse_error:
849             pass
850
851         # Is it a Camtasia project?
852         camtasia_res = self._extract_camtasia(url, video_id, webpage)
853         if camtasia_res is not None:
854             return camtasia_res
855
856         # Sometimes embedded video player is hidden behind percent encoding
857         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
858         # Unescaping the whole page allows to handle those cases in a generic way
859         webpage = compat_urllib_parse.unquote(webpage)
860
861         # it's tempting to parse this further, but you would
862         # have to take into account all the variations like
863         #   Video Title - Site Name
864         #   Site Name | Video Title
865         #   Video Title - Tagline | Site Name
866         # and so on and so forth; it's just not practical
867         video_title = self._html_search_regex(
868             r'(?s)<title>(.*?)</title>', webpage, 'video title',
869             default='video')
870
871         # Try to detect age limit automatically
872         age_limit = self._rta_search(webpage)
873         # And then there are the jokers who advertise that they use RTA,
874         # but actually don't.
875         AGE_LIMIT_MARKERS = [
876             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
877         ]
878         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
879             age_limit = 18
880
881         # video uploader is domain name
882         video_uploader = self._search_regex(
883             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
884
885         # Helper method
886         def _playlist_from_matches(matches, getter=None, ie=None):
887             urlrs = orderedSet(
888                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
889                 for m in matches)
890             return self.playlist_result(
891                 urlrs, playlist_id=video_id, playlist_title=video_title)
892
893         # Look for BrightCove:
894         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
895         if bc_urls:
896             self.to_screen('Brightcove video detected.')
897             entries = [{
898                 '_type': 'url',
899                 'url': smuggle_url(bc_url, {'Referer': url}),
900                 'ie_key': 'Brightcove'
901             } for bc_url in bc_urls]
902
903             return {
904                 '_type': 'playlist',
905                 'title': video_title,
906                 'id': video_id,
907                 'entries': entries,
908             }
909
910         # Look for embedded rtl.nl player
911         matches = re.findall(
912             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
913             webpage)
914         if matches:
915             return _playlist_from_matches(matches, ie='RtlNl')
916
917         # Look for embedded (iframe) Vimeo player
918         mobj = re.search(
919             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
920         if mobj:
921             player_url = unescapeHTML(mobj.group('url'))
922             surl = smuggle_url(player_url, {'Referer': url})
923             return self.url_result(surl)
924         # Look for embedded (swf embed) Vimeo player
925         mobj = re.search(
926             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
927         if mobj:
928             return self.url_result(mobj.group(1))
929
930         # Look for embedded YouTube player
931         matches = re.findall(r'''(?x)
932             (?:
933                 <iframe[^>]+?src=|
934                 data-video-url=|
935                 <embed[^>]+?src=|
936                 embedSWF\(?:\s*|
937                 new\s+SWFObject\(
938             )
939             (["\'])
940                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
941                 (?:embed|v|p)/.+?)
942             \1''', webpage)
943         if matches:
944             return _playlist_from_matches(
945                 matches, lambda m: unescapeHTML(m[1]))
946
947         # Look for lazyYT YouTube embed
948         matches = re.findall(
949             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
950         if matches:
951             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
952
953         # Look for embedded Dailymotion player
954         matches = re.findall(
955             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
956         if matches:
957             return _playlist_from_matches(
958                 matches, lambda m: unescapeHTML(m[1]))
959
960         # Look for embedded Dailymotion playlist player (#3822)
961         m = re.search(
962             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
963         if m:
964             playlists = re.findall(
965                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
966             if playlists:
967                 return _playlist_from_matches(
968                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
969
970         # Look for embedded Wistia player
971         match = re.search(
972             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
973         if match:
974             embed_url = self._proto_relative_url(
975                 unescapeHTML(match.group('url')))
976             return {
977                 '_type': 'url_transparent',
978                 'url': embed_url,
979                 'ie_key': 'Wistia',
980                 'uploader': video_uploader,
981                 'title': video_title,
982                 'id': video_id,
983             }
984
985         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
986         if match:
987             return {
988                 '_type': 'url_transparent',
989                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
990                 'ie_key': 'Wistia',
991                 'uploader': video_uploader,
992                 'title': video_title,
993                 'id': match.group('id')
994             }
995
996         # Look for embedded blip.tv player
997         mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
998         if mobj:
999             return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1000         mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1001         if mobj:
1002             return self.url_result(mobj.group(1), 'BlipTV')
1003
1004         # Look for embedded condenast player
1005         matches = re.findall(
1006             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1007             webpage)
1008         if matches:
1009             return {
1010                 '_type': 'playlist',
1011                 'entries': [{
1012                     '_type': 'url',
1013                     'ie_key': 'CondeNast',
1014                     'url': ma,
1015                 } for ma in matches],
1016                 'title': video_title,
1017                 'id': video_id,
1018             }
1019
1020         # Look for Bandcamp pages with custom domain
1021         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1022         if mobj is not None:
1023             burl = unescapeHTML(mobj.group(1))
1024             # Don't set the extractor because it can be a track url or an album
1025             return self.url_result(burl)
1026
1027         # Look for embedded Vevo player
1028         mobj = re.search(
1029             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1030         if mobj is not None:
1031             return self.url_result(mobj.group('url'))
1032
1033         # Look for embedded Viddler player
1034         mobj = re.search(
1035             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1036             webpage)
1037         if mobj is not None:
1038             return self.url_result(mobj.group('url'))
1039
1040         # Look for NYTimes player
1041         mobj = re.search(
1042             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1043             webpage)
1044         if mobj is not None:
1045             return self.url_result(mobj.group('url'))
1046
1047         # Look for Libsyn player
1048         mobj = re.search(
1049             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1050         if mobj is not None:
1051             return self.url_result(mobj.group('url'))
1052
1053         # Look for Ooyala videos
1054         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1055                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1056                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage))
1057         if mobj is not None:
1058             return OoyalaIE._build_url_result(mobj.group('ec'))
1059
1060         # Look for multiple Ooyala embeds on SBN network websites
1061         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1062         if mobj is not None:
1063             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1064             if embeds:
1065                 return _playlist_from_matches(
1066                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1067
1068         # Look for Aparat videos
1069         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1070         if mobj is not None:
1071             return self.url_result(mobj.group(1), 'Aparat')
1072
1073         # Look for MPORA videos
1074         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1075         if mobj is not None:
1076             return self.url_result(mobj.group(1), 'Mpora')
1077
1078         # Look for embedded NovaMov-based player
1079         mobj = re.search(
1080             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1081                     (?P<url>http://(?:(?:embed|www)\.)?
1082                         (?:novamov\.com|
1083                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1084                            videoweed\.(?:es|com)|
1085                            movshare\.(?:net|sx|ag)|
1086                            divxstage\.(?:eu|net|ch|co|at|ag))
1087                         /embed\.php.+?)\1''', webpage)
1088         if mobj is not None:
1089             return self.url_result(mobj.group('url'))
1090
1091         # Look for embedded Facebook player
1092         mobj = re.search(
1093             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1094         if mobj is not None:
1095             return self.url_result(mobj.group('url'), 'Facebook')
1096
1097         # Look for embedded VK player
1098         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1099         if mobj is not None:
1100             return self.url_result(mobj.group('url'), 'VK')
1101
1102         # Look for embedded ivi player
1103         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1104         if mobj is not None:
1105             return self.url_result(mobj.group('url'), 'Ivi')
1106
1107         # Look for embedded Huffington Post player
1108         mobj = re.search(
1109             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1110         if mobj is not None:
1111             return self.url_result(mobj.group('url'), 'HuffPost')
1112
1113         # Look for embed.ly
1114         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1115         if mobj is not None:
1116             return self.url_result(mobj.group('url'))
1117         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1118         if mobj is not None:
1119             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1120
1121         # Look for funnyordie embed
1122         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1123         if matches:
1124             return _playlist_from_matches(
1125                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1126
1127         # Look for BBC iPlayer embed
1128         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1129         if matches:
1130             return _playlist_from_matches(matches, ie='BBCCoUk')
1131
1132         # Look for embedded RUTV player
1133         rutv_url = RUTVIE._extract_url(webpage)
1134         if rutv_url:
1135             return self.url_result(rutv_url, 'RUTV')
1136
1137         # Look for embedded TED player
1138         mobj = re.search(
1139             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1140         if mobj is not None:
1141             return self.url_result(mobj.group('url'), 'TED')
1142
1143         # Look for embedded Ustream videos
1144         mobj = re.search(
1145             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1146         if mobj is not None:
1147             return self.url_result(mobj.group('url'), 'Ustream')
1148
1149         # Look for embedded arte.tv player
1150         mobj = re.search(
1151             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1152             webpage)
1153         if mobj is not None:
1154             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1155
1156         # Look for embedded smotri.com player
1157         smotri_url = SmotriIE._extract_url(webpage)
1158         if smotri_url:
1159             return self.url_result(smotri_url, 'Smotri')
1160
1161         # Look for embeded soundcloud player
1162         mobj = re.search(
1163             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1164             webpage)
1165         if mobj is not None:
1166             url = unescapeHTML(mobj.group('url'))
1167             return self.url_result(url)
1168
1169         # Look for embedded vulture.com player
1170         mobj = re.search(
1171             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1172             webpage)
1173         if mobj is not None:
1174             url = unescapeHTML(mobj.group('url'))
1175             return self.url_result(url, ie='Vulture')
1176
1177         # Look for embedded mtvservices player
1178         mobj = re.search(
1179             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1180             webpage)
1181         if mobj is not None:
1182             url = unescapeHTML(mobj.group('url'))
1183             return self.url_result(url, ie='MTVServicesEmbedded')
1184
1185         # Look for embedded yahoo player
1186         mobj = re.search(
1187             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1188             webpage)
1189         if mobj is not None:
1190             return self.url_result(mobj.group('url'), 'Yahoo')
1191
1192         # Look for embedded sbs.com.au player
1193         mobj = re.search(
1194             r'''(?x)
1195             (?:
1196                 <meta\s+property="og:video"\s+content=|
1197                 <iframe[^>]+?src=
1198             )
1199             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1200             webpage)
1201         if mobj is not None:
1202             return self.url_result(mobj.group('url'), 'SBS')
1203
1204         # Look for embedded Cinchcast player
1205         mobj = re.search(
1206             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1207             webpage)
1208         if mobj is not None:
1209             return self.url_result(mobj.group('url'), 'Cinchcast')
1210
1211         mobj = re.search(
1212             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1213             webpage)
1214         if mobj is not None:
1215             return self.url_result(mobj.group('url'), 'MLB')
1216
1217         mobj = re.search(
1218             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1219             webpage)
1220         if mobj is not None:
1221             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1222
1223         mobj = re.search(
1224             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1225             webpage)
1226         if mobj is not None:
1227             return self.url_result(mobj.group('url'), 'Livestream')
1228
1229         # Look for Zapiks embed
1230         mobj = re.search(
1231             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1232         if mobj is not None:
1233             return self.url_result(mobj.group('url'), 'Zapiks')
1234
1235         # Look for Kaltura embeds
1236         mobj = re.search(
1237             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1238         if mobj is not None:
1239             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1240
1241         # Look for Eagle.Platform embeds
1242         mobj = re.search(
1243             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1244         if mobj is not None:
1245             return self.url_result(mobj.group('url'), 'EaglePlatform')
1246
1247         # Look for ClipYou (uses Eagle.Platform) embeds
1248         mobj = re.search(
1249             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1250         if mobj is not None:
1251             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1252
1253         # Look for Pladform embeds
1254         mobj = re.search(
1255             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1256         if mobj is not None:
1257             return self.url_result(mobj.group('url'), 'Pladform')
1258
1259         # Look for 5min embeds
1260         mobj = re.search(
1261             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1262         if mobj is not None:
1263             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1264
1265         # Look for NBC Sports VPlayer embeds
1266         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1267         if nbc_sports_url:
1268             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1269
1270         def check_video(vurl):
1271             if YoutubeIE.suitable(vurl):
1272                 return True
1273             vpath = compat_urlparse.urlparse(vurl).path
1274             vext = determine_ext(vpath)
1275             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1276
1277         def filter_video(urls):
1278             return list(filter(check_video, urls))
1279
1280         # Start with something easy: JW Player in SWFObject
1281         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1282         if not found:
1283             # Look for gorilla-vid style embedding
1284             found = filter_video(re.findall(r'''(?sx)
1285                 (?:
1286                     jw_plugins|
1287                     JWPlayerOptions|
1288                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1289                 )
1290                 .*?
1291                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1292         if not found:
1293             # Broaden the search a little bit
1294             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1295         if not found:
1296             # Broaden the findall a little bit: JWPlayer JS loader
1297             found = filter_video(re.findall(
1298                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1299         if not found:
1300             # Flow player
1301             found = filter_video(re.findall(r'''(?xs)
1302                 flowplayer\("[^"]+",\s*
1303                     \{[^}]+?\}\s*,
1304                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1305                         ["']?url["']?\s*:\s*["']([^"']+)["']
1306             ''', webpage))
1307         if not found:
1308             # Cinerama player
1309             found = re.findall(
1310                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1311         if not found:
1312             # Try to find twitter cards info
1313             found = filter_video(re.findall(
1314                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1315         if not found:
1316             # We look for Open Graph info:
1317             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1318             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1319             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1320             if m_video_type is not None:
1321                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1322         if not found:
1323             # HTML5 video
1324             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1325         if not found:
1326             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1327             found = re.search(
1328                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1329                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1330                 webpage)
1331             if not found:
1332                 # Look also in Refresh HTTP header
1333                 refresh_header = head_response.headers.get('Refresh')
1334                 if refresh_header:
1335                     found = re.search(REDIRECT_REGEX, refresh_header)
1336             if found:
1337                 new_url = found.group(1)
1338                 self.report_following_redirect(new_url)
1339                 return {
1340                     '_type': 'url',
1341                     'url': new_url,
1342                 }
1343         if not found:
1344             raise UnsupportedError(url)
1345
1346         entries = []
1347         for video_url in found:
1348             video_url = compat_urlparse.urljoin(url, video_url)
1349             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1350
1351             # Sometimes, jwplayer extraction will result in a YouTube URL
1352             if YoutubeIE.suitable(video_url):
1353                 entries.append(self.url_result(video_url, 'Youtube'))
1354                 continue
1355
1356             # here's a fun little line of code for you:
1357             video_id = os.path.splitext(video_id)[0]
1358
1359             entries.append({
1360                 'id': video_id,
1361                 'url': video_url,
1362                 'uploader': video_uploader,
1363                 'title': video_title,
1364                 'age_limit': age_limit,
1365             })
1366
1367         if len(entries) == 1:
1368             return entries[0]
1369         else:
1370             for num, e in enumerate(entries, start=1):
1371                 # 'url' results don't have a title
1372                 if e.get('title') is not None:
1373                     e['title'] = '%s (%d)' % (e['title'], num)
1374             return {
1375                 '_type': 'playlist',
1376                 'entries': entries,
1377             }