[extractor/generic] Add test for xhamster embed
[youtube-dl] / youtube_dl / extractor / generic.py
1 # encoding: utf-8
2
3 from __future__ import unicode_literals
4
5 import os
6 import re
7
8 from .common import InfoExtractor
9 from .youtube import YoutubeIE
10 from ..compat import (
11     compat_urllib_parse,
12     compat_urllib_parse_unquote,
13     compat_urllib_request,
14     compat_urlparse,
15     compat_xml_parse_error,
16 )
17 from ..utils import (
18     determine_ext,
19     ExtractorError,
20     float_or_none,
21     HEADRequest,
22     is_html,
23     orderedSet,
24     parse_xml,
25     smuggle_url,
26     unescapeHTML,
27     unified_strdate,
28     unsmuggle_url,
29     UnsupportedError,
30     url_basename,
31     xpath_text,
32 )
33 from .brightcove import BrightcoveIE
34 from .nbc import NBCSportsVPlayerIE
35 from .ooyala import OoyalaIE
36 from .rutv import RUTVIE
37 from .tvc import TVCIE
38 from .sportbox import SportBoxEmbedIE
39 from .smotri import SmotriIE
40 from .condenast import CondeNastIE
41 from .udn import UDNEmbedIE
42 from .senateisvp import SenateISVPIE
43 from .bliptv import BlipTVIE
44 from .svt import SVTIE
45 from .pornhub import PornHubIE
46 from .xhamster import XHamsterEmbedIE
47 from .vimeo import VimeoIE
48 from .dailymotion import DailymotionCloudIE
49
50
51 class GenericIE(InfoExtractor):
52     IE_DESC = 'Generic downloader that works on some sites'
53     _VALID_URL = r'.*'
54     IE_NAME = 'generic'
55     _TESTS = [
56         # Direct link to a video
57         {
58             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
59             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
60             'info_dict': {
61                 'id': 'trailer',
62                 'ext': 'mp4',
63                 'title': 'trailer',
64                 'upload_date': '20100513',
65             }
66         },
67         # Direct link to media delivered compressed (until Accept-Encoding is *)
68         {
69             'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
70             'md5': '128c42e68b13950268b648275386fc74',
71             'info_dict': {
72                 'id': 'FictionJunction-Parallel_Hearts',
73                 'ext': 'flac',
74                 'title': 'FictionJunction-Parallel_Hearts',
75                 'upload_date': '20140522',
76             },
77             'expected_warnings': [
78                 'URL could be a direct video link, returning it as such.'
79             ]
80         },
81         # Direct download with broken HEAD
82         {
83             'url': 'http://ai-radio.org:8000/radio.opus',
84             'info_dict': {
85                 'id': 'radio',
86                 'ext': 'opus',
87                 'title': 'radio',
88             },
89             'params': {
90                 'skip_download': True,  # infinite live stream
91             },
92             'expected_warnings': [
93                 r'501.*Not Implemented'
94             ],
95         },
96         # Direct link with incorrect MIME type
97         {
98             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
99             'md5': '4ccbebe5f36706d85221f204d7eb5913',
100             'info_dict': {
101                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
102                 'id': '5_Lennart_Poettering_-_Systemd',
103                 'ext': 'webm',
104                 'title': '5_Lennart_Poettering_-_Systemd',
105                 'upload_date': '20141120',
106             },
107             'expected_warnings': [
108                 'URL could be a direct video link, returning it as such.'
109             ]
110         },
111         # RSS feed
112         {
113             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
114             'info_dict': {
115                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
116                 'title': 'Zero Punctuation',
117                 'description': 're:.*groundbreaking video review series.*'
118             },
119             'playlist_mincount': 11,
120         },
121         # RSS feed with enclosure
122         {
123             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
124             'info_dict': {
125                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
126                 'ext': 'm4v',
127                 'upload_date': '20150228',
128                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
129             }
130         },
131         # google redirect
132         {
133             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
134             'info_dict': {
135                 'id': 'cmQHVoWB5FY',
136                 'ext': 'mp4',
137                 'upload_date': '20130224',
138                 'uploader_id': 'TheVerge',
139                 'description': 're:^Chris Ziegler takes a look at the\.*',
140                 'uploader': 'The Verge',
141                 'title': 'First Firefox OS phones side-by-side',
142             },
143             'params': {
144                 'skip_download': False,
145             }
146         },
147         {
148             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
149             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
150             'info_dict': {
151                 'id': '13601338388002',
152                 'ext': 'mp4',
153                 'uploader': 'www.hodiho.fr',
154                 'title': 'R\u00e9gis plante sa Jeep',
155             }
156         },
157         # bandcamp page with custom domain
158         {
159             'add_ie': ['Bandcamp'],
160             'url': 'http://bronyrock.com/track/the-pony-mash',
161             'info_dict': {
162                 'id': '3235767654',
163                 'ext': 'mp3',
164                 'title': 'The Pony Mash',
165                 'uploader': 'M_Pallante',
166             },
167             'skip': 'There is a limit of 200 free downloads / month for the test song',
168         },
169         # embedded brightcove video
170         # it also tests brightcove videos that need to set the 'Referer' in the
171         # http requests
172         {
173             'add_ie': ['Brightcove'],
174             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
175             'info_dict': {
176                 'id': '2765128793001',
177                 'ext': 'mp4',
178                 'title': 'Le cours de bourse : l’analyse technique',
179                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
180                 'uploader': 'BFM BUSINESS',
181             },
182             'params': {
183                 'skip_download': True,
184             },
185         },
186         {
187             # https://github.com/rg3/youtube-dl/issues/2253
188             'url': 'http://bcove.me/i6nfkrc3',
189             'md5': '0ba9446db037002366bab3b3eb30c88c',
190             'info_dict': {
191                 'id': '3101154703001',
192                 'ext': 'mp4',
193                 'title': 'Still no power',
194                 'uploader': 'thestar.com',
195                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
196             },
197             'add_ie': ['Brightcove'],
198         },
199         {
200             'url': 'http://www.championat.com/video/football/v/87/87499.html',
201             'md5': 'fb973ecf6e4a78a67453647444222983',
202             'info_dict': {
203                 'id': '3414141473001',
204                 'ext': 'mp4',
205                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
206                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
207                 'uploader': 'Championat',
208             },
209         },
210         {
211             # https://github.com/rg3/youtube-dl/issues/3541
212             'add_ie': ['Brightcove'],
213             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
214             'info_dict': {
215                 'id': '3866516442001',
216                 'ext': 'mp4',
217                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
218                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
219                 'uploader': 'SBS Broadcasting',
220             },
221             'skip': 'Restricted to Netherlands',
222             'params': {
223                 'skip_download': True,  # m3u8 download
224             },
225         },
226         # ooyala video
227         {
228             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
229             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
230             'info_dict': {
231                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
232                 'ext': 'mp4',
233                 'title': '2cc213299525360.mov',  # that's what we get
234             },
235             'add_ie': ['Ooyala'],
236         },
237         # multiple ooyala embeds on SBN network websites
238         {
239             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
240             'info_dict': {
241                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
242                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
243             },
244             'playlist_mincount': 3,
245             'params': {
246                 'skip_download': True,
247             },
248             'add_ie': ['Ooyala'],
249         },
250         # embed.ly video
251         {
252             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
253             'info_dict': {
254                 'id': '9ODmcdjQcHQ',
255                 'ext': 'mp4',
256                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
257                 'upload_date': '20140225',
258                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
259                 'uploader': 'Tested',
260                 'uploader_id': 'testedcom',
261             },
262             # No need to test YoutubeIE here
263             'params': {
264                 'skip_download': True,
265             },
266         },
267         # funnyordie embed
268         {
269             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
270             'info_dict': {
271                 'id': '18e820ec3f',
272                 'ext': 'mp4',
273                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
274                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
275             },
276         },
277         # BBC iPlayer embeds
278         {
279             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
280             'info_dict': {
281                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
282             },
283             'playlist_mincount': 18,
284         },
285         # RUTV embed
286         {
287             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
288             'info_dict': {
289                 'id': '776940',
290                 'ext': 'mp4',
291                 'title': 'Охотское море стало целиком российским',
292                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
293             },
294             'params': {
295                 # m3u8 download
296                 'skip_download': True,
297             },
298         },
299         # TVC embed
300         {
301             'url': 'http://sch1298sz.mskobr.ru/dou_edu/karamel_ki/filial_galleries/video/iframe_src_http_tvc_ru_video_iframe_id_55304_isplay_false_acc_video_id_channel_brand_id_11_show_episodes_episode_id_32307_frameb/',
302             'info_dict': {
303                 'id': '55304',
304                 'ext': 'mp4',
305                 'title': 'Дошкольное воспитание',
306             },
307         },
308         # SportBox embed
309         {
310             'url': 'http://www.vestifinance.ru/articles/25753',
311             'info_dict': {
312                 'id': '25753',
313                 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
314             },
315             'playlist': [{
316                 'info_dict': {
317                     'id': '370908',
318                     'title': 'Госзаказ. День 3',
319                     'ext': 'mp4',
320                 }
321             }, {
322                 'info_dict': {
323                     'id': '370905',
324                     'title': 'Госзаказ. День 2',
325                     'ext': 'mp4',
326                 }
327             }, {
328                 'info_dict': {
329                     'id': '370902',
330                     'title': 'Госзаказ. День 1',
331                     'ext': 'mp4',
332                 }
333             }],
334             'params': {
335                 # m3u8 download
336                 'skip_download': True,
337             },
338         },
339         # XHamster embed
340         {
341             'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8',
342             'info_dict': {
343                 'id': 'showthread',
344                 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )',
345             },
346             'playlist_mincount': 7,
347         },
348         # Embedded TED video
349         {
350             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
351             'md5': '65fdff94098e4a607385a60c5177c638',
352             'info_dict': {
353                 'id': '1969',
354                 'ext': 'mp4',
355                 'title': 'Hidden miracles of the natural world',
356                 'uploader': 'Louie Schwartzberg',
357                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
358             }
359         },
360         # Embedded Ustream video
361         {
362             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
363             'md5': '27b99cdb639c9b12a79bca876a073417',
364             'info_dict': {
365                 'id': '45734260',
366                 'ext': 'flv',
367                 'uploader': 'AU SPA:  The NSA and Privacy',
368                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
369             }
370         },
371         # nowvideo embed hidden behind percent encoding
372         {
373             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
374             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
375             'info_dict': {
376                 'id': '06e53103ca9aa',
377                 'ext': 'flv',
378                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
379                 'description': 'No description',
380             },
381         },
382         # arte embed
383         {
384             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
385             'md5': '7653032cbb25bf6c80d80f217055fa43',
386             'info_dict': {
387                 'id': '048195-004_PLUS7-F',
388                 'ext': 'flv',
389                 'title': 'X:enius',
390                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
391                 'upload_date': '20140320',
392             },
393             'params': {
394                 'skip_download': 'Requires rtmpdump'
395             }
396         },
397         # Condé Nast embed
398         {
399             'url': 'http://www.wired.com/2014/04/honda-asimo/',
400             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
401             'info_dict': {
402                 'id': '53501be369702d3275860000',
403                 'ext': 'mp4',
404                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
405             }
406         },
407         # Dailymotion embed
408         {
409             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
410             'md5': '441aeeb82eb72c422c7f14ec533999cd',
411             'info_dict': {
412                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
413                 'ext': 'mp4',
414                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
415                 'uploader': 'Spi0n',
416             },
417             'add_ie': ['Dailymotion'],
418         },
419         # YouTube embed
420         {
421             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
422             'info_dict': {
423                 'id': 'FXRb4ykk4S0',
424                 'ext': 'mp4',
425                 'title': 'The NBL Auction 2014',
426                 'uploader': 'BADMINTON England',
427                 'uploader_id': 'BADMINTONEvents',
428                 'upload_date': '20140603',
429                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
430             },
431             'add_ie': ['Youtube'],
432             'params': {
433                 'skip_download': True,
434             }
435         },
436         # MTVServices embed
437         {
438             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
439             'md5': '35727f82f58c76d996fc188f9755b0d5',
440             'info_dict': {
441                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
442                 'ext': 'mp4',
443                 'title': 'Review',
444                 'description': 'Mario\'s life in the fast lane has never looked so good.',
445             },
446         },
447         # YouTube embed via <data-embed-url="">
448         {
449             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
450             'info_dict': {
451                 'id': '4vAffPZIT44',
452                 'ext': 'mp4',
453                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
454                 'uploader': 'Gameloft',
455                 'uploader_id': 'gameloft',
456                 'upload_date': '20140828',
457                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
458             },
459             'params': {
460                 'skip_download': True,
461             }
462         },
463         # Camtasia studio
464         {
465             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
466             'playlist': [{
467                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
468                 'info_dict': {
469                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
470                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
471                     'ext': 'flv',
472                     'duration': 2235.90,
473                 }
474             }, {
475                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
476                 'info_dict': {
477                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
478                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
479                     'ext': 'flv',
480                     'duration': 2235.93,
481                 }
482             }],
483             'info_dict': {
484                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
485             }
486         },
487         # Flowplayer
488         {
489             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
490             'md5': '9d65602bf31c6e20014319c7d07fba27',
491             'info_dict': {
492                 'id': '5123ea6d5e5a7',
493                 'ext': 'mp4',
494                 'age_limit': 18,
495                 'uploader': 'www.handjobhub.com',
496                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
497             }
498         },
499         # Multiple brightcove videos
500         # https://github.com/rg3/youtube-dl/issues/2283
501         {
502             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
503             'info_dict': {
504                 'id': 'always-never',
505                 'title': 'Always / Never - The New Yorker',
506             },
507             'playlist_count': 3,
508             'params': {
509                 'extract_flat': False,
510                 'skip_download': True,
511             }
512         },
513         # MLB embed
514         {
515             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
516             'md5': '96f09a37e44da40dd083e12d9a683327',
517             'info_dict': {
518                 'id': '33322633',
519                 'ext': 'mp4',
520                 'title': 'Ump changes call to ball',
521                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
522                 'duration': 48,
523                 'timestamp': 1401537900,
524                 'upload_date': '20140531',
525                 'thumbnail': 're:^https?://.*\.jpg$',
526             },
527         },
528         # Wistia embed
529         {
530             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
531             'md5': '8788b683c777a5cf25621eaf286d0c23',
532             'info_dict': {
533                 'id': '1cfaf6b7ea',
534                 'ext': 'mov',
535                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
536                 'duration': 643.0,
537                 'filesize': 182808282,
538                 'uploader': 'education-portal.com',
539             },
540         },
541         {
542             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
543             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
544             'info_dict': {
545                 'id': 'uxjb0lwrcz',
546                 'ext': 'mp4',
547                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
548                 'duration': 1715.0,
549                 'uploader': 'thoughtworks.wistia.com',
550             },
551         },
552         # Soundcloud embed
553         {
554             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
555             'info_dict': {
556                 'id': '174391317',
557                 'ext': 'mp3',
558                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
559                 'uploader': 'Sophos Security',
560                 'title': 'Chet Chat 171 - Oct 29, 2014',
561                 'upload_date': '20141029',
562             }
563         },
564         # Livestream embed
565         {
566             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
567             'info_dict': {
568                 'id': '67864563',
569                 'ext': 'flv',
570                 'upload_date': '20141112',
571                 'title': 'Rosetta #CometLanding webcast HL 10',
572             }
573         },
574         # LazyYT
575         {
576             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
577             'info_dict': {
578                 'id': '1986',
579                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
580             },
581             'playlist_mincount': 2,
582         },
583         # Cinchcast embed
584         {
585             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
586             'info_dict': {
587                 'id': '7141703',
588                 'ext': 'mp3',
589                 'upload_date': '20141126',
590                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
591             }
592         },
593         # Cinerama player
594         {
595             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
596             'info_dict': {
597                 'id': '730m_DandD_1901_512k',
598                 'ext': 'mp4',
599                 'uploader': 'www.abc.net.au',
600                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
601             }
602         },
603         # embedded viddler video
604         {
605             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
606             'info_dict': {
607                 'id': '4d03aad9',
608                 'ext': 'mp4',
609                 'uploader': 'deadspin',
610                 'title': 'WALL-TO-GORTAT',
611                 'timestamp': 1422285291,
612                 'upload_date': '20150126',
613             },
614             'add_ie': ['Viddler'],
615         },
616         # Libsyn embed
617         {
618             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
619             'info_dict': {
620                 'id': '3377616',
621                 'ext': 'mp3',
622                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
623                 'description': 'md5:601cb790edd05908957dae8aaa866465',
624                 'upload_date': '20150220',
625             },
626         },
627         # jwplayer YouTube
628         {
629             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
630             'info_dict': {
631                 'id': 'Mrj4DVp2zeA',
632                 'ext': 'mp4',
633                 'upload_date': '20150212',
634                 'uploader': 'The National Archives UK',
635                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
636                 'uploader_id': 'NationalArchives08',
637                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
638             },
639         },
640         # rtl.nl embed
641         {
642             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
643             'playlist_mincount': 5,
644             'info_dict': {
645                 'id': 'aanslagen-kopenhagen',
646                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
647             }
648         },
649         # Zapiks embed
650         {
651             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
652             'info_dict': {
653                 'id': '118046',
654                 'ext': 'mp4',
655                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
656             }
657         },
658         # Kaltura embed
659         {
660             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
661             'info_dict': {
662                 'id': '1_eergr3h1',
663                 'ext': 'mp4',
664                 'upload_date': '20150226',
665                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
666                 'timestamp': int,
667                 'title': 'John Carlson Postgame 2/25/15',
668             },
669         },
670         # Eagle.Platform embed (generic URL)
671         {
672             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
673             'info_dict': {
674                 'id': '227304',
675                 'ext': 'mp4',
676                 'title': 'Навальный вышел на свободу',
677                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
678                 'thumbnail': 're:^https?://.*\.jpg$',
679                 'duration': 87,
680                 'view_count': int,
681                 'age_limit': 0,
682             },
683         },
684         # ClipYou (Eagle.Platform) embed (custom URL)
685         {
686             'url': 'http://muz-tv.ru/play/7129/',
687             'info_dict': {
688                 'id': '12820',
689                 'ext': 'mp4',
690                 'title': "'O Sole Mio",
691                 'thumbnail': 're:^https?://.*\.jpg$',
692                 'duration': 216,
693                 'view_count': int,
694             },
695         },
696         # Pladform embed
697         {
698             'url': 'http://muz-tv.ru/kinozal/view/7400/',
699             'info_dict': {
700                 'id': '100183293',
701                 'ext': 'mp4',
702                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
703                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
704                 'thumbnail': 're:^https?://.*\.jpg$',
705                 'duration': 694,
706                 'age_limit': 0,
707             },
708         },
709         # Playwire embed
710         {
711             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
712             'info_dict': {
713                 'id': '3519514',
714                 'ext': 'mp4',
715                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
716                 'thumbnail': 're:^https?://.*\.png$',
717                 'duration': 45.115,
718             },
719         },
720         # 5min embed
721         {
722             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
723             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
724             'info_dict': {
725                 'id': '518726732',
726                 'ext': 'mp4',
727                 'title': 'Facebook Creates "On This Day" | Crunch Report',
728             },
729         },
730         # SVT embed
731         {
732             'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
733             'info_dict': {
734                 'id': '2900353',
735                 'ext': 'flv',
736                 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
737                 'duration': 27,
738                 'age_limit': 0,
739             },
740         },
741         # Crooks and Liars embed
742         {
743             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
744             'info_dict': {
745                 'id': '8RUoRhRi',
746                 'ext': 'mp4',
747                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
748                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
749                 'timestamp': 1428207000,
750                 'upload_date': '20150405',
751                 'uploader': 'Heather',
752             },
753         },
754         # Crooks and Liars external embed
755         {
756             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
757             'info_dict': {
758                 'id': 'MTE3MjUtMzQ2MzA',
759                 'ext': 'mp4',
760                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
761                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
762                 'timestamp': 1265032391,
763                 'upload_date': '20100201',
764                 'uploader': 'Heather',
765             },
766         },
767         # NBC Sports vplayer embed
768         {
769             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
770             'info_dict': {
771                 'id': 'ln7x1qSThw4k',
772                 'ext': 'flv',
773                 'title': "PFT Live: New leader in the 'new-look' defense",
774                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
775             },
776         },
777         # UDN embed
778         {
779             'url': 'http://www.udn.com/news/story/7314/822787',
780             'md5': 'fd2060e988c326991037b9aff9df21a6',
781             'info_dict': {
782                 'id': '300346',
783                 'ext': 'mp4',
784                 'title': '中一中男師變性 全校師生力挺',
785                 'thumbnail': 're:^https?://.*\.jpg$',
786             }
787         },
788         # Ooyala embed
789         {
790             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
791             'info_dict': {
792                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
793                 'ext': 'mp4',
794                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
795                 'title': 'This is what separates the Excel masters from the wannabes',
796             },
797             'params': {
798                 # m3u8 downloads
799                 'skip_download': True,
800             }
801         },
802         # Contains a SMIL manifest
803         {
804             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
805             'info_dict': {
806                 'id': 'file',
807                 'ext': 'flv',
808                 'title': '+ Football: Lottery Champions League Europe',
809                 'uploader': 'www.telewebion.com',
810             },
811             'params': {
812                 # rtmpe downloads
813                 'skip_download': True,
814             }
815         },
816         # Brightcove URL in single quotes
817         {
818             'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
819             'md5': '4ae374f1f8b91c889c4b9203c8c752af',
820             'info_dict': {
821                 'id': '4255764656001',
822                 'ext': 'mp4',
823                 'title': 'SN Presents: Russell Martin, World Citizen',
824                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
825                 'uploader': 'Rogers Sportsnet',
826             },
827         },
828         # Dailymotion Cloud video
829         {
830             'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
831             'md5': '49444254273501a64675a7e68c502681',
832             'info_dict': {
833                 'id': '5585de919473990de4bee11b',
834                 'ext': 'mp4',
835                 'title': 'Le débat',
836                 'thumbnail': 're:^https?://.*\.jpe?g$',
837             }
838         }
839     ]
840
841     def report_following_redirect(self, new_url):
842         """Report information extraction."""
843         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
844
845     def _extract_rss(self, url, video_id, doc):
846         playlist_title = doc.find('./channel/title').text
847         playlist_desc_el = doc.find('./channel/description')
848         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
849
850         entries = []
851         for it in doc.findall('./channel/item'):
852             next_url = xpath_text(it, 'link', fatal=False)
853             if not next_url:
854                 enclosure_nodes = it.findall('./enclosure')
855                 for e in enclosure_nodes:
856                     next_url = e.attrib.get('url')
857                     if next_url:
858                         break
859
860             if not next_url:
861                 continue
862
863             entries.append({
864                 '_type': 'url',
865                 'url': next_url,
866                 'title': it.find('title').text,
867             })
868
869         return {
870             '_type': 'playlist',
871             'id': url,
872             'title': playlist_title,
873             'description': playlist_desc,
874             'entries': entries,
875         }
876
877     def _extract_camtasia(self, url, video_id, webpage):
878         """ Returns None if no camtasia video can be found. """
879
880         camtasia_cfg = self._search_regex(
881             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
882             webpage, 'camtasia configuration file', default=None)
883         if camtasia_cfg is None:
884             return None
885
886         title = self._html_search_meta('DC.title', webpage, fatal=True)
887
888         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
889         camtasia_cfg = self._download_xml(
890             camtasia_url, video_id,
891             note='Downloading camtasia configuration',
892             errnote='Failed to download camtasia configuration')
893         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
894
895         entries = []
896         for n in fileset_node.getchildren():
897             url_n = n.find('./uri')
898             if url_n is None:
899                 continue
900
901             entries.append({
902                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
903                 'title': '%s - %s' % (title, n.tag),
904                 'url': compat_urlparse.urljoin(url, url_n.text),
905                 'duration': float_or_none(n.find('./duration').text),
906             })
907
908         return {
909             '_type': 'playlist',
910             'entries': entries,
911             'title': title,
912         }
913
914     def _real_extract(self, url):
915         if url.startswith('//'):
916             return {
917                 '_type': 'url',
918                 'url': self.http_scheme() + url,
919             }
920
921         parsed_url = compat_urlparse.urlparse(url)
922         if not parsed_url.scheme:
923             default_search = self._downloader.params.get('default_search')
924             if default_search is None:
925                 default_search = 'fixup_error'
926
927             if default_search in ('auto', 'auto_warning', 'fixup_error'):
928                 if '/' in url:
929                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
930                     return self.url_result('http://' + url)
931                 elif default_search != 'fixup_error':
932                     if default_search == 'auto_warning':
933                         if re.match(r'^(?:url|URL)$', url):
934                             raise ExtractorError(
935                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
936                                 expected=True)
937                         else:
938                             self._downloader.report_warning(
939                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
940                     return self.url_result('ytsearch:' + url)
941
942             if default_search in ('error', 'fixup_error'):
943                 raise ExtractorError(
944                     '%r is not a valid URL. '
945                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
946                     % (url, url), expected=True)
947             else:
948                 if ':' not in default_search:
949                     default_search += ':'
950                 return self.url_result(default_search + url)
951
952         url, smuggled_data = unsmuggle_url(url)
953         force_videoid = None
954         is_intentional = smuggled_data and smuggled_data.get('to_generic')
955         if smuggled_data and 'force_videoid' in smuggled_data:
956             force_videoid = smuggled_data['force_videoid']
957             video_id = force_videoid
958         else:
959             video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
960
961         self.to_screen('%s: Requesting header' % video_id)
962
963         head_req = HEADRequest(url)
964         head_response = self._request_webpage(
965             head_req, video_id,
966             note=False, errnote='Could not send HEAD request to %s' % url,
967             fatal=False)
968
969         if head_response is not False:
970             # Check for redirect
971             new_url = head_response.geturl()
972             if url != new_url:
973                 self.report_following_redirect(new_url)
974                 if force_videoid:
975                     new_url = smuggle_url(
976                         new_url, {'force_videoid': force_videoid})
977                 return self.url_result(new_url)
978
979         full_response = None
980         if head_response is False:
981             request = compat_urllib_request.Request(url)
982             request.add_header('Accept-Encoding', '*')
983             full_response = self._request_webpage(request, video_id)
984             head_response = full_response
985
986         # Check for direct link to a video
987         content_type = head_response.headers.get('Content-Type', '')
988         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
989         if m:
990             upload_date = unified_strdate(
991                 head_response.headers.get('Last-Modified'))
992             return {
993                 'id': video_id,
994                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
995                 'direct': True,
996                 'formats': [{
997                     'format_id': m.group('format_id'),
998                     'url': url,
999                     'vcodec': 'none' if m.group('type') == 'audio' else None
1000                 }],
1001                 'upload_date': upload_date,
1002             }
1003
1004         if not self._downloader.params.get('test', False) and not is_intentional:
1005             self._downloader.report_warning('Falling back on generic information extractor.')
1006
1007         if not full_response:
1008             request = compat_urllib_request.Request(url)
1009             # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
1010             # making it impossible to download only chunk of the file (yet we need only 512kB to
1011             # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
1012             # that will always result in downloading the whole file that is not desirable.
1013             # Therefore for extraction pass we have to override Accept-Encoding to any in order
1014             # to accept raw bytes and being able to download only a chunk.
1015             # It may probably better to solve this by checking Content-Type for application/octet-stream
1016             # after HEAD request finishes, but not sure if we can rely on this.
1017             request.add_header('Accept-Encoding', '*')
1018             full_response = self._request_webpage(request, video_id)
1019
1020         # Maybe it's a direct link to a video?
1021         # Be careful not to download the whole thing!
1022         first_bytes = full_response.read(512)
1023         if not is_html(first_bytes):
1024             self._downloader.report_warning(
1025                 'URL could be a direct video link, returning it as such.')
1026             upload_date = unified_strdate(
1027                 head_response.headers.get('Last-Modified'))
1028             return {
1029                 'id': video_id,
1030                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
1031                 'direct': True,
1032                 'url': url,
1033                 'upload_date': upload_date,
1034             }
1035
1036         webpage = self._webpage_read_content(
1037             full_response, url, video_id, prefix=first_bytes)
1038
1039         self.report_extraction(video_id)
1040
1041         # Is it an RSS feed?
1042         try:
1043             doc = parse_xml(webpage)
1044             if doc.tag == 'rss':
1045                 return self._extract_rss(url, video_id, doc)
1046         except compat_xml_parse_error:
1047             pass
1048
1049         # Is it a Camtasia project?
1050         camtasia_res = self._extract_camtasia(url, video_id, webpage)
1051         if camtasia_res is not None:
1052             return camtasia_res
1053
1054         # Sometimes embedded video player is hidden behind percent encoding
1055         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1056         # Unescaping the whole page allows to handle those cases in a generic way
1057         webpage = compat_urllib_parse.unquote(webpage)
1058
1059         # it's tempting to parse this further, but you would
1060         # have to take into account all the variations like
1061         #   Video Title - Site Name
1062         #   Site Name | Video Title
1063         #   Video Title - Tagline | Site Name
1064         # and so on and so forth; it's just not practical
1065         video_title = self._html_search_regex(
1066             r'(?s)<title>(.*?)</title>', webpage, 'video title',
1067             default='video')
1068
1069         # Try to detect age limit automatically
1070         age_limit = self._rta_search(webpage)
1071         # And then there are the jokers who advertise that they use RTA,
1072         # but actually don't.
1073         AGE_LIMIT_MARKERS = [
1074             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1075         ]
1076         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1077             age_limit = 18
1078
1079         # video uploader is domain name
1080         video_uploader = self._search_regex(
1081             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1082
1083         # Helper method
1084         def _playlist_from_matches(matches, getter=None, ie=None):
1085             urlrs = orderedSet(
1086                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1087                 for m in matches)
1088             return self.playlist_result(
1089                 urlrs, playlist_id=video_id, playlist_title=video_title)
1090
1091         # Look for BrightCove:
1092         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1093         if bc_urls:
1094             self.to_screen('Brightcove video detected.')
1095             entries = [{
1096                 '_type': 'url',
1097                 'url': smuggle_url(bc_url, {'Referer': url}),
1098                 'ie_key': 'Brightcove'
1099             } for bc_url in bc_urls]
1100
1101             return {
1102                 '_type': 'playlist',
1103                 'title': video_title,
1104                 'id': video_id,
1105                 'entries': entries,
1106             }
1107
1108         # Look for embedded rtl.nl player
1109         matches = re.findall(
1110             r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1111             webpage)
1112         if matches:
1113             return _playlist_from_matches(matches, ie='RtlNl')
1114
1115         vimeo_url = VimeoIE._extract_vimeo_url(url, webpage)
1116         if vimeo_url is not None:
1117             return self.url_result(vimeo_url)
1118
1119         # Look for embedded YouTube player
1120         matches = re.findall(r'''(?x)
1121             (?:
1122                 <iframe[^>]+?src=|
1123                 data-video-url=|
1124                 <embed[^>]+?src=|
1125                 embedSWF\(?:\s*|
1126                 new\s+SWFObject\(
1127             )
1128             (["\'])
1129                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1130                 (?:embed|v|p)/.+?)
1131             \1''', webpage)
1132         if matches:
1133             return _playlist_from_matches(
1134                 matches, lambda m: unescapeHTML(m[1]))
1135
1136         # Look for lazyYT YouTube embed
1137         matches = re.findall(
1138             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1139         if matches:
1140             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1141
1142         # Look for embedded Dailymotion player
1143         matches = re.findall(
1144             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1145         if matches:
1146             return _playlist_from_matches(
1147                 matches, lambda m: unescapeHTML(m[1]))
1148
1149         # Look for embedded Dailymotion playlist player (#3822)
1150         m = re.search(
1151             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1152         if m:
1153             playlists = re.findall(
1154                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1155             if playlists:
1156                 return _playlist_from_matches(
1157                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1158
1159         # Look for embedded Wistia player
1160         match = re.search(
1161             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1162         if match:
1163             embed_url = self._proto_relative_url(
1164                 unescapeHTML(match.group('url')))
1165             return {
1166                 '_type': 'url_transparent',
1167                 'url': embed_url,
1168                 'ie_key': 'Wistia',
1169                 'uploader': video_uploader,
1170                 'title': video_title,
1171                 'id': video_id,
1172             }
1173
1174         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1175         if match:
1176             return {
1177                 '_type': 'url_transparent',
1178                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1179                 'ie_key': 'Wistia',
1180                 'uploader': video_uploader,
1181                 'title': video_title,
1182                 'id': match.group('id')
1183             }
1184
1185         # Look for embedded blip.tv player
1186         bliptv_url = BlipTVIE._extract_url(webpage)
1187         if bliptv_url:
1188             return self.url_result(bliptv_url, 'BlipTV')
1189
1190         # Look for SVT player
1191         svt_url = SVTIE._extract_url(webpage)
1192         if svt_url:
1193             return self.url_result(svt_url, 'SVT')
1194
1195         # Look for embedded condenast player
1196         matches = re.findall(
1197             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1198             webpage)
1199         if matches:
1200             return {
1201                 '_type': 'playlist',
1202                 'entries': [{
1203                     '_type': 'url',
1204                     'ie_key': 'CondeNast',
1205                     'url': ma,
1206                 } for ma in matches],
1207                 'title': video_title,
1208                 'id': video_id,
1209             }
1210
1211         # Look for Bandcamp pages with custom domain
1212         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1213         if mobj is not None:
1214             burl = unescapeHTML(mobj.group(1))
1215             # Don't set the extractor because it can be a track url or an album
1216             return self.url_result(burl)
1217
1218         # Look for embedded Vevo player
1219         mobj = re.search(
1220             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1221         if mobj is not None:
1222             return self.url_result(mobj.group('url'))
1223
1224         # Look for embedded Viddler player
1225         mobj = re.search(
1226             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1227             webpage)
1228         if mobj is not None:
1229             return self.url_result(mobj.group('url'))
1230
1231         # Look for NYTimes player
1232         mobj = re.search(
1233             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1234             webpage)
1235         if mobj is not None:
1236             return self.url_result(mobj.group('url'))
1237
1238         # Look for Libsyn player
1239         mobj = re.search(
1240             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1241         if mobj is not None:
1242             return self.url_result(mobj.group('url'))
1243
1244         # Look for Ooyala videos
1245         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1246                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1247                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1248                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1249         if mobj is not None:
1250             return OoyalaIE._build_url_result(mobj.group('ec'))
1251
1252         # Look for multiple Ooyala embeds on SBN network websites
1253         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1254         if mobj is not None:
1255             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1256             if embeds:
1257                 return _playlist_from_matches(
1258                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1259
1260         # Look for Aparat videos
1261         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1262         if mobj is not None:
1263             return self.url_result(mobj.group(1), 'Aparat')
1264
1265         # Look for MPORA videos
1266         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1267         if mobj is not None:
1268             return self.url_result(mobj.group(1), 'Mpora')
1269
1270         # Look for embedded NovaMov-based player
1271         mobj = re.search(
1272             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1273                     (?P<url>http://(?:(?:embed|www)\.)?
1274                         (?:novamov\.com|
1275                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1276                            videoweed\.(?:es|com)|
1277                            movshare\.(?:net|sx|ag)|
1278                            divxstage\.(?:eu|net|ch|co|at|ag))
1279                         /embed\.php.+?)\1''', webpage)
1280         if mobj is not None:
1281             return self.url_result(mobj.group('url'))
1282
1283         # Look for embedded Facebook player
1284         mobj = re.search(
1285             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1286         if mobj is not None:
1287             return self.url_result(mobj.group('url'), 'Facebook')
1288
1289         # Look for embedded VK player
1290         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1291         if mobj is not None:
1292             return self.url_result(mobj.group('url'), 'VK')
1293
1294         # Look for embedded ivi player
1295         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1296         if mobj is not None:
1297             return self.url_result(mobj.group('url'), 'Ivi')
1298
1299         # Look for embedded Huffington Post player
1300         mobj = re.search(
1301             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1302         if mobj is not None:
1303             return self.url_result(mobj.group('url'), 'HuffPost')
1304
1305         # Look for embed.ly
1306         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1307         if mobj is not None:
1308             return self.url_result(mobj.group('url'))
1309         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1310         if mobj is not None:
1311             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1312
1313         # Look for funnyordie embed
1314         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1315         if matches:
1316             return _playlist_from_matches(
1317                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1318
1319         # Look for BBC iPlayer embed
1320         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1321         if matches:
1322             return _playlist_from_matches(matches, ie='BBCCoUk')
1323
1324         # Look for embedded RUTV player
1325         rutv_url = RUTVIE._extract_url(webpage)
1326         if rutv_url:
1327             return self.url_result(rutv_url, 'RUTV')
1328
1329         # Look for embedded TVC player
1330         tvc_url = TVCIE._extract_url(webpage)
1331         if tvc_url:
1332             return self.url_result(tvc_url, 'TVC')
1333
1334         # Look for embedded SportBox player
1335         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1336         if sportbox_urls:
1337             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1338
1339         # Look for embedded PornHub player
1340         pornhub_url = PornHubIE._extract_url(webpage)
1341         if pornhub_url:
1342             return self.url_result(pornhub_url, 'PornHub')
1343
1344         # Look for embedded XHamster player
1345         xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)
1346         if xhamster_urls:
1347             return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed')
1348
1349         # Look for embedded Tvigle player
1350         mobj = re.search(
1351             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage)
1352         if mobj is not None:
1353             return self.url_result(mobj.group('url'), 'Tvigle')
1354
1355         # Look for embedded TED player
1356         mobj = re.search(
1357             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1358         if mobj is not None:
1359             return self.url_result(mobj.group('url'), 'TED')
1360
1361         # Look for embedded Ustream videos
1362         mobj = re.search(
1363             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1364         if mobj is not None:
1365             return self.url_result(mobj.group('url'), 'Ustream')
1366
1367         # Look for embedded arte.tv player
1368         mobj = re.search(
1369             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1370             webpage)
1371         if mobj is not None:
1372             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1373
1374         # Look for embedded smotri.com player
1375         smotri_url = SmotriIE._extract_url(webpage)
1376         if smotri_url:
1377             return self.url_result(smotri_url, 'Smotri')
1378
1379         # Look for embeded soundcloud player
1380         mobj = re.search(
1381             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1382             webpage)
1383         if mobj is not None:
1384             url = unescapeHTML(mobj.group('url'))
1385             return self.url_result(url)
1386
1387         # Look for embedded vulture.com player
1388         mobj = re.search(
1389             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1390             webpage)
1391         if mobj is not None:
1392             url = unescapeHTML(mobj.group('url'))
1393             return self.url_result(url, ie='Vulture')
1394
1395         # Look for embedded mtvservices player
1396         mobj = re.search(
1397             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1398             webpage)
1399         if mobj is not None:
1400             url = unescapeHTML(mobj.group('url'))
1401             return self.url_result(url, ie='MTVServicesEmbedded')
1402
1403         # Look for embedded yahoo player
1404         mobj = re.search(
1405             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1406             webpage)
1407         if mobj is not None:
1408             return self.url_result(mobj.group('url'), 'Yahoo')
1409
1410         # Look for embedded sbs.com.au player
1411         mobj = re.search(
1412             r'''(?x)
1413             (?:
1414                 <meta\s+property="og:video"\s+content=|
1415                 <iframe[^>]+?src=
1416             )
1417             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1418             webpage)
1419         if mobj is not None:
1420             return self.url_result(mobj.group('url'), 'SBS')
1421
1422         # Look for embedded Cinchcast player
1423         mobj = re.search(
1424             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1425             webpage)
1426         if mobj is not None:
1427             return self.url_result(mobj.group('url'), 'Cinchcast')
1428
1429         mobj = re.search(
1430             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1431             webpage)
1432         if not mobj:
1433             mobj = re.search(
1434                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1435                 webpage)
1436         if mobj is not None:
1437             return self.url_result(mobj.group('url'), 'MLB')
1438
1439         mobj = re.search(
1440             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1441             webpage)
1442         if mobj is not None:
1443             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1444
1445         mobj = re.search(
1446             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1447             webpage)
1448         if mobj is not None:
1449             return self.url_result(mobj.group('url'), 'Livestream')
1450
1451         # Look for Zapiks embed
1452         mobj = re.search(
1453             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1454         if mobj is not None:
1455             return self.url_result(mobj.group('url'), 'Zapiks')
1456
1457         # Look for Kaltura embeds
1458         mobj = re.search(
1459             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1460         if mobj is not None:
1461             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1462
1463         # Look for Eagle.Platform embeds
1464         mobj = re.search(
1465             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1466         if mobj is not None:
1467             return self.url_result(mobj.group('url'), 'EaglePlatform')
1468
1469         # Look for ClipYou (uses Eagle.Platform) embeds
1470         mobj = re.search(
1471             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1472         if mobj is not None:
1473             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1474
1475         # Look for Pladform embeds
1476         mobj = re.search(
1477             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1478         if mobj is not None:
1479             return self.url_result(mobj.group('url'), 'Pladform')
1480
1481         # Look for Playwire embeds
1482         mobj = re.search(
1483             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1484         if mobj is not None:
1485             return self.url_result(mobj.group('url'))
1486
1487         # Look for 5min embeds
1488         mobj = re.search(
1489             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1490         if mobj is not None:
1491             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1492
1493         # Look for Crooks and Liars embeds
1494         mobj = re.search(
1495             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1496         if mobj is not None:
1497             return self.url_result(mobj.group('url'))
1498
1499         # Look for NBC Sports VPlayer embeds
1500         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1501         if nbc_sports_url:
1502             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1503
1504         # Look for UDN embeds
1505         mobj = re.search(
1506             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1507         if mobj is not None:
1508             return self.url_result(
1509                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1510
1511         # Look for Senate ISVP iframe
1512         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1513         if senate_isvp_url:
1514             return self.url_result(senate_isvp_url, 'SenateISVP')
1515
1516         # Look for Dailymotion Cloud videos
1517         dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
1518         if dmcloud_url:
1519             return self.url_result(dmcloud_url, 'DailymotionCloud')
1520
1521         def check_video(vurl):
1522             if YoutubeIE.suitable(vurl):
1523                 return True
1524             vpath = compat_urlparse.urlparse(vurl).path
1525             vext = determine_ext(vpath)
1526             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1527
1528         def filter_video(urls):
1529             return list(filter(check_video, urls))
1530
1531         # Start with something easy: JW Player in SWFObject
1532         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1533         if not found:
1534             # Look for gorilla-vid style embedding
1535             found = filter_video(re.findall(r'''(?sx)
1536                 (?:
1537                     jw_plugins|
1538                     JWPlayerOptions|
1539                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1540                 )
1541                 .*?
1542                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1543         if not found:
1544             # Broaden the search a little bit
1545             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1546         if not found:
1547             # Broaden the findall a little bit: JWPlayer JS loader
1548             found = filter_video(re.findall(
1549                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1550         if not found:
1551             # Flow player
1552             found = filter_video(re.findall(r'''(?xs)
1553                 flowplayer\("[^"]+",\s*
1554                     \{[^}]+?\}\s*,
1555                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1556                         ["']?url["']?\s*:\s*["']([^"']+)["']
1557             ''', webpage))
1558         if not found:
1559             # Cinerama player
1560             found = re.findall(
1561                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1562         if not found:
1563             # Try to find twitter cards info
1564             found = filter_video(re.findall(
1565                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1566         if not found:
1567             # We look for Open Graph info:
1568             # We have to match any number of spaces between elements, since some sites try to align them (e.g. statigr.am)
1569             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1570             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1571             if m_video_type is not None:
1572                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1573         if not found:
1574             # HTML5 video
1575             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1576         if not found:
1577             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1578             found = re.search(
1579                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1580                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1581                 webpage)
1582             if not found:
1583                 # Look also in Refresh HTTP header
1584                 refresh_header = head_response.headers.get('Refresh')
1585                 if refresh_header:
1586                     found = re.search(REDIRECT_REGEX, refresh_header)
1587             if found:
1588                 new_url = compat_urlparse.urljoin(url, found.group(1))
1589                 self.report_following_redirect(new_url)
1590                 return {
1591                     '_type': 'url',
1592                     'url': new_url,
1593                 }
1594         if not found:
1595             raise UnsupportedError(url)
1596
1597         entries = []
1598         for video_url in found:
1599             video_url = compat_urlparse.urljoin(url, video_url)
1600             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1601
1602             # Sometimes, jwplayer extraction will result in a YouTube URL
1603             if YoutubeIE.suitable(video_url):
1604                 entries.append(self.url_result(video_url, 'Youtube'))
1605                 continue
1606
1607             # here's a fun little line of code for you:
1608             video_id = os.path.splitext(video_id)[0]
1609
1610             if determine_ext(video_url) == 'smil':
1611                 entries.append({
1612                     'id': video_id,
1613                     'formats': self._extract_smil_formats(video_url, video_id),
1614                     'uploader': video_uploader,
1615                     'title': video_title,
1616                     'age_limit': age_limit,
1617                 })
1618             else:
1619                 entries.append({
1620                     'id': video_id,
1621                     'url': video_url,
1622                     'uploader': video_uploader,
1623                     'title': video_title,
1624                     'age_limit': age_limit,
1625                 })
1626
1627         if len(entries) == 1:
1628             return entries[0]
1629         else:
1630             for num, e in enumerate(entries, start=1):
1631                 # 'url' results don't have a title
1632                 if e.get('title') is not None:
1633                     e['title'] = '%s (%d)' % (e['title'], num)
1634             return {
1635                 '_type': 'playlist',
1636                 'entries': entries,
1637             }