_ Git - youtube-dl/blob - youtube_dl/extractor/generic.py

   1 # encoding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 import os
   6 import re
   7
   8 from .common import InfoExtractor
   9 from .youtube import YoutubeIE
  10 from ..compat import (
  11     compat_urllib_parse,
  12     compat_urlparse,
  13     compat_xml_parse_error,
  14 )
  15 from ..utils import (
  16     determine_ext,
  17     ExtractorError,
  18     float_or_none,
  19     HEADRequest,
  20     is_html,
  21     orderedSet,
  22     parse_xml,
  23     smuggle_url,
  24     unescapeHTML,
  25     unified_strdate,
  26     unsmuggle_url,
  27     UnsupportedError,
  28     url_basename,
  29     xpath_text,
  30 )
  31 from .brightcove import BrightcoveIE
  32 from .nbc import NBCSportsVPlayerIE
  33 from .ooyala import OoyalaIE
  34 from .rutv import RUTVIE
  35 from .sportbox import SportBoxEmbedIE
  36 from .smotri import SmotriIE
  37 from .condenast import CondeNastIE
  38 from .udn import UDNEmbedIE
  39 from .senateisvp import SenateISVPIE
  40 from .bliptv import BlipTVIE
  41 from .svt import SVTIE
  42
  43
  44 class GenericIE(InfoExtractor):
  45     IE_DESC = 'Generic downloader that works on some sites'
  46     _VALID_URL = r'.*'
  47     IE_NAME = 'generic'
  48     _TESTS = [
  49         {
  50             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
  51             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
  52             'info_dict': {
  53                 'id': '13601338388002',
  54                 'ext': 'mp4',
  55                 'uploader': 'www.hodiho.fr',
  56                 'title': 'R\u00e9gis plante sa Jeep',
  57             }
  58         },
  59         # bandcamp page with custom domain
  60         {
  61             'add_ie': ['Bandcamp'],
  62             'url': 'http://bronyrock.com/track/the-pony-mash',
  63             'info_dict': {
  64                 'id': '3235767654',
  65                 'ext': 'mp3',
  66                 'title': 'The Pony Mash',
  67                 'uploader': 'M_Pallante',
  68             },
  69             'skip': 'There is a limit of 200 free downloads / month for the test song',
  70         },
  71         # embedded brightcove video
  72         # it also tests brightcove videos that need to set the 'Referer' in the
  73         # http requests
  74         {
  75             'add_ie': ['Brightcove'],
  76             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
  77             'info_dict': {
  78                 'id': '2765128793001',
  79                 'ext': 'mp4',
  80                 'title': 'Le cours de bourse : l’analyse technique',
  81                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
  82                 'uploader': 'BFM BUSINESS',
  83             },
  84             'params': {
  85                 'skip_download': True,
  86             },
  87         },
  88         {
  89             # https://github.com/rg3/youtube-dl/issues/2253
  90             'url': 'http://bcove.me/i6nfkrc3',
  91             'md5': '0ba9446db037002366bab3b3eb30c88c',
  92             'info_dict': {
  93                 'id': '3101154703001',
  94                 'ext': 'mp4',
  95                 'title': 'Still no power',
  96                 'uploader': 'thestar.com',
  97                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
  98             },
  99             'add_ie': ['Brightcove'],
 100         },
 101         {
 102             'url': 'http://www.championat.com/video/football/v/87/87499.html',
 103             'md5': 'fb973ecf6e4a78a67453647444222983',
 104             'info_dict': {
 105                 'id': '3414141473001',
 106                 'ext': 'mp4',
 107                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
 108                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
 109                 'uploader': 'Championat',
 110             },
 111         },
 112         {
 113             # https://github.com/rg3/youtube-dl/issues/3541
 114             'add_ie': ['Brightcove'],
 115             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
 116             'info_dict': {
 117                 'id': '3866516442001',
 118                 'ext': 'mp4',
 119                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
 120                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
 121                 'uploader': 'SBS Broadcasting',
 122             },
 123             'skip': 'Restricted to Netherlands',
 124             'params': {
 125                 'skip_download': True,  # m3u8 download
 126             },
 127         },
 128         # Direct link to a video
 129         {
 130             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
 131             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
 132             'info_dict': {
 133                 'id': 'trailer',
 134                 'ext': 'mp4',
 135                 'title': 'trailer',
 136                 'upload_date': '20100513',
 137             }
 138         },
 139         # ooyala video
 140         {
 141             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
 142             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
 143             'info_dict': {
 144                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
 145                 'ext': 'mp4',
 146                 'title': '2cc213299525360.mov',  # that's what we get
 147             },
 148             'add_ie': ['Ooyala'],
 149         },
 150         # multiple ooyala embeds on SBN network websites
 151         {
 152             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 153             'info_dict': {
 154                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 155                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
 156             },
 157             'playlist_mincount': 3,
 158             'params': {
 159                 'skip_download': True,
 160             },
 161             'add_ie': ['Ooyala'],
 162         },
 163         # google redirect
 164         {
 165             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
 166             'info_dict': {
 167                 'id': 'cmQHVoWB5FY',
 168                 'ext': 'mp4',
 169                 'upload_date': '20130224',
 170                 'uploader_id': 'TheVerge',
 171                 'description': 're:^Chris Ziegler takes a look at the\.*',
 172                 'uploader': 'The Verge',
 173                 'title': 'First Firefox OS phones side-by-side',
 174             },
 175             'params': {
 176                 'skip_download': False,
 177             }
 178         },
 179         # embed.ly video
 180         {
 181             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
 182             'info_dict': {
 183                 'id': '9ODmcdjQcHQ',
 184                 'ext': 'mp4',
 185                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
 186                 'upload_date': '20140225',
 187                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
 188                 'uploader': 'Tested',
 189                 'uploader_id': 'testedcom',
 190             },
 191             # No need to test YoutubeIE here
 192             'params': {
 193                 'skip_download': True,
 194             },
 195         },
 196         # funnyordie embed
 197         {
 198             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
 199             'info_dict': {
 200                 'id': '18e820ec3f',
 201                 'ext': 'mp4',
 202                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
 203                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
 204             },
 205         },
 206         # BBC iPlayer embeds
 207         {
 208             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
 209             'info_dict': {
 210                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
 211             },
 212             'playlist_mincount': 18,
 213         },
 214         # RUTV embed
 215         {
 216             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
 217             'info_dict': {
 218                 'id': '776940',
 219                 'ext': 'mp4',
 220                 'title': 'Охотское море стало целиком российским',
 221                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
 222             },
 223             'params': {
 224                 # m3u8 download
 225                 'skip_download': True,
 226             },
 227         },
 228         # SportBox embed
 229         {
 230             'url': 'http://www.vestifinance.ru/articles/25753',
 231             'info_dict': {
 232                 'id': '25753',
 233                 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
 234             },
 235             'playlist': [{
 236                 'info_dict': {
 237                     'id': '370908',
 238                     'title': 'Госзаказ. День 3',
 239                     'ext': 'mp4',
 240                 }
 241             }, {
 242                 'info_dict': {
 243                     'id': '370905',
 244                     'title': 'Госзаказ. День 2',
 245                     'ext': 'mp4',
 246                 }
 247             }, {
 248                 'info_dict': {
 249                     'id': '370902',
 250                     'title': 'Госзаказ. День 1',
 251                     'ext': 'mp4',
 252                 }
 253             }],
 254             'params': {
 255                 # m3u8 download
 256                 'skip_download': True,
 257             },
 258         },
 259         # Embedded TED video
 260         {
 261             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
 262             'md5': '65fdff94098e4a607385a60c5177c638',
 263             'info_dict': {
 264                 'id': '1969',
 265                 'ext': 'mp4',
 266                 'title': 'Hidden miracles of the natural world',
 267                 'uploader': 'Louie Schwartzberg',
 268                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
 269             }
 270         },
 271         # Embeded Ustream video
 272         {
 273             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
 274             'md5': '27b99cdb639c9b12a79bca876a073417',
 275             'info_dict': {
 276                 'id': '45734260',
 277                 'ext': 'flv',
 278                 'uploader': 'AU SPA:  The NSA and Privacy',
 279                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
 280             }
 281         },
 282         # nowvideo embed hidden behind percent encoding
 283         {
 284             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
 285             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
 286             'info_dict': {
 287                 'id': '06e53103ca9aa',
 288                 'ext': 'flv',
 289                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
 290                 'description': 'No description',
 291             },
 292         },
 293         # arte embed
 294         {
 295             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
 296             'md5': '7653032cbb25bf6c80d80f217055fa43',
 297             'info_dict': {
 298                 'id': '048195-004_PLUS7-F',
 299                 'ext': 'flv',
 300                 'title': 'X:enius',
 301                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
 302                 'upload_date': '20140320',
 303             },
 304             'params': {
 305                 'skip_download': 'Requires rtmpdump'
 306             }
 307         },
 308         # Condé Nast embed
 309         {
 310             'url': 'http://www.wired.com/2014/04/honda-asimo/',
 311             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
 312             'info_dict': {
 313                 'id': '53501be369702d3275860000',
 314                 'ext': 'mp4',
 315                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
 316             }
 317         },
 318         # Dailymotion embed
 319         {
 320             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
 321             'md5': '441aeeb82eb72c422c7f14ec533999cd',
 322             'info_dict': {
 323                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
 324                 'ext': 'mp4',
 325                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
 326                 'uploader': 'Spi0n',
 327             },
 328             'add_ie': ['Dailymotion'],
 329         },
 330         # YouTube embed
 331         {
 332             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
 333             'info_dict': {
 334                 'id': 'FXRb4ykk4S0',
 335                 'ext': 'mp4',
 336                 'title': 'The NBL Auction 2014',
 337                 'uploader': 'BADMINTON England',
 338                 'uploader_id': 'BADMINTONEvents',
 339                 'upload_date': '20140603',
 340                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
 341             },
 342             'add_ie': ['Youtube'],
 343             'params': {
 344                 'skip_download': True,
 345             }
 346         },
 347         # MTVSercices embed
 348         {
 349             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
 350             'md5': '35727f82f58c76d996fc188f9755b0d5',
 351             'info_dict': {
 352                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
 353                 'ext': 'mp4',
 354                 'title': 'Review',
 355                 'description': 'Mario\'s life in the fast lane has never looked so good.',
 356             },
 357         },
 358         # YouTube embed via <data-embed-url="">
 359         {
 360             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
 361             'info_dict': {
 362                 'id': '4vAffPZIT44',
 363                 'ext': 'mp4',
 364                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
 365                 'uploader': 'Gameloft',
 366                 'uploader_id': 'gameloft',
 367                 'upload_date': '20140828',
 368                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
 369             },
 370             'params': {
 371                 'skip_download': True,
 372             }
 373         },
 374         # Camtasia studio
 375         {
 376             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
 377             'playlist': [{
 378                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
 379                 'info_dict': {
 380                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 381                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
 382                     'ext': 'flv',
 383                     'duration': 2235.90,
 384                 }
 385             }, {
 386                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
 387                 'info_dict': {
 388                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
 389                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
 390                     'ext': 'flv',
 391                     'duration': 2235.93,
 392                 }
 393             }],
 394             'info_dict': {
 395                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 396             }
 397         },
 398         # Flowplayer
 399         {
 400             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
 401             'md5': '9d65602bf31c6e20014319c7d07fba27',
 402             'info_dict': {
 403                 'id': '5123ea6d5e5a7',
 404                 'ext': 'mp4',
 405                 'age_limit': 18,
 406                 'uploader': 'www.handjobhub.com',
 407                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
 408             }
 409         },
 410         # RSS feed
 411         {
 412             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 413             'info_dict': {
 414                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 415                 'title': 'Zero Punctuation',
 416                 'description': 're:.*groundbreaking video review series.*'
 417             },
 418             'playlist_mincount': 11,
 419         },
 420         # Multiple brightcove videos
 421         # https://github.com/rg3/youtube-dl/issues/2283
 422         {
 423             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
 424             'info_dict': {
 425                 'id': 'always-never',
 426                 'title': 'Always / Never - The New Yorker',
 427             },
 428             'playlist_count': 3,
 429             'params': {
 430                 'extract_flat': False,
 431                 'skip_download': True,
 432             }
 433         },
 434         # MLB embed
 435         {
 436             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
 437             'md5': '96f09a37e44da40dd083e12d9a683327',
 438             'info_dict': {
 439                 'id': '33322633',
 440                 'ext': 'mp4',
 441                 'title': 'Ump changes call to ball',
 442                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
 443                 'duration': 48,
 444                 'timestamp': 1401537900,
 445                 'upload_date': '20140531',
 446                 'thumbnail': 're:^https?://.*\.jpg$',
 447             },
 448         },
 449         # Wistia embed
 450         {
 451             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
 452             'md5': '8788b683c777a5cf25621eaf286d0c23',
 453             'info_dict': {
 454                 'id': '1cfaf6b7ea',
 455                 'ext': 'mov',
 456                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
 457                 'duration': 643.0,
 458                 'filesize': 182808282,
 459                 'uploader': 'education-portal.com',
 460             },
 461         },
 462         {
 463             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
 464             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
 465             'info_dict': {
 466                 'id': 'uxjb0lwrcz',
 467                 'ext': 'mp4',
 468                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
 469                 'duration': 1715.0,
 470                 'uploader': 'thoughtworks.wistia.com',
 471             },
 472         },
 473         # Direct download with broken HEAD
 474         {
 475             'url': 'http://ai-radio.org:8000/radio.opus',
 476             'info_dict': {
 477                 'id': 'radio',
 478                 'ext': 'opus',
 479                 'title': 'radio',
 480             },
 481             'params': {
 482                 'skip_download': True,  # infinite live stream
 483             },
 484             'expected_warnings': [
 485                 r'501.*Not Implemented'
 486             ],
 487         },
 488         # Soundcloud embed
 489         {
 490             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
 491             'info_dict': {
 492                 'id': '174391317',
 493                 'ext': 'mp3',
 494                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
 495                 'uploader': 'Sophos Security',
 496                 'title': 'Chet Chat 171 - Oct 29, 2014',
 497                 'upload_date': '20141029',
 498             }
 499         },
 500         # Livestream embed
 501         {
 502             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
 503             'info_dict': {
 504                 'id': '67864563',
 505                 'ext': 'flv',
 506                 'upload_date': '20141112',
 507                 'title': 'Rosetta #CometLanding webcast HL 10',
 508             }
 509         },
 510         # LazyYT
 511         {
 512             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
 513             'info_dict': {
 514                 'id': '1986',
 515                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
 516             },
 517             'playlist_mincount': 2,
 518         },
 519         # Direct link with incorrect MIME type
 520         {
 521             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
 522             'md5': '4ccbebe5f36706d85221f204d7eb5913',
 523             'info_dict': {
 524                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
 525                 'id': '5_Lennart_Poettering_-_Systemd',
 526                 'ext': 'webm',
 527                 'title': '5_Lennart_Poettering_-_Systemd',
 528                 'upload_date': '20141120',
 529             },
 530             'expected_warnings': [
 531                 'URL could be a direct video link, returning it as such.'
 532             ]
 533         },
 534         # Cinchcast embed
 535         {
 536             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
 537             'info_dict': {
 538                 'id': '7141703',
 539                 'ext': 'mp3',
 540                 'upload_date': '20141126',
 541                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
 542             }
 543         },
 544         # Cinerama player
 545         {
 546             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
 547             'info_dict': {
 548                 'id': '730m_DandD_1901_512k',
 549                 'ext': 'mp4',
 550                 'uploader': 'www.abc.net.au',
 551                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
 552             }
 553         },
 554         # embedded viddler video
 555         {
 556             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
 557             'info_dict': {
 558                 'id': '4d03aad9',
 559                 'ext': 'mp4',
 560                 'uploader': 'deadspin',
 561                 'title': 'WALL-TO-GORTAT',
 562                 'timestamp': 1422285291,
 563                 'upload_date': '20150126',
 564             },
 565             'add_ie': ['Viddler'],
 566         },
 567         # Libsyn embed
 568         {
 569             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
 570             'info_dict': {
 571                 'id': '3377616',
 572                 'ext': 'mp3',
 573                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
 574                 'description': 'md5:601cb790edd05908957dae8aaa866465',
 575                 'upload_date': '20150220',
 576             },
 577         },
 578         # jwplayer YouTube
 579         {
 580             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
 581             'info_dict': {
 582                 'id': 'Mrj4DVp2zeA',
 583                 'ext': 'mp4',
 584                 'upload_date': '20150212',
 585                 'uploader': 'The National Archives UK',
 586                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
 587                 'uploader_id': 'NationalArchives08',
 588                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
 589             },
 590         },
 591         # rtl.nl embed
 592         {
 593             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
 594             'playlist_mincount': 5,
 595             'info_dict': {
 596                 'id': 'aanslagen-kopenhagen',
 597                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
 598             }
 599         },
 600         # Zapiks embed
 601         {
 602             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
 603             'info_dict': {
 604                 'id': '118046',
 605                 'ext': 'mp4',
 606                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
 607             }
 608         },
 609         # Kaltura embed
 610         {
 611             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
 612             'info_dict': {
 613                 'id': '1_eergr3h1',
 614                 'ext': 'mp4',
 615                 'upload_date': '20150226',
 616                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
 617                 'timestamp': int,
 618                 'title': 'John Carlson Postgame 2/25/15',
 619             },
 620         },
 621         # Eagle.Platform embed (generic URL)
 622         {
 623             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
 624             'info_dict': {
 625                 'id': '227304',
 626                 'ext': 'mp4',
 627                 'title': 'Навальный вышел на свободу',
 628                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
 629                 'thumbnail': 're:^https?://.*\.jpg$',
 630                 'duration': 87,
 631                 'view_count': int,
 632                 'age_limit': 0,
 633             },
 634         },
 635         # ClipYou (Eagle.Platform) embed (custom URL)
 636         {
 637             'url': 'http://muz-tv.ru/play/7129/',
 638             'info_dict': {
 639                 'id': '12820',
 640                 'ext': 'mp4',
 641                 'title': "'O Sole Mio",
 642                 'thumbnail': 're:^https?://.*\.jpg$',
 643                 'duration': 216,
 644                 'view_count': int,
 645             },
 646         },
 647         # Pladform embed
 648         {
 649             'url': 'http://muz-tv.ru/kinozal/view/7400/',
 650             'info_dict': {
 651                 'id': '100183293',
 652                 'ext': 'mp4',
 653                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
 654                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
 655                 'thumbnail': 're:^https?://.*\.jpg$',
 656                 'duration': 694,
 657                 'age_limit': 0,
 658             },
 659         },
 660         # Playwire embed
 661         {
 662             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
 663             'info_dict': {
 664                 'id': '3519514',
 665                 'ext': 'mp4',
 666                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
 667                 'thumbnail': 're:^https?://.*\.png$',
 668                 'duration': 45.115,
 669             },
 670         },
 671         # 5min embed
 672         {
 673             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
 674             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
 675             'info_dict': {
 676                 'id': '518726732',
 677                 'ext': 'mp4',
 678                 'title': 'Facebook Creates "On This Day" | Crunch Report',
 679             },
 680         },
 681         # SVT embed
 682         {
 683             'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
 684             'info_dict': {
 685                 'id': '2900353',
 686                 'ext': 'flv',
 687                 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
 688                 'duration': 27,
 689                 'age_limit': 0,
 690             },
 691         },
 692         # RSS feed with enclosure
 693         {
 694             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
 695             'info_dict': {
 696                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 697                 'ext': 'm4v',
 698                 'upload_date': '20150228',
 699                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 700             }
 701         },
 702         # Crooks and Liars embed
 703         {
 704             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
 705             'info_dict': {
 706                 'id': '8RUoRhRi',
 707                 'ext': 'mp4',
 708                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
 709                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
 710                 'timestamp': 1428207000,
 711                 'upload_date': '20150405',
 712                 'uploader': 'Heather',
 713             },
 714         },
 715         # Crooks and Liars external embed
 716         {
 717             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
 718             'info_dict': {
 719                 'id': 'MTE3MjUtMzQ2MzA',
 720                 'ext': 'mp4',
 721                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
 722                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
 723                 'timestamp': 1265032391,
 724                 'upload_date': '20100201',
 725                 'uploader': 'Heather',
 726             },
 727         },
 728         # NBC Sports vplayer embed
 729         {
 730             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
 731             'info_dict': {
 732                 'id': 'ln7x1qSThw4k',
 733                 'ext': 'flv',
 734                 'title': "PFT Live: New leader in the 'new-look' defense",
 735                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
 736             },
 737         },
 738         # UDN embed
 739         {
 740             'url': 'http://www.udn.com/news/story/7314/822787',
 741             'md5': 'fd2060e988c326991037b9aff9df21a6',
 742             'info_dict': {
 743                 'id': '300346',
 744                 'ext': 'mp4',
 745                 'title': '中一中男師變性 全校師生力挺',
 746                 'thumbnail': 're:^https?://.*\.jpg$',
 747             }
 748         },
 749         # Ooyala embed
 750         {
 751             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
 752             'info_dict': {
 753                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
 754                 'ext': 'mp4',
 755                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
 756                 'title': 'This is what separates the Excel masters from the wannabes',
 757             },
 758             'params': {
 759                 # m3u8 downloads
 760                 'skip_download': True,
 761             }
 762         },
 763         # Contains a SMIL manifest
 764         {
 765             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
 766             'info_dict': {
 767                 'id': 'file',
 768                 'ext': 'flv',
 769                 'title': '+ Football: Lottery Champions League Europe',
 770                 'uploader': 'www.telewebion.com',
 771             },
 772             'params': {
 773                 # rtmpe downloads
 774                 'skip_download': True,
 775             }
 776         }
 777     ]
 778
 779     def report_following_redirect(self, new_url):
 780         """Report information extraction."""
 781         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 782
 783     def _extract_rss(self, url, video_id, doc):
 784         playlist_title = doc.find('./channel/title').text
 785         playlist_desc_el = doc.find('./channel/description')
 786         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 787
 788         entries = []
 789         for it in doc.findall('./channel/item'):
 790             next_url = xpath_text(it, 'link', fatal=False)
 791             if not next_url:
 792                 enclosure_nodes = it.findall('./enclosure')
 793                 for e in enclosure_nodes:
 794                     next_url = e.attrib.get('url')
 795                     if next_url:
 796                         break
 797
 798             if not next_url:
 799                 continue
 800
 801             entries.append({
 802                 '_type': 'url',
 803                 'url': next_url,
 804                 'title': it.find('title').text,
 805             })
 806
 807         return {
 808             '_type': 'playlist',
 809             'id': url,
 810             'title': playlist_title,
 811             'description': playlist_desc,
 812             'entries': entries,
 813         }
 814
 815     def _extract_camtasia(self, url, video_id, webpage):
 816         """ Returns None if no camtasia video can be found. """
 817
 818         camtasia_cfg = self._search_regex(
 819             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
 820             webpage, 'camtasia configuration file', default=None)
 821         if camtasia_cfg is None:
 822             return None
 823
 824         title = self._html_search_meta('DC.title', webpage, fatal=True)
 825
 826         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
 827         camtasia_cfg = self._download_xml(
 828             camtasia_url, video_id,
 829             note='Downloading camtasia configuration',
 830             errnote='Failed to download camtasia configuration')
 831         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
 832
 833         entries = []
 834         for n in fileset_node.getchildren():
 835             url_n = n.find('./uri')
 836             if url_n is None:
 837                 continue
 838
 839             entries.append({
 840                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
 841                 'title': '%s - %s' % (title, n.tag),
 842                 'url': compat_urlparse.urljoin(url, url_n.text),
 843                 'duration': float_or_none(n.find('./duration').text),
 844             })
 845
 846         return {
 847             '_type': 'playlist',
 848             'entries': entries,
 849             'title': title,
 850         }
 851
 852     def _real_extract(self, url):
 853         if url.startswith('//'):
 854             return {
 855                 '_type': 'url',
 856                 'url': self.http_scheme() + url,
 857             }
 858
 859         parsed_url = compat_urlparse.urlparse(url)
 860         if not parsed_url.scheme:
 861             default_search = self._downloader.params.get('default_search')
 862             if default_search is None:
 863                 default_search = 'fixup_error'
 864
 865             if default_search in ('auto', 'auto_warning', 'fixup_error'):
 866                 if '/' in url:
 867                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
 868                     return self.url_result('http://' + url)
 869                 elif default_search != 'fixup_error':
 870                     if default_search == 'auto_warning':
 871                         if re.match(r'^(?:url|URL)$', url):
 872                             raise ExtractorError(
 873                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
 874                                 expected=True)
 875                         else:
 876                             self._downloader.report_warning(
 877                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
 878                     return self.url_result('ytsearch:' + url)
 879
 880             if default_search in ('error', 'fixup_error'):
 881                 raise ExtractorError(
 882                     '%r is not a valid URL. '
 883                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
 884                     % (url, url), expected=True)
 885             else:
 886                 if ':' not in default_search:
 887                     default_search += ':'
 888                 return self.url_result(default_search + url)
 889
 890         url, smuggled_data = unsmuggle_url(url)
 891         force_videoid = None
 892         is_intentional = smuggled_data and smuggled_data.get('to_generic')
 893         if smuggled_data and 'force_videoid' in smuggled_data:
 894             force_videoid = smuggled_data['force_videoid']
 895             video_id = force_videoid
 896         else:
 897             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
 898
 899         self.to_screen('%s: Requesting header' % video_id)
 900
 901         head_req = HEADRequest(url)
 902         head_response = self._request_webpage(
 903             head_req, video_id,
 904             note=False, errnote='Could not send HEAD request to %s' % url,
 905             fatal=False)
 906
 907         if head_response is not False:
 908             # Check for redirect
 909             new_url = head_response.geturl()
 910             if url != new_url:
 911                 self.report_following_redirect(new_url)
 912                 if force_videoid:
 913                     new_url = smuggle_url(
 914                         new_url, {'force_videoid': force_videoid})
 915                 return self.url_result(new_url)
 916
 917         full_response = None
 918         if head_response is False:
 919             full_response = self._request_webpage(url, video_id)
 920             head_response = full_response
 921
 922         # Check for direct link to a video
 923         content_type = head_response.headers.get('Content-Type', '')
 924         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
 925         if m:
 926             upload_date = unified_strdate(
 927                 head_response.headers.get('Last-Modified'))
 928             return {
 929                 'id': video_id,
 930                 'title': os.path.splitext(url_basename(url))[0],
 931                 'direct': True,
 932                 'formats': [{
 933                     'format_id': m.group('format_id'),
 934                     'url': url,
 935                     'vcodec': 'none' if m.group('type') == 'audio' else None
 936                 }],
 937                 'upload_date': upload_date,
 938             }
 939
 940         if not self._downloader.params.get('test', False) and not is_intentional:
 941             self._downloader.report_warning('Falling back on generic information extractor.')
 942
 943         if not full_response:
 944             full_response = self._request_webpage(url, video_id)
 945
 946         # Maybe it's a direct link to a video?
 947         # Be careful not to download the whole thing!
 948         first_bytes = full_response.read(512)
 949         if not is_html(first_bytes):
 950             self._downloader.report_warning(
 951                 'URL could be a direct video link, returning it as such.')
 952             upload_date = unified_strdate(
 953                 head_response.headers.get('Last-Modified'))
 954             return {
 955                 'id': video_id,
 956                 'title': os.path.splitext(url_basename(url))[0],
 957                 'direct': True,
 958                 'url': url,
 959                 'upload_date': upload_date,
 960             }
 961
 962         webpage = self._webpage_read_content(
 963             full_response, url, video_id, prefix=first_bytes)
 964
 965         self.report_extraction(video_id)
 966
 967         # Is it an RSS feed?
 968         try:
 969             doc = parse_xml(webpage)
 970             if doc.tag == 'rss':
 971                 return self._extract_rss(url, video_id, doc)
 972         except compat_xml_parse_error:
 973             pass
 974
 975         # Is it a Camtasia project?
 976         camtasia_res = self._extract_camtasia(url, video_id, webpage)
 977         if camtasia_res is not None:
 978             return camtasia_res
 979
 980         # Sometimes embedded video player is hidden behind percent encoding
 981         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
 982         # Unescaping the whole page allows to handle those cases in a generic way
 983         webpage = compat_urllib_parse.unquote(webpage)
 984
 985         # it's tempting to parse this further, but you would
 986         # have to take into account all the variations like
 987         #   Video Title - Site Name
 988         #   Site Name | Video Title
 989         #   Video Title - Tagline | Site Name
 990         # and so on and so forth; it's just not practical
 991         video_title = self._html_search_regex(
 992             r'(?s)<title>(.*?)</title>', webpage, 'video title',
 993             default='video')
 994
 995         # Try to detect age limit automatically
 996         age_limit = self._rta_search(webpage)
 997         # And then there are the jokers who advertise that they use RTA,
 998         # but actually don't.
 999         AGE_LIMIT_MARKERS = [
1000             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1001         ]
1002         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1003             age_limit = 18
1004
1005         # video uploader is domain name
1006         video_uploader = self._search_regex(
1007             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1008
1009         # Helper method
1010         def _playlist_from_matches(matches, getter=None, ie=None):
1011             urlrs = orderedSet(
1012                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1013                 for m in matches)
1014             return self.playlist_result(
1015                 urlrs, playlist_id=video_id, playlist_title=video_title)
1016
1017         # Look for BrightCove:
1018         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1019         if bc_urls:
1020             self.to_screen('Brightcove video detected.')
1021             entries = [{
1022                 '_type': 'url',
1023                 'url': smuggle_url(bc_url, {'Referer': url}),
1024                 'ie_key': 'Brightcove'
1025             } for bc_url in bc_urls]
1026
1027             return {
1028                 '_type': 'playlist',
1029                 'title': video_title,
1030                 'id': video_id,
1031                 'entries': entries,
1032             }
1033
1034         # Look for embedded rtl.nl player
1035         matches = re.findall(
1036             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
1037             webpage)
1038         if matches:
1039             return _playlist_from_matches(matches, ie='RtlNl')
1040
1041         # Look for embedded (iframe) Vimeo player
1042         mobj = re.search(
1043             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1044         if mobj:
1045             player_url = unescapeHTML(mobj.group('url'))
1046             surl = smuggle_url(player_url, {'Referer': url})
1047             return self.url_result(surl)
1048         # Look for embedded (swf embed) Vimeo player
1049         mobj = re.search(
1050             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1051         if mobj:
1052             return self.url_result(mobj.group(1))
1053
1054         # Look for embedded YouTube player
1055         matches = re.findall(r'''(?x)
1056             (?:
1057                 <iframe[^>]+?src=|
1058                 data-video-url=|
1059                 <embed[^>]+?src=|
1060                 embedSWF\(?:\s*|
1061                 new\s+SWFObject\(
1062             )
1063             (["\'])
1064                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1065                 (?:embed|v|p)/.+?)
1066             \1''', webpage)
1067         if matches:
1068             return _playlist_from_matches(
1069                 matches, lambda m: unescapeHTML(m[1]))
1070
1071         # Look for lazyYT YouTube embed
1072         matches = re.findall(
1073             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1074         if matches:
1075             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1076
1077         # Look for embedded Dailymotion player
1078         matches = re.findall(
1079             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1080         if matches:
1081             return _playlist_from_matches(
1082                 matches, lambda m: unescapeHTML(m[1]))
1083
1084         # Look for embedded Dailymotion playlist player (#3822)
1085         m = re.search(
1086             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1087         if m:
1088             playlists = re.findall(
1089                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1090             if playlists:
1091                 return _playlist_from_matches(
1092                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1093
1094         # Look for embedded Wistia player
1095         match = re.search(
1096             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1097         if match:
1098             embed_url = self._proto_relative_url(
1099                 unescapeHTML(match.group('url')))
1100             return {
1101                 '_type': 'url_transparent',
1102                 'url': embed_url,
1103                 'ie_key': 'Wistia',
1104                 'uploader': video_uploader,
1105                 'title': video_title,
1106                 'id': video_id,
1107             }
1108
1109         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1110         if match:
1111             return {
1112                 '_type': 'url_transparent',
1113                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1114                 'ie_key': 'Wistia',
1115                 'uploader': video_uploader,
1116                 'title': video_title,
1117                 'id': match.group('id')
1118             }
1119
1120         # Look for embedded blip.tv player
1121         bliptv_url = BlipTVIE._extract_url(webpage)
1122         if bliptv_url:
1123             return self.url_result(bliptv_url, 'BlipTV')
1124
1125         # Look for SVT player
1126         svt_url = SVTIE._extract_url(webpage)
1127         if svt_url:
1128             return self.url_result(svt_url, 'SVT')
1129
1130         # Look for embedded condenast player
1131         matches = re.findall(
1132             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1133             webpage)
1134         if matches:
1135             return {
1136                 '_type': 'playlist',
1137                 'entries': [{
1138                     '_type': 'url',
1139                     'ie_key': 'CondeNast',
1140                     'url': ma,
1141                 } for ma in matches],
1142                 'title': video_title,
1143                 'id': video_id,
1144             }
1145
1146         # Look for Bandcamp pages with custom domain
1147         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1148         if mobj is not None:
1149             burl = unescapeHTML(mobj.group(1))
1150             # Don't set the extractor because it can be a track url or an album
1151             return self.url_result(burl)
1152
1153         # Look for embedded Vevo player
1154         mobj = re.search(
1155             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1156         if mobj is not None:
1157             return self.url_result(mobj.group('url'))
1158
1159         # Look for embedded Viddler player
1160         mobj = re.search(
1161             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1162             webpage)
1163         if mobj is not None:
1164             return self.url_result(mobj.group('url'))
1165
1166         # Look for NYTimes player
1167         mobj = re.search(
1168             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1169             webpage)
1170         if mobj is not None:
1171             return self.url_result(mobj.group('url'))
1172
1173         # Look for Libsyn player
1174         mobj = re.search(
1175             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1176         if mobj is not None:
1177             return self.url_result(mobj.group('url'))
1178
1179         # Look for Ooyala videos
1180         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1181                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1182                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1183                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1184         if mobj is not None:
1185             return OoyalaIE._build_url_result(mobj.group('ec'))
1186
1187         # Look for multiple Ooyala embeds on SBN network websites
1188         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1189         if mobj is not None:
1190             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1191             if embeds:
1192                 return _playlist_from_matches(
1193                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1194
1195         # Look for Aparat videos
1196         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1197         if mobj is not None:
1198             return self.url_result(mobj.group(1), 'Aparat')
1199
1200         # Look for MPORA videos
1201         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1202         if mobj is not None:
1203             return self.url_result(mobj.group(1), 'Mpora')
1204
1205         # Look for embedded NovaMov-based player
1206         mobj = re.search(
1207             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1208                     (?P<url>http://(?:(?:embed|www)\.)?
1209                         (?:novamov\.com|
1210                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1211                            videoweed\.(?:es|com)|
1212                            movshare\.(?:net|sx|ag)|
1213                            divxstage\.(?:eu|net|ch|co|at|ag))
1214                         /embed\.php.+?)\1''', webpage)
1215         if mobj is not None:
1216             return self.url_result(mobj.group('url'))
1217
1218         # Look for embedded Facebook player
1219         mobj = re.search(
1220             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1221         if mobj is not None:
1222             return self.url_result(mobj.group('url'), 'Facebook')
1223
1224         # Look for embedded VK player
1225         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1226         if mobj is not None:
1227             return self.url_result(mobj.group('url'), 'VK')
1228
1229         # Look for embedded ivi player
1230         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1231         if mobj is not None:
1232             return self.url_result(mobj.group('url'), 'Ivi')
1233
1234         # Look for embedded Huffington Post player
1235         mobj = re.search(
1236             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1237         if mobj is not None:
1238             return self.url_result(mobj.group('url'), 'HuffPost')
1239
1240         # Look for embed.ly
1241         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1242         if mobj is not None:
1243             return self.url_result(mobj.group('url'))
1244         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1245         if mobj is not None:
1246             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1247
1248         # Look for funnyordie embed
1249         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1250         if matches:
1251             return _playlist_from_matches(
1252                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1253
1254         # Look for BBC iPlayer embed
1255         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1256         if matches:
1257             return _playlist_from_matches(matches, ie='BBCCoUk')
1258
1259         # Look for embedded RUTV player
1260         rutv_url = RUTVIE._extract_url(webpage)
1261         if rutv_url:
1262             return self.url_result(rutv_url, 'RUTV')
1263
1264         # Look for embedded SportBox player
1265         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1266         if sportbox_urls:
1267             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1268
1269         # Look for embedded TED player
1270         mobj = re.search(
1271             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1272         if mobj is not None:
1273             return self.url_result(mobj.group('url'), 'TED')
1274
1275         # Look for embedded Ustream videos
1276         mobj = re.search(
1277             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1278         if mobj is not None:
1279             return self.url_result(mobj.group('url'), 'Ustream')
1280
1281         # Look for embedded arte.tv player
1282         mobj = re.search(
1283             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1284             webpage)
1285         if mobj is not None:
1286             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1287
1288         # Look for embedded smotri.com player
1289         smotri_url = SmotriIE._extract_url(webpage)
1290         if smotri_url:
1291             return self.url_result(smotri_url, 'Smotri')
1292
1293         # Look for embeded soundcloud player
1294         mobj = re.search(
1295             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1296             webpage)
1297         if mobj is not None:
1298             url = unescapeHTML(mobj.group('url'))
1299             return self.url_result(url)
1300
1301         # Look for embedded vulture.com player
1302         mobj = re.search(
1303             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1304             webpage)
1305         if mobj is not None:
1306             url = unescapeHTML(mobj.group('url'))
1307             return self.url_result(url, ie='Vulture')
1308
1309         # Look for embedded mtvservices player
1310         mobj = re.search(
1311             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1312             webpage)
1313         if mobj is not None:
1314             url = unescapeHTML(mobj.group('url'))
1315             return self.url_result(url, ie='MTVServicesEmbedded')
1316
1317         # Look for embedded yahoo player
1318         mobj = re.search(
1319             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1320             webpage)
1321         if mobj is not None:
1322             return self.url_result(mobj.group('url'), 'Yahoo')
1323
1324         # Look for embedded sbs.com.au player
1325         mobj = re.search(
1326             r'''(?x)
1327             (?:
1328                 <meta\s+property="og:video"\s+content=|
1329                 <iframe[^>]+?src=
1330             )
1331             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1332             webpage)
1333         if mobj is not None:
1334             return self.url_result(mobj.group('url'), 'SBS')
1335
1336         # Look for embedded Cinchcast player
1337         mobj = re.search(
1338             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1339             webpage)
1340         if mobj is not None:
1341             return self.url_result(mobj.group('url'), 'Cinchcast')
1342
1343         mobj = re.search(
1344             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1345             webpage)
1346         if not mobj:
1347             mobj = re.search(
1348                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1349                 webpage)
1350         if mobj is not None:
1351             return self.url_result(mobj.group('url'), 'MLB')
1352
1353         mobj = re.search(
1354             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1355             webpage)
1356         if mobj is not None:
1357             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1358
1359         mobj = re.search(
1360             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1361             webpage)
1362         if mobj is not None:
1363             return self.url_result(mobj.group('url'), 'Livestream')
1364
1365         # Look for Zapiks embed
1366         mobj = re.search(
1367             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1368         if mobj is not None:
1369             return self.url_result(mobj.group('url'), 'Zapiks')
1370
1371         # Look for Kaltura embeds
1372         mobj = re.search(
1373             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1374         if mobj is not None:
1375             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1376
1377         # Look for Eagle.Platform embeds
1378         mobj = re.search(
1379             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1380         if mobj is not None:
1381             return self.url_result(mobj.group('url'), 'EaglePlatform')
1382
1383         # Look for ClipYou (uses Eagle.Platform) embeds
1384         mobj = re.search(
1385             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1386         if mobj is not None:
1387             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1388
1389         # Look for Pladform embeds
1390         mobj = re.search(
1391             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1392         if mobj is not None:
1393             return self.url_result(mobj.group('url'), 'Pladform')
1394
1395         # Look for Playwire embeds
1396         mobj = re.search(
1397             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1398         if mobj is not None:
1399             return self.url_result(mobj.group('url'))
1400
1401         # Look for 5min embeds
1402         mobj = re.search(
1403             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1404         if mobj is not None:
1405             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1406
1407         # Look for Crooks and Liars embeds
1408         mobj = re.search(
1409             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1410         if mobj is not None:
1411             return self.url_result(mobj.group('url'))
1412
1413         # Look for NBC Sports VPlayer embeds
1414         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1415         if nbc_sports_url:
1416             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1417
1418         # Look for UDN embeds
1419         mobj = re.search(
1420             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1421         if mobj is not None:
1422             return self.url_result(
1423                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1424
1425         # Look for Senate ISVP iframe
1426         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1427         if senate_isvp_url:
1428             return self.url_result(senate_isvp_url, 'SenateISVP')
1429
1430         def check_video(vurl):
1431             if YoutubeIE.suitable(vurl):
1432                 return True
1433             vpath = compat_urlparse.urlparse(vurl).path
1434             vext = determine_ext(vpath)
1435             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1436
1437         def filter_video(urls):
1438             return list(filter(check_video, urls))
1439
1440         # Start with something easy: JW Player in SWFObject
1441         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1442         if not found:
1443             # Look for gorilla-vid style embedding
1444             found = filter_video(re.findall(r'''(?sx)
1445                 (?:
1446                     jw_plugins|
1447                     JWPlayerOptions|
1448                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1449                 )
1450                 .*?
1451                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1452         if not found:
1453             # Broaden the search a little bit
1454             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1455         if not found:
1456             # Broaden the findall a little bit: JWPlayer JS loader
1457             found = filter_video(re.findall(
1458                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1459         if not found:
1460             # Flow player
1461             found = filter_video(re.findall(r'''(?xs)
1462                 flowplayer\("[^"]+",\s*
1463                     \{[^}]+?\}\s*,
1464                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1465                         ["']?url["']?\s*:\s*["']([^"']+)["']
1466             ''', webpage))
1467         if not found:
1468             # Cinerama player
1469             found = re.findall(
1470                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1471         if not found:
1472             # Try to find twitter cards info
1473             found = filter_video(re.findall(
1474                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1475         if not found:
1476             # We look for Open Graph info:
1477             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1478             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1479             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1480             if m_video_type is not None:
1481                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1482         if not found:
1483             # HTML5 video
1484             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1485         if not found:
1486             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1487             found = re.search(
1488                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1489                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1490                 webpage)
1491             if not found:
1492                 # Look also in Refresh HTTP header
1493                 refresh_header = head_response.headers.get('Refresh')
1494                 if refresh_header:
1495                     found = re.search(REDIRECT_REGEX, refresh_header)
1496             if found:
1497                 new_url = compat_urlparse.urljoin(url, found.group(1))
1498                 self.report_following_redirect(new_url)
1499                 return {
1500                     '_type': 'url',
1501                     'url': new_url,
1502                 }
1503         if not found:
1504             raise UnsupportedError(url)
1505
1506         entries = []
1507         for video_url in found:
1508             video_url = compat_urlparse.urljoin(url, video_url)
1509             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1510
1511             # Sometimes, jwplayer extraction will result in a YouTube URL
1512             if YoutubeIE.suitable(video_url):
1513                 entries.append(self.url_result(video_url, 'Youtube'))
1514                 continue
1515
1516             # here's a fun little line of code for you:
1517             video_id = os.path.splitext(video_id)[0]
1518
1519             if determine_ext(video_url) == 'smil':
1520                 entries.append({
1521                     'id': video_id,
1522                     'formats': self._extract_smil_formats(video_url, video_id),
1523                     'uploader': video_uploader,
1524                     'title': video_title,
1525                     'age_limit': age_limit,
1526                 })
1527             else:
1528                 entries.append({
1529                     'id': video_id,
1530                     'url': video_url,
1531                     'uploader': video_uploader,
1532                     'title': video_title,
1533                     'age_limit': age_limit,
1534                 })
1535
1536         if len(entries) == 1:
1537             return entries[0]
1538         else:
1539             for num, e in enumerate(entries, start=1):
1540                 # 'url' results don't have a title
1541                 if e.get('title') is not None:
1542                     e['title'] = '%s (%d)' % (e['title'], num)
1543             return {
1544                 '_type': 'playlist',
1545                 'entries': entries,
1546             }