_ Git - youtube-dl/blob - youtube_dl/extractor/generic.py

   1 # encoding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 import os
   6 import re
   7
   8 from .common import InfoExtractor
   9 from .youtube import YoutubeIE
  10 from ..compat import (
  11     compat_urllib_parse,
  12     compat_urlparse,
  13     compat_xml_parse_error,
  14 )
  15 from ..utils import (
  16     determine_ext,
  17     ExtractorError,
  18     float_or_none,
  19     HEADRequest,
  20     is_html,
  21     orderedSet,
  22     parse_xml,
  23     smuggle_url,
  24     unescapeHTML,
  25     unified_strdate,
  26     unsmuggle_url,
  27     UnsupportedError,
  28     url_basename,
  29     xpath_text,
  30 )
  31 from .brightcove import BrightcoveIE
  32 from .nbc import NBCSportsVPlayerIE
  33 from .ooyala import OoyalaIE
  34 from .rutv import RUTVIE
  35 from .smotri import SmotriIE
  36 from .condenast import CondeNastIE
  37 from .udn import UDNEmbedIE
  38 from .senateisvp import SenateISVPIE
  39
  40
  41 class GenericIE(InfoExtractor):
  42     IE_DESC = 'Generic downloader that works on some sites'
  43     _VALID_URL = r'.*'
  44     IE_NAME = 'generic'
  45     _TESTS = [
  46         {
  47             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
  48             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
  49             'info_dict': {
  50                 'id': '13601338388002',
  51                 'ext': 'mp4',
  52                 'uploader': 'www.hodiho.fr',
  53                 'title': 'R\u00e9gis plante sa Jeep',
  54             }
  55         },
  56         # bandcamp page with custom domain
  57         {
  58             'add_ie': ['Bandcamp'],
  59             'url': 'http://bronyrock.com/track/the-pony-mash',
  60             'info_dict': {
  61                 'id': '3235767654',
  62                 'ext': 'mp3',
  63                 'title': 'The Pony Mash',
  64                 'uploader': 'M_Pallante',
  65             },
  66             'skip': 'There is a limit of 200 free downloads / month for the test song',
  67         },
  68         # embedded brightcove video
  69         # it also tests brightcove videos that need to set the 'Referer' in the
  70         # http requests
  71         {
  72             'add_ie': ['Brightcove'],
  73             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
  74             'info_dict': {
  75                 'id': '2765128793001',
  76                 'ext': 'mp4',
  77                 'title': 'Le cours de bourse : l’analyse technique',
  78                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
  79                 'uploader': 'BFM BUSINESS',
  80             },
  81             'params': {
  82                 'skip_download': True,
  83             },
  84         },
  85         {
  86             # https://github.com/rg3/youtube-dl/issues/2253
  87             'url': 'http://bcove.me/i6nfkrc3',
  88             'md5': '0ba9446db037002366bab3b3eb30c88c',
  89             'info_dict': {
  90                 'id': '3101154703001',
  91                 'ext': 'mp4',
  92                 'title': 'Still no power',
  93                 'uploader': 'thestar.com',
  94                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
  95             },
  96             'add_ie': ['Brightcove'],
  97         },
  98         {
  99             'url': 'http://www.championat.com/video/football/v/87/87499.html',
 100             'md5': 'fb973ecf6e4a78a67453647444222983',
 101             'info_dict': {
 102                 'id': '3414141473001',
 103                 'ext': 'mp4',
 104                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
 105                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
 106                 'uploader': 'Championat',
 107             },
 108         },
 109         {
 110             # https://github.com/rg3/youtube-dl/issues/3541
 111             'add_ie': ['Brightcove'],
 112             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
 113             'info_dict': {
 114                 'id': '3866516442001',
 115                 'ext': 'mp4',
 116                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
 117                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
 118                 'uploader': 'SBS Broadcasting',
 119             },
 120             'skip': 'Restricted to Netherlands',
 121             'params': {
 122                 'skip_download': True,  # m3u8 download
 123             },
 124         },
 125         # Direct link to a video
 126         {
 127             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
 128             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
 129             'info_dict': {
 130                 'id': 'trailer',
 131                 'ext': 'mp4',
 132                 'title': 'trailer',
 133                 'upload_date': '20100513',
 134             }
 135         },
 136         # ooyala video
 137         {
 138             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
 139             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
 140             'info_dict': {
 141                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
 142                 'ext': 'mp4',
 143                 'title': '2cc213299525360.mov',  # that's what we get
 144             },
 145             'add_ie': ['Ooyala'],
 146         },
 147         # multiple ooyala embeds on SBN network websites
 148         {
 149             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 150             'info_dict': {
 151                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 152                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
 153             },
 154             'playlist_mincount': 3,
 155             'params': {
 156                 'skip_download': True,
 157             },
 158             'add_ie': ['Ooyala'],
 159         },
 160         # google redirect
 161         {
 162             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
 163             'info_dict': {
 164                 'id': 'cmQHVoWB5FY',
 165                 'ext': 'mp4',
 166                 'upload_date': '20130224',
 167                 'uploader_id': 'TheVerge',
 168                 'description': 're:^Chris Ziegler takes a look at the\.*',
 169                 'uploader': 'The Verge',
 170                 'title': 'First Firefox OS phones side-by-side',
 171             },
 172             'params': {
 173                 'skip_download': False,
 174             }
 175         },
 176         # embed.ly video
 177         {
 178             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
 179             'info_dict': {
 180                 'id': '9ODmcdjQcHQ',
 181                 'ext': 'mp4',
 182                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
 183                 'upload_date': '20140225',
 184                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
 185                 'uploader': 'Tested',
 186                 'uploader_id': 'testedcom',
 187             },
 188             # No need to test YoutubeIE here
 189             'params': {
 190                 'skip_download': True,
 191             },
 192         },
 193         # funnyordie embed
 194         {
 195             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
 196             'info_dict': {
 197                 'id': '18e820ec3f',
 198                 'ext': 'mp4',
 199                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
 200                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
 201             },
 202         },
 203         # BBC iPlayer embeds
 204         {
 205             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
 206             'info_dict': {
 207                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
 208             },
 209             'playlist_mincount': 18,
 210         },
 211         # RUTV embed
 212         {
 213             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
 214             'info_dict': {
 215                 'id': '776940',
 216                 'ext': 'mp4',
 217                 'title': 'Охотское море стало целиком российским',
 218                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
 219             },
 220             'params': {
 221                 # m3u8 download
 222                 'skip_download': True,
 223             },
 224         },
 225         # Embedded TED video
 226         {
 227             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
 228             'md5': '65fdff94098e4a607385a60c5177c638',
 229             'info_dict': {
 230                 'id': '1969',
 231                 'ext': 'mp4',
 232                 'title': 'Hidden miracles of the natural world',
 233                 'uploader': 'Louie Schwartzberg',
 234                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
 235             }
 236         },
 237         # Embedded Ustream video
 238         {
 239             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
 240             'md5': '27b99cdb639c9b12a79bca876a073417',
 241             'info_dict': {
 242                 'id': '45734260',
 243                 'ext': 'flv',
 244                 'uploader': 'AU SPA:  The NSA and Privacy',
 245                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
 246             }
 247         },
 248         # nowvideo embed hidden behind percent encoding
 249         {
 250             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
 251             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
 252             'info_dict': {
 253                 'id': '06e53103ca9aa',
 254                 'ext': 'flv',
 255                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
 256                 'description': 'No description',
 257             },
 258         },
 259         # arte embed
 260         {
 261             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
 262             'md5': '7653032cbb25bf6c80d80f217055fa43',
 263             'info_dict': {
 264                 'id': '048195-004_PLUS7-F',
 265                 'ext': 'flv',
 266                 'title': 'X:enius',
 267                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
 268                 'upload_date': '20140320',
 269             },
 270             'params': {
 271                 'skip_download': 'Requires rtmpdump'
 272             }
 273         },
 274         # Condé Nast embed
 275         {
 276             'url': 'http://www.wired.com/2014/04/honda-asimo/',
 277             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
 278             'info_dict': {
 279                 'id': '53501be369702d3275860000',
 280                 'ext': 'mp4',
 281                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
 282             }
 283         },
 284         # Dailymotion embed
 285         {
 286             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
 287             'md5': '441aeeb82eb72c422c7f14ec533999cd',
 288             'info_dict': {
 289                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
 290                 'ext': 'mp4',
 291                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
 292                 'uploader': 'Spi0n',
 293             },
 294             'add_ie': ['Dailymotion'],
 295         },
 296         # YouTube embed
 297         {
 298             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
 299             'info_dict': {
 300                 'id': 'FXRb4ykk4S0',
 301                 'ext': 'mp4',
 302                 'title': 'The NBL Auction 2014',
 303                 'uploader': 'BADMINTON England',
 304                 'uploader_id': 'BADMINTONEvents',
 305                 'upload_date': '20140603',
 306                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
 307             },
 308             'add_ie': ['Youtube'],
 309             'params': {
 310                 'skip_download': True,
 311             }
 312         },
 313         # MTVServices embed
 314         {
 315             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
 316             'md5': '35727f82f58c76d996fc188f9755b0d5',
 317             'info_dict': {
 318                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
 319                 'ext': 'mp4',
 320                 'title': 'Review',
 321                 'description': 'Mario\'s life in the fast lane has never looked so good.',
 322             },
 323         },
 324         # YouTube embed via <data-embed-url="">
 325         {
 326             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
 327             'info_dict': {
 328                 'id': '4vAffPZIT44',
 329                 'ext': 'mp4',
 330                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
 331                 'uploader': 'Gameloft',
 332                 'uploader_id': 'gameloft',
 333                 'upload_date': '20140828',
 334                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
 335             },
 336             'params': {
 337                 'skip_download': True,
 338             }
 339         },
 340         # Camtasia studio
 341         {
 342             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
 343             'playlist': [{
 344                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
 345                 'info_dict': {
 346                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 347                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
 348                     'ext': 'flv',
 349                     'duration': 2235.90,
 350                 }
 351             }, {
 352                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
 353                 'info_dict': {
 354                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
 355                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
 356                     'ext': 'flv',
 357                     'duration': 2235.93,
 358                 }
 359             }],
 360             'info_dict': {
 361                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 362             }
 363         },
 364         # Flowplayer
 365         {
 366             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
 367             'md5': '9d65602bf31c6e20014319c7d07fba27',
 368             'info_dict': {
 369                 'id': '5123ea6d5e5a7',
 370                 'ext': 'mp4',
 371                 'age_limit': 18,
 372                 'uploader': 'www.handjobhub.com',
 373                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
 374             }
 375         },
 376         # RSS feed
 377         {
 378             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 379             'info_dict': {
 380                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 381                 'title': 'Zero Punctuation',
 382                 'description': 're:.*groundbreaking video review series.*'
 383             },
 384             'playlist_mincount': 11,
 385         },
 386         # Multiple brightcove videos
 387         # https://github.com/rg3/youtube-dl/issues/2283
 388         {
 389             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
 390             'info_dict': {
 391                 'id': 'always-never',
 392                 'title': 'Always / Never - The New Yorker',
 393             },
 394             'playlist_count': 3,
 395             'params': {
 396                 'extract_flat': False,
 397                 'skip_download': True,
 398             }
 399         },
 400         # MLB embed
 401         {
 402             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
 403             'md5': '96f09a37e44da40dd083e12d9a683327',
 404             'info_dict': {
 405                 'id': '33322633',
 406                 'ext': 'mp4',
 407                 'title': 'Ump changes call to ball',
 408                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
 409                 'duration': 48,
 410                 'timestamp': 1401537900,
 411                 'upload_date': '20140531',
 412                 'thumbnail': 're:^https?://.*\.jpg$',
 413             },
 414         },
 415         # Wistia embed
 416         {
 417             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
 418             'md5': '8788b683c777a5cf25621eaf286d0c23',
 419             'info_dict': {
 420                 'id': '1cfaf6b7ea',
 421                 'ext': 'mov',
 422                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
 423                 'duration': 643.0,
 424                 'filesize': 182808282,
 425                 'uploader': 'education-portal.com',
 426             },
 427         },
 428         {
 429             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
 430             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
 431             'info_dict': {
 432                 'id': 'uxjb0lwrcz',
 433                 'ext': 'mp4',
 434                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
 435                 'duration': 1715.0,
 436                 'uploader': 'thoughtworks.wistia.com',
 437             },
 438         },
 439         # Direct download with broken HEAD
 440         {
 441             'url': 'http://ai-radio.org:8000/radio.opus',
 442             'info_dict': {
 443                 'id': 'radio',
 444                 'ext': 'opus',
 445                 'title': 'radio',
 446             },
 447             'params': {
 448                 'skip_download': True,  # infinite live stream
 449             },
 450             'expected_warnings': [
 451                 r'501.*Not Implemented'
 452             ],
 453         },
 454         # Soundcloud embed
 455         {
 456             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
 457             'info_dict': {
 458                 'id': '174391317',
 459                 'ext': 'mp3',
 460                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
 461                 'uploader': 'Sophos Security',
 462                 'title': 'Chet Chat 171 - Oct 29, 2014',
 463                 'upload_date': '20141029',
 464             }
 465         },
 466         # Livestream embed
 467         {
 468             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
 469             'info_dict': {
 470                 'id': '67864563',
 471                 'ext': 'flv',
 472                 'upload_date': '20141112',
 473                 'title': 'Rosetta #CometLanding webcast HL 10',
 474             }
 475         },
 476         # LazyYT
 477         {
 478             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
 479             'info_dict': {
 480                 'id': '1986',
 481                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
 482             },
 483             'playlist_mincount': 2,
 484         },
 485         # Direct link with incorrect MIME type
 486         {
 487             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
 488             'md5': '4ccbebe5f36706d85221f204d7eb5913',
 489             'info_dict': {
 490                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
 491                 'id': '5_Lennart_Poettering_-_Systemd',
 492                 'ext': 'webm',
 493                 'title': '5_Lennart_Poettering_-_Systemd',
 494                 'upload_date': '20141120',
 495             },
 496             'expected_warnings': [
 497                 'URL could be a direct video link, returning it as such.'
 498             ]
 499         },
 500         # Cinchcast embed
 501         {
 502             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
 503             'info_dict': {
 504                 'id': '7141703',
 505                 'ext': 'mp3',
 506                 'upload_date': '20141126',
 507                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
 508             }
 509         },
 510         # Cinerama player
 511         {
 512             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
 513             'info_dict': {
 514                 'id': '730m_DandD_1901_512k',
 515                 'ext': 'mp4',
 516                 'uploader': 'www.abc.net.au',
 517                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
 518             }
 519         },
 520         # embedded viddler video
 521         {
 522             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
 523             'info_dict': {
 524                 'id': '4d03aad9',
 525                 'ext': 'mp4',
 526                 'uploader': 'deadspin',
 527                 'title': 'WALL-TO-GORTAT',
 528                 'timestamp': 1422285291,
 529                 'upload_date': '20150126',
 530             },
 531             'add_ie': ['Viddler'],
 532         },
 533         # Libsyn embed
 534         {
 535             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
 536             'info_dict': {
 537                 'id': '3377616',
 538                 'ext': 'mp3',
 539                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
 540                 'description': 'md5:601cb790edd05908957dae8aaa866465',
 541                 'upload_date': '20150220',
 542             },
 543         },
 544         # jwplayer YouTube
 545         {
 546             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
 547             'info_dict': {
 548                 'id': 'Mrj4DVp2zeA',
 549                 'ext': 'mp4',
 550                 'upload_date': '20150212',
 551                 'uploader': 'The National Archives UK',
 552                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
 553                 'uploader_id': 'NationalArchives08',
 554                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
 555             },
 556         },
 557         # rtl.nl embed
 558         {
 559             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
 560             'playlist_mincount': 5,
 561             'info_dict': {
 562                 'id': 'aanslagen-kopenhagen',
 563                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
 564             }
 565         },
 566         # Zapiks embed
 567         {
 568             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
 569             'info_dict': {
 570                 'id': '118046',
 571                 'ext': 'mp4',
 572                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
 573             }
 574         },
 575         # Kaltura embed
 576         {
 577             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
 578             'info_dict': {
 579                 'id': '1_eergr3h1',
 580                 'ext': 'mp4',
 581                 'upload_date': '20150226',
 582                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
 583                 'timestamp': int,
 584                 'title': 'John Carlson Postgame 2/25/15',
 585             },
 586         },
 587         # Eagle.Platform embed (generic URL)
 588         {
 589             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
 590             'info_dict': {
 591                 'id': '227304',
 592                 'ext': 'mp4',
 593                 'title': 'Навальный вышел на свободу',
 594                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
 595                 'thumbnail': 're:^https?://.*\.jpg$',
 596                 'duration': 87,
 597                 'view_count': int,
 598                 'age_limit': 0,
 599             },
 600         },
 601         # ClipYou (Eagle.Platform) embed (custom URL)
 602         {
 603             'url': 'http://muz-tv.ru/play/7129/',
 604             'info_dict': {
 605                 'id': '12820',
 606                 'ext': 'mp4',
 607                 'title': "'O Sole Mio",
 608                 'thumbnail': 're:^https?://.*\.jpg$',
 609                 'duration': 216,
 610                 'view_count': int,
 611             },
 612         },
 613         # Pladform embed
 614         {
 615             'url': 'http://muz-tv.ru/kinozal/view/7400/',
 616             'info_dict': {
 617                 'id': '100183293',
 618                 'ext': 'mp4',
 619                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
 620                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
 621                 'thumbnail': 're:^https?://.*\.jpg$',
 622                 'duration': 694,
 623                 'age_limit': 0,
 624             },
 625         },
 626         # Playwire embed
 627         {
 628             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
 629             'info_dict': {
 630                 'id': '3519514',
 631                 'ext': 'mp4',
 632                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
 633                 'thumbnail': 're:^https?://.*\.png$',
 634                 'duration': 45.115,
 635             },
 636         },
 637         # 5min embed
 638         {
 639             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
 640             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
 641             'info_dict': {
 642                 'id': '518726732',
 643                 'ext': 'mp4',
 644                 'title': 'Facebook Creates "On This Day" | Crunch Report',
 645             },
 646         },
 647         # RSS feed with enclosure
 648         {
 649             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
 650             'info_dict': {
 651                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 652                 'ext': 'm4v',
 653                 'upload_date': '20150228',
 654                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 655             }
 656         },
 657         # Crooks and Liars embed
 658         {
 659             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
 660             'info_dict': {
 661                 'id': '8RUoRhRi',
 662                 'ext': 'mp4',
 663                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
 664                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
 665                 'timestamp': 1428207000,
 666                 'upload_date': '20150405',
 667                 'uploader': 'Heather',
 668             },
 669         },
 670         # Crooks and Liars external embed
 671         {
 672             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
 673             'info_dict': {
 674                 'id': 'MTE3MjUtMzQ2MzA',
 675                 'ext': 'mp4',
 676                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
 677                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
 678                 'timestamp': 1265032391,
 679                 'upload_date': '20100201',
 680                 'uploader': 'Heather',
 681             },
 682         },
 683         # NBC Sports vplayer embed
 684         {
 685             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
 686             'info_dict': {
 687                 'id': 'ln7x1qSThw4k',
 688                 'ext': 'flv',
 689                 'title': "PFT Live: New leader in the 'new-look' defense",
 690                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
 691             },
 692         },
 693         # UDN embed
 694         {
 695             'url': 'http://www.udn.com/news/story/7314/822787',
 696             'md5': 'fd2060e988c326991037b9aff9df21a6',
 697             'info_dict': {
 698                 'id': '300346',
 699                 'ext': 'mp4',
 700                 'title': '中一中男師變性 全校師生力挺',
 701                 'thumbnail': 're:^https?://.*\.jpg$',
 702             }
 703         },
 704         # Ooyala embed
 705         {
 706             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
 707             'info_dict': {
 708                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
 709                 'ext': 'mp4',
 710                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
 711                 'title': 'This is what separates the Excel masters from the wannabes',
 712             },
 713             'params': {
 714                 # m3u8 downloads
 715                 'skip_download': True,
 716             }
 717         },
 718         # Contains a SMIL manifest
 719         {
 720             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
 721             'info_dict': {
 722                 'id': 'file',
 723                 'ext': 'flv',
 724                 'title': '+ Football: Lottery Champions League Europe',
 725                 'uploader': 'www.telewebion.com',
 726             },
 727             'params': {
 728                 # rtmpe downloads
 729                 'skip_download': True,
 730             }
 731         }
 732     ]
 733
 def report_following_redirect(self, new_url):
     """Announce that a redirect to new_url is being followed.

     The notice is formatted as '[redirect] Following redirect to <new_url>'
     and handed to ``self._downloader.to_screen``. Nothing is returned.
     """
     notice = '[redirect] Following redirect to %s' % new_url
     self._downloader.to_screen(notice)
 737
 738     def _extract_rss(self, url, video_id, doc):
 739         playlist_title = doc.find('./channel/title').text
 740         playlist_desc_el = doc.find('./channel/description')
 741         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 742
 743         entries = []
 744         for it in doc.findall('./channel/item'):
 745             next_url = xpath_text(it, 'link', fatal=False)
 746             if not next_url:
 747                 enclosure_nodes = it.findall('./enclosure')
 748                 for e in enclosure_nodes:
 749                     next_url = e.attrib.get('url')
 750                     if next_url:
 751                         break
 752
 753             if not next_url:
 754                 continue
 755
 756             entries.append({
 757                 '_type': 'url',
 758                 'url': next_url,
 759                 'title': it.find('title').text,
 760             })
 761
 762         return {
 763             '_type': 'playlist',
 764             'id': url,
 765             'title': playlist_title,
 766             'description': playlist_desc,
 767             'entries': entries,
 768         }
 769
 770     def _extract_camtasia(self, url, video_id, webpage):
 771         """ Returns None if no camtasia video can be found. """
 772
 773         camtasia_cfg = self._search_regex(
 774             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
 775             webpage, 'camtasia configuration file', default=None)
 776         if camtasia_cfg is None:
 777             return None
 778
 779         title = self._html_search_meta('DC.title', webpage, fatal=True)
 780
 781         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
 782         camtasia_cfg = self._download_xml(
 783             camtasia_url, video_id,
 784             note='Downloading camtasia configuration',
 785             errnote='Failed to download camtasia configuration')
 786         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
 787
 788         entries = []
 789         for n in fileset_node.getchildren():
 790             url_n = n.find('./uri')
 791             if url_n is None:
 792                 continue
 793
 794             entries.append({
 795                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
 796                 'title': '%s - %s' % (title, n.tag),
 797                 'url': compat_urlparse.urljoin(url, url_n.text),
 798                 'duration': float_or_none(n.find('./duration').text),
 799             })
 800
 801         return {
 802             '_type': 'playlist',
 803             'entries': entries,
 804             'title': title,
 805         }
 806
 807     def _real_extract(self, url):
 808         if url.startswith('//'):
 809             return {
 810                 '_type': 'url',
 811                 'url': self.http_scheme() + url,
 812             }
 813
 814         parsed_url = compat_urlparse.urlparse(url)
 815         if not parsed_url.scheme:
 816             default_search = self._downloader.params.get('default_search')
 817             if default_search is None:
 818                 default_search = 'fixup_error'
 819
 820             if default_search in ('auto', 'auto_warning', 'fixup_error'):
 821                 if '/' in url:
 822                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
 823                     return self.url_result('http://' + url)
 824                 elif default_search != 'fixup_error':
 825                     if default_search == 'auto_warning':
 826                         if re.match(r'^(?:url|URL)$', url):
 827                             raise ExtractorError(
 828                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
 829                                 expected=True)
 830                         else:
 831                             self._downloader.report_warning(
 832                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
 833                     return self.url_result('ytsearch:' + url)
 834
 835             if default_search in ('error', 'fixup_error'):
 836                 raise ExtractorError(
 837                     '%r is not a valid URL. '
 838                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
 839                     % (url, url), expected=True)
 840             else:
 841                 if ':' not in default_search:
 842                     default_search += ':'
 843                 return self.url_result(default_search + url)
 844
 845         url, smuggled_data = unsmuggle_url(url)
 846         force_videoid = None
 847         is_intentional = smuggled_data and smuggled_data.get('to_generic')
 848         if smuggled_data and 'force_videoid' in smuggled_data:
 849             force_videoid = smuggled_data['force_videoid']
 850             video_id = force_videoid
 851         else:
 852             video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]
 853
 854         self.to_screen('%s: Requesting header' % video_id)
 855
 856         head_req = HEADRequest(url)
 857         head_response = self._request_webpage(
 858             head_req, video_id,
 859             note=False, errnote='Could not send HEAD request to %s' % url,
 860             fatal=False)
 861
 862         if head_response is not False:
 863             # Check for redirect
 864             new_url = head_response.geturl()
 865             if url != new_url:
 866                 self.report_following_redirect(new_url)
 867                 if force_videoid:
 868                     new_url = smuggle_url(
 869                         new_url, {'force_videoid': force_videoid})
 870                 return self.url_result(new_url)
 871
 872         full_response = None
 873         if head_response is False:
 874             full_response = self._request_webpage(url, video_id)
 875             head_response = full_response
 876
 877         # Check for direct link to a video
 878         content_type = head_response.headers.get('Content-Type', '')
 879         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
 880         if m:
 881             upload_date = unified_strdate(
 882                 head_response.headers.get('Last-Modified'))
 883             return {
 884                 'id': video_id,
 885                 'title': os.path.splitext(url_basename(url))[0],
 886                 'direct': True,
 887                 'formats': [{
 888                     'format_id': m.group('format_id'),
 889                     'url': url,
 890                     'vcodec': 'none' if m.group('type') == 'audio' else None
 891                 }],
 892                 'upload_date': upload_date,
 893             }
 894
 895         if not self._downloader.params.get('test', False) and not is_intentional:
 896             self._downloader.report_warning('Falling back on generic information extractor.')
 897
 898         if not full_response:
 899             full_response = self._request_webpage(url, video_id)
 900
 901         # Maybe it's a direct link to a video?
 902         # Be careful not to download the whole thing!
 903         first_bytes = full_response.read(512)
 904         if not is_html(first_bytes):
 905             self._downloader.report_warning(
 906                 'URL could be a direct video link, returning it as such.')
 907             upload_date = unified_strdate(
 908                 head_response.headers.get('Last-Modified'))
 909             return {
 910                 'id': video_id,
 911                 'title': os.path.splitext(url_basename(url))[0],
 912                 'direct': True,
 913                 'url': url,
 914                 'upload_date': upload_date,
 915             }
 916
 917         webpage = self._webpage_read_content(
 918             full_response, url, video_id, prefix=first_bytes)
 919
 920         self.report_extraction(video_id)
 921
 922         # Is it an RSS feed?
 923         try:
 924             doc = parse_xml(webpage)
 925             if doc.tag == 'rss':
 926                 return self._extract_rss(url, video_id, doc)
 927         except compat_xml_parse_error:
 928             pass
 929
 930         # Is it a Camtasia project?
 931         camtasia_res = self._extract_camtasia(url, video_id, webpage)
 932         if camtasia_res is not None:
 933             return camtasia_res
 934
 935         # Sometimes embedded video player is hidden behind percent encoding
 936         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
 937         # Unescaping the whole page allows to handle those cases in a generic way
 938         webpage = compat_urllib_parse.unquote(webpage)
 939
 940         # it's tempting to parse this further, but you would
 941         # have to take into account all the variations like
 942         #   Video Title - Site Name
 943         #   Site Name | Video Title
 944         #   Video Title - Tagline | Site Name
 945         # and so on and so forth; it's just not practical
 946         video_title = self._html_search_regex(
 947             r'(?s)<title>(.*?)</title>', webpage, 'video title',
 948             default='video')
 949
 950         # Try to detect age limit automatically
 951         age_limit = self._rta_search(webpage)
 952         # And then there are the jokers who advertise that they use RTA,
 953         # but actually don't.
 954         AGE_LIMIT_MARKERS = [
 955             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
 956         ]
 957         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
 958             age_limit = 18
 959
 960         # video uploader is domain name
 961         video_uploader = self._search_regex(
 962             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
 963
 964         # Helper method
 965         def _playlist_from_matches(matches, getter=None, ie=None):
 966             urlrs = orderedSet(
 967                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 968                 for m in matches)
 969             return self.playlist_result(
 970                 urlrs, playlist_id=video_id, playlist_title=video_title)
 971
 972         # Look for BrightCove:
 973         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
 974         if bc_urls:
 975             self.to_screen('Brightcove video detected.')
 976             entries = [{
 977                 '_type': 'url',
 978                 'url': smuggle_url(bc_url, {'Referer': url}),
 979                 'ie_key': 'Brightcove'
 980             } for bc_url in bc_urls]
 981
 982             return {
 983                 '_type': 'playlist',
 984                 'title': video_title,
 985                 'id': video_id,
 986                 'entries': entries,
 987             }
 988
 989         # Look for embedded rtl.nl player
 990         matches = re.findall(
 991             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
 992             webpage)
 993         if matches:
 994             return _playlist_from_matches(matches, ie='RtlNl')
 995
 996         # Look for embedded (iframe) Vimeo player
 997         mobj = re.search(
 998             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
 999         if mobj:
1000             player_url = unescapeHTML(mobj.group('url'))
1001             surl = smuggle_url(player_url, {'Referer': url})
1002             return self.url_result(surl)
1003         # Look for embedded (swf embed) Vimeo player
1004         mobj = re.search(
1005             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1006         if mobj:
1007             return self.url_result(mobj.group(1))
1008
1009         # Look for embedded YouTube player
1010         matches = re.findall(r'''(?x)
1011             (?:
1012                 <iframe[^>]+?src=|
1013                 data-video-url=|
1014                 <embed[^>]+?src=|
1015                 embedSWF\(?:\s*|
1016                 new\s+SWFObject\(
1017             )
1018             (["\'])
1019                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1020                 (?:embed|v|p)/.+?)
1021             \1''', webpage)
1022         if matches:
1023             return _playlist_from_matches(
1024                 matches, lambda m: unescapeHTML(m[1]))
1025
1026         # Look for lazyYT YouTube embed
1027         matches = re.findall(
1028             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1029         if matches:
1030             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1031
1032         # Look for embedded Dailymotion player
1033         matches = re.findall(
1034             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1035         if matches:
1036             return _playlist_from_matches(
1037                 matches, lambda m: unescapeHTML(m[1]))
1038
1039         # Look for embedded Dailymotion playlist player (#3822)
1040         m = re.search(
1041             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1042         if m:
1043             playlists = re.findall(
1044                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1045             if playlists:
1046                 return _playlist_from_matches(
1047                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1048
1049         # Look for embedded Wistia player
1050         match = re.search(
1051             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1052         if match:
1053             embed_url = self._proto_relative_url(
1054                 unescapeHTML(match.group('url')))
1055             return {
1056                 '_type': 'url_transparent',
1057                 'url': embed_url,
1058                 'ie_key': 'Wistia',
1059                 'uploader': video_uploader,
1060                 'title': video_title,
1061                 'id': video_id,
1062             }
1063
1064         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1065         if match:
1066             return {
1067                 '_type': 'url_transparent',
1068                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1069                 'ie_key': 'Wistia',
1070                 'uploader': video_uploader,
1071                 'title': video_title,
1072                 'id': match.group('id')
1073             }
1074
1075         # Look for embedded blip.tv player
1076         mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
1077         if mobj:
1078             return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
1079         mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
1080         if mobj:
1081             return self.url_result(mobj.group(1), 'BlipTV')
1082
1083         # Look for embedded condenast player
1084         matches = re.findall(
1085             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1086             webpage)
1087         if matches:
1088             return {
1089                 '_type': 'playlist',
1090                 'entries': [{
1091                     '_type': 'url',
1092                     'ie_key': 'CondeNast',
1093                     'url': ma,
1094                 } for ma in matches],
1095                 'title': video_title,
1096                 'id': video_id,
1097             }
1098
1099         # Look for Bandcamp pages with custom domain
1100         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1101         if mobj is not None:
1102             burl = unescapeHTML(mobj.group(1))
1103             # Don't set the extractor because it can be a track url or an album
1104             return self.url_result(burl)
1105
1106         # Look for embedded Vevo player
1107         mobj = re.search(
1108             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1109         if mobj is not None:
1110             return self.url_result(mobj.group('url'))
1111
1112         # Look for embedded Viddler player
1113         mobj = re.search(
1114             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1115             webpage)
1116         if mobj is not None:
1117             return self.url_result(mobj.group('url'))
1118
1119         # Look for NYTimes player
1120         mobj = re.search(
1121             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1122             webpage)
1123         if mobj is not None:
1124             return self.url_result(mobj.group('url'))
1125
1126         # Look for Libsyn player
1127         mobj = re.search(
1128             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1129         if mobj is not None:
1130             return self.url_result(mobj.group('url'))
1131
1132         # Look for Ooyala videos
1133         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1134                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1135                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1136                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1137         if mobj is not None:
1138             return OoyalaIE._build_url_result(mobj.group('ec'))
1139
1140         # Look for multiple Ooyala embeds on SBN network websites
1141         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1142         if mobj is not None:
1143             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1144             if embeds:
1145                 return _playlist_from_matches(
1146                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1147
1148         # Look for Aparat videos
1149         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1150         if mobj is not None:
1151             return self.url_result(mobj.group(1), 'Aparat')
1152
1153         # Look for MPORA videos
1154         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1155         if mobj is not None:
1156             return self.url_result(mobj.group(1), 'Mpora')
1157
1158         # Look for embedded NovaMov-based player
1159         mobj = re.search(
1160             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1161                     (?P<url>http://(?:(?:embed|www)\.)?
1162                         (?:novamov\.com|
1163                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1164                            videoweed\.(?:es|com)|
1165                            movshare\.(?:net|sx|ag)|
1166                            divxstage\.(?:eu|net|ch|co|at|ag))
1167                         /embed\.php.+?)\1''', webpage)
1168         if mobj is not None:
1169             return self.url_result(mobj.group('url'))
1170
1171         # Look for embedded Facebook player
1172         mobj = re.search(
1173             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1174         if mobj is not None:
1175             return self.url_result(mobj.group('url'), 'Facebook')
1176
1177         # Look for embedded VK player
1178         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1179         if mobj is not None:
1180             return self.url_result(mobj.group('url'), 'VK')
1181
1182         # Look for embedded ivi player
1183         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1184         if mobj is not None:
1185             return self.url_result(mobj.group('url'), 'Ivi')
1186
1187         # Look for embedded Huffington Post player
1188         mobj = re.search(
1189             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1190         if mobj is not None:
1191             return self.url_result(mobj.group('url'), 'HuffPost')
1192
1193         # Look for embed.ly
1194         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1195         if mobj is not None:
1196             return self.url_result(mobj.group('url'))
1197         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1198         if mobj is not None:
1199             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1200
1201         # Look for funnyordie embed
1202         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1203         if matches:
1204             return _playlist_from_matches(
1205                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1206
1207         # Look for BBC iPlayer embed
1208         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1209         if matches:
1210             return _playlist_from_matches(matches, ie='BBCCoUk')
1211
1212         # Look for embedded RUTV player
1213         rutv_url = RUTVIE._extract_url(webpage)
1214         if rutv_url:
1215             return self.url_result(rutv_url, 'RUTV')
1216
1217         # Look for embedded TED player
1218         mobj = re.search(
1219             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1220         if mobj is not None:
1221             return self.url_result(mobj.group('url'), 'TED')
1222
1223         # Look for embedded Ustream videos
1224         mobj = re.search(
1225             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1226         if mobj is not None:
1227             return self.url_result(mobj.group('url'), 'Ustream')
1228
1229         # Look for embedded arte.tv player
1230         mobj = re.search(
1231             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1232             webpage)
1233         if mobj is not None:
1234             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1235
1236         # Look for embedded smotri.com player
1237         smotri_url = SmotriIE._extract_url(webpage)
1238         if smotri_url:
1239             return self.url_result(smotri_url, 'Smotri')
1240
1241         # Look for embeded soundcloud player
1242         mobj = re.search(
1243             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1244             webpage)
1245         if mobj is not None:
1246             url = unescapeHTML(mobj.group('url'))
1247             return self.url_result(url)
1248
1249         # Look for embedded vulture.com player
1250         mobj = re.search(
1251             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1252             webpage)
1253         if mobj is not None:
1254             url = unescapeHTML(mobj.group('url'))
1255             return self.url_result(url, ie='Vulture')
1256
1257         # Look for embedded mtvservices player
1258         mobj = re.search(
1259             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1260             webpage)
1261         if mobj is not None:
1262             url = unescapeHTML(mobj.group('url'))
1263             return self.url_result(url, ie='MTVServicesEmbedded')
1264
1265         # Look for embedded yahoo player
1266         mobj = re.search(
1267             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1268             webpage)
1269         if mobj is not None:
1270             return self.url_result(mobj.group('url'), 'Yahoo')
1271
1272         # Look for embedded sbs.com.au player
1273         mobj = re.search(
1274             r'''(?x)
1275             (?:
1276                 <meta\s+property="og:video"\s+content=|
1277                 <iframe[^>]+?src=
1278             )
1279             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1280             webpage)
1281         if mobj is not None:
1282             return self.url_result(mobj.group('url'), 'SBS')
1283
1284         # Look for embedded Cinchcast player
1285         mobj = re.search(
1286             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1287             webpage)
1288         if mobj is not None:
1289             return self.url_result(mobj.group('url'), 'Cinchcast')
1290
1291         mobj = re.search(
1292             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1293             webpage)
1294         if mobj is not None:
1295             return self.url_result(mobj.group('url'), 'MLB')
1296
1297         mobj = re.search(
1298             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1299             webpage)
1300         if mobj is not None:
1301             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1302
1303         mobj = re.search(
1304             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1305             webpage)
1306         if mobj is not None:
1307             return self.url_result(mobj.group('url'), 'Livestream')
1308
1309         # Look for Zapiks embed
1310         mobj = re.search(
1311             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1312         if mobj is not None:
1313             return self.url_result(mobj.group('url'), 'Zapiks')
1314
1315         # Look for Kaltura embeds
1316         mobj = re.search(
1317             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1318         if mobj is not None:
1319             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1320
1321         # Look for Eagle.Platform embeds
1322         mobj = re.search(
1323             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1324         if mobj is not None:
1325             return self.url_result(mobj.group('url'), 'EaglePlatform')
1326
1327         # Look for ClipYou (uses Eagle.Platform) embeds
1328         mobj = re.search(
1329             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1330         if mobj is not None:
1331             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1332
1333         # Look for Pladform embeds
1334         mobj = re.search(
1335             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1336         if mobj is not None:
1337             return self.url_result(mobj.group('url'), 'Pladform')
1338
1339         # Look for Playwire embeds
1340         mobj = re.search(
1341             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1342         if mobj is not None:
1343             return self.url_result(mobj.group('url'))
1344
1345         # Look for 5min embeds
1346         mobj = re.search(
1347             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1348         if mobj is not None:
1349             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1350
1351         # Look for Crooks and Liars embeds
1352         mobj = re.search(
1353             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1354         if mobj is not None:
1355             return self.url_result(mobj.group('url'))
1356
1357         # Look for NBC Sports VPlayer embeds
1358         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1359         if nbc_sports_url:
1360             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1361
1362         # Look for UDN embeds
1363         mobj = re.search(
1364             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1365         if mobj is not None:
1366             return self.url_result(
1367                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1368
1369         # Look for Senate ISVP iframe
1370         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1371         if senate_isvp_url:
1372             return self.url_result(surl, 'SenateISVP')
1373
1374         def check_video(vurl):
1375             if YoutubeIE.suitable(vurl):
1376                 return True
1377             vpath = compat_urlparse.urlparse(vurl).path
1378             vext = determine_ext(vpath)
1379             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1380
1381         def filter_video(urls):
1382             return list(filter(check_video, urls))
1383
1384         # Start with something easy: JW Player in SWFObject
1385         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1386         if not found:
1387             # Look for gorilla-vid style embedding
1388             found = filter_video(re.findall(r'''(?sx)
1389                 (?:
1390                     jw_plugins|
1391                     JWPlayerOptions|
1392                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1393                 )
1394                 .*?
1395                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1396         if not found:
1397             # Broaden the search a little bit
1398             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1399         if not found:
1400             # Broaden the findall a little bit: JWPlayer JS loader
1401             found = filter_video(re.findall(
1402                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1403         if not found:
1404             # Flow player
1405             found = filter_video(re.findall(r'''(?xs)
1406                 flowplayer\("[^"]+",\s*
1407                     \{[^}]+?\}\s*,
1408                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1409                         ["']?url["']?\s*:\s*["']([^"']+)["']
1410             ''', webpage))
1411         if not found:
1412             # Cinerama player
1413             found = re.findall(
1414                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1415         if not found:
1416             # Try to find twitter cards info
1417             found = filter_video(re.findall(
1418                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1419         if not found:
1420             # We look for Open Graph info:
1421             # We have to match any number of spaces between elements, since some sites try to align them (e.g. statigr.am)
1422             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
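1423             # We only want to look in og:video if the MIME type is a video, not a Flash player. NOTE(review): re.findall() always returns a list, so the `is not None` check below never skips this lookup.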
1424             if m_video_type is not None:
1425                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1426         if not found:
1427             # HTML5 video
1428             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1429         if not found:
1430             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1431             found = re.search(
1432                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1433                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1434                 webpage)
1435             if not found:
1436                 # Look also in Refresh HTTP header
1437                 refresh_header = head_response.headers.get('Refresh')
1438                 if refresh_header:
1439                     found = re.search(REDIRECT_REGEX, refresh_header)
1440             if found:
1441                 new_url = found.group(1)
1442                 self.report_following_redirect(new_url)
1443                 return {
1444                     '_type': 'url',
1445                     'url': new_url,
1446                 }
1447         if not found:
1448             raise UnsupportedError(url)
1449
1450         entries = []
1451         for video_url in found:
1452             video_url = compat_urlparse.urljoin(url, video_url)
1453             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1454
1455             # Sometimes, jwplayer extraction will result in a YouTube URL
1456             if YoutubeIE.suitable(video_url):
1457                 entries.append(self.url_result(video_url, 'Youtube'))
1458                 continue
1459
1460             # Strip the file extension so that only the base name is used as the video id
1461             video_id = os.path.splitext(video_id)[0]
1462
1463             if determine_ext(video_url) == 'smil':
1464                 entries.append({
1465                     'id': video_id,
1466                     'formats': self._extract_smil_formats(video_url, video_id),
1467                     'uploader': video_uploader,
1468                     'title': video_title,
1469                     'age_limit': age_limit,
1470                 })
1471             else:
1472                 entries.append({
1473                     'id': video_id,
1474                     'url': video_url,
1475                     'uploader': video_uploader,
1476                     'title': video_title,
1477                     'age_limit': age_limit,
1478                 })
1479
1480         if len(entries) == 1:
1481             return entries[0]
1482         else:
1483             for num, e in enumerate(entries, start=1):
1484                 # 'url' results don't have a title
1485                 if e.get('title') is not None:
1486                     e['title'] = '%s (%d)' % (e['title'], num)
1487             return {
1488                 '_type': 'playlist',
1489                 'entries': entries,
1490             }