_ Git - youtube-dl/blob - youtube_dl/extractor/generic.py

   1 # encoding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 import os
   6 import re
   7
   8 from .common import InfoExtractor
   9 from .youtube import YoutubeIE
  10 from ..compat import (
  11     compat_urllib_parse,
  12     compat_urlparse,
  13     compat_xml_parse_error,
  14 )
  15 from ..utils import (
  16     determine_ext,
  17     ExtractorError,
  18     float_or_none,
  19     HEADRequest,
  20     is_html,
  21     orderedSet,
  22     parse_xml,
  23     smuggle_url,
  24     unescapeHTML,
  25     unified_strdate,
  26     unsmuggle_url,
  27     UnsupportedError,
  28     url_basename,
  29     xpath_text,
  30 )
  31 from .brightcove import BrightcoveIE
  32 from .nbc import NBCSportsVPlayerIE
  33 from .ooyala import OoyalaIE
  34 from .rutv import RUTVIE
  35 from .smotri import SmotriIE
  36 from .condenast import CondeNastIE
  37 from .udn import UDNEmbedIE
  38
  39
  40 class GenericIE(InfoExtractor):
  41     IE_DESC = 'Generic downloader that works on some sites'
  42     _VALID_URL = r'.*'
  43     IE_NAME = 'generic'
  44     _TESTS = [
  45         {
  46             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
  47             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
  48             'info_dict': {
  49                 'id': '13601338388002',
  50                 'ext': 'mp4',
  51                 'uploader': 'www.hodiho.fr',
  52                 'title': 'R\u00e9gis plante sa Jeep',
  53             }
  54         },
  55         # bandcamp page with custom domain
  56         {
  57             'add_ie': ['Bandcamp'],
  58             'url': 'http://bronyrock.com/track/the-pony-mash',
  59             'info_dict': {
  60                 'id': '3235767654',
  61                 'ext': 'mp3',
  62                 'title': 'The Pony Mash',
  63                 'uploader': 'M_Pallante',
  64             },
  65             'skip': 'There is a limit of 200 free downloads / month for the test song',
  66         },
  67         # embedded brightcove video
  68         # it also tests brightcove videos that need to set the 'Referer' in the
  69         # http requests
  70         {
  71             'add_ie': ['Brightcove'],
  72             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
  73             'info_dict': {
  74                 'id': '2765128793001',
  75                 'ext': 'mp4',
  76                 'title': 'Le cours de bourse : l’analyse technique',
  77                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
  78                 'uploader': 'BFM BUSINESS',
  79             },
  80             'params': {
  81                 'skip_download': True,
  82             },
  83         },
  84         {
  85             # https://github.com/rg3/youtube-dl/issues/2253
  86             'url': 'http://bcove.me/i6nfkrc3',
  87             'md5': '0ba9446db037002366bab3b3eb30c88c',
  88             'info_dict': {
  89                 'id': '3101154703001',
  90                 'ext': 'mp4',
  91                 'title': 'Still no power',
  92                 'uploader': 'thestar.com',
  93                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
  94             },
  95             'add_ie': ['Brightcove'],
  96         },
  97         {
  98             'url': 'http://www.championat.com/video/football/v/87/87499.html',
  99             'md5': 'fb973ecf6e4a78a67453647444222983',
 100             'info_dict': {
 101                 'id': '3414141473001',
 102                 'ext': 'mp4',
 103                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
 104                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
 105                 'uploader': 'Championat',
 106             },
 107         },
 108         {
 109             # https://github.com/rg3/youtube-dl/issues/3541
 110             'add_ie': ['Brightcove'],
 111             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
 112             'info_dict': {
 113                 'id': '3866516442001',
 114                 'ext': 'mp4',
 115                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
 116                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
 117                 'uploader': 'SBS Broadcasting',
 118             },
 119             'skip': 'Restricted to Netherlands',
 120             'params': {
 121                 'skip_download': True,  # m3u8 download
 122             },
 123         },
 124         # Direct link to a video
 125         {
 126             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
 127             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
 128             'info_dict': {
 129                 'id': 'trailer',
 130                 'ext': 'mp4',
 131                 'title': 'trailer',
 132                 'upload_date': '20100513',
 133             }
 134         },
 135         # ooyala video
 136         {
 137             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
 138             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
 139             'info_dict': {
 140                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
 141                 'ext': 'mp4',
 142                 'title': '2cc213299525360.mov',  # that's what we get
 143             },
 144             'add_ie': ['Ooyala'],
 145         },
 146         # multiple ooyala embeds on SBN network websites
 147         {
 148             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 149             'info_dict': {
 150                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 151                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
 152             },
 153             'playlist_mincount': 3,
 154             'params': {
 155                 'skip_download': True,
 156             },
 157             'add_ie': ['Ooyala'],
 158         },
 159         # google redirect
 160         {
 161             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
 162             'info_dict': {
 163                 'id': 'cmQHVoWB5FY',
 164                 'ext': 'mp4',
 165                 'upload_date': '20130224',
 166                 'uploader_id': 'TheVerge',
 167                 'description': 're:^Chris Ziegler takes a look at the\.*',
 168                 'uploader': 'The Verge',
 169                 'title': 'First Firefox OS phones side-by-side',
 170             },
 171             'params': {
 172                 'skip_download': False,
 173             }
 174         },
 175         # embed.ly video
 176         {
 177             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
 178             'info_dict': {
 179                 'id': '9ODmcdjQcHQ',
 180                 'ext': 'mp4',
 181                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
 182                 'upload_date': '20140225',
 183                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
 184                 'uploader': 'Tested',
 185                 'uploader_id': 'testedcom',
 186             },
 187             # No need to test YoutubeIE here
 188             'params': {
 189                 'skip_download': True,
 190             },
 191         },
 192         # funnyordie embed
 193         {
 194             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
 195             'info_dict': {
 196                 'id': '18e820ec3f',
 197                 'ext': 'mp4',
 198                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
 199                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
 200             },
 201         },
 202         # BBC iPlayer embeds
 203         {
 204             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
 205             'info_dict': {
 206                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
 207             },
 208             'playlist_mincount': 18,
 209         },
 210         # RUTV embed
 211         {
 212             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
 213             'info_dict': {
 214                 'id': '776940',
 215                 'ext': 'mp4',
 216                 'title': 'Охотское море стало целиком российским',
 217                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
 218             },
 219             'params': {
 220                 # m3u8 download
 221                 'skip_download': True,
 222             },
 223         },
 224         # Embedded TED video
 225         {
 226             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
 227             'md5': '65fdff94098e4a607385a60c5177c638',
 228             'info_dict': {
 229                 'id': '1969',
 230                 'ext': 'mp4',
 231                 'title': 'Hidden miracles of the natural world',
 232                 'uploader': 'Louie Schwartzberg',
 233                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
 234             }
 235         },
 236         # Embedded Ustream video
 237         {
 238             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
 239             'md5': '27b99cdb639c9b12a79bca876a073417',
 240             'info_dict': {
 241                 'id': '45734260',
 242                 'ext': 'flv',
 243                 'uploader': 'AU SPA:  The NSA and Privacy',
 244                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
 245             }
 246         },
 247         # nowvideo embed hidden behind percent encoding
 248         {
 249             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
 250             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
 251             'info_dict': {
 252                 'id': '06e53103ca9aa',
 253                 'ext': 'flv',
 254                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
 255                 'description': 'No description',
 256             },
 257         },
 258         # arte embed
 259         {
 260             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
 261             'md5': '7653032cbb25bf6c80d80f217055fa43',
 262             'info_dict': {
 263                 'id': '048195-004_PLUS7-F',
 264                 'ext': 'flv',
 265                 'title': 'X:enius',
 266                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
 267                 'upload_date': '20140320',
 268             },
 269             'params': {
 270                 'skip_download': 'Requires rtmpdump'
 271             }
 272         },
 273         # Condé Nast embed
 274         {
 275             'url': 'http://www.wired.com/2014/04/honda-asimo/',
 276             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
 277             'info_dict': {
 278                 'id': '53501be369702d3275860000',
 279                 'ext': 'mp4',
 280                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
 281             }
 282         },
 283         # Dailymotion embed
 284         {
 285             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
 286             'md5': '441aeeb82eb72c422c7f14ec533999cd',
 287             'info_dict': {
 288                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
 289                 'ext': 'mp4',
 290                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
 291                 'uploader': 'Spi0n',
 292             },
 293             'add_ie': ['Dailymotion'],
 294         },
 295         # YouTube embed
 296         {
 297             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
 298             'info_dict': {
 299                 'id': 'FXRb4ykk4S0',
 300                 'ext': 'mp4',
 301                 'title': 'The NBL Auction 2014',
 302                 'uploader': 'BADMINTON England',
 303                 'uploader_id': 'BADMINTONEvents',
 304                 'upload_date': '20140603',
 305                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
 306             },
 307             'add_ie': ['Youtube'],
 308             'params': {
 309                 'skip_download': True,
 310             }
 311         },
 312         # MTVServices embed
 313         {
 314             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
 315             'md5': '35727f82f58c76d996fc188f9755b0d5',
 316             'info_dict': {
 317                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
 318                 'ext': 'mp4',
 319                 'title': 'Review',
 320                 'description': 'Mario\'s life in the fast lane has never looked so good.',
 321             },
 322         },
 323         # YouTube embed via <data-embed-url="">
 324         {
 325             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
 326             'info_dict': {
 327                 'id': '4vAffPZIT44',
 328                 'ext': 'mp4',
 329                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
 330                 'uploader': 'Gameloft',
 331                 'uploader_id': 'gameloft',
 332                 'upload_date': '20140828',
 333                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
 334             },
 335             'params': {
 336                 'skip_download': True,
 337             }
 338         },
 339         # Camtasia studio
 340         {
 341             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
 342             'playlist': [{
 343                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
 344                 'info_dict': {
 345                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 346                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
 347                     'ext': 'flv',
 348                     'duration': 2235.90,
 349                 }
 350             }, {
 351                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
 352                 'info_dict': {
 353                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
 354                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
 355                     'ext': 'flv',
 356                     'duration': 2235.93,
 357                 }
 358             }],
 359             'info_dict': {
 360                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 361             }
 362         },
 363         # Flowplayer
 364         {
 365             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
 366             'md5': '9d65602bf31c6e20014319c7d07fba27',
 367             'info_dict': {
 368                 'id': '5123ea6d5e5a7',
 369                 'ext': 'mp4',
 370                 'age_limit': 18,
 371                 'uploader': 'www.handjobhub.com',
 372                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
 373             }
 374         },
 375         # RSS feed
 376         {
 377             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 378             'info_dict': {
 379                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 380                 'title': 'Zero Punctuation',
 381                 'description': 're:.*groundbreaking video review series.*'
 382             },
 383             'playlist_mincount': 11,
 384         },
 385         # Multiple brightcove videos
 386         # https://github.com/rg3/youtube-dl/issues/2283
 387         {
 388             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
 389             'info_dict': {
 390                 'id': 'always-never',
 391                 'title': 'Always / Never - The New Yorker',
 392             },
 393             'playlist_count': 3,
 394             'params': {
 395                 'extract_flat': False,
 396                 'skip_download': True,
 397             }
 398         },
 399         # MLB embed
 400         {
 401             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
 402             'md5': '96f09a37e44da40dd083e12d9a683327',
 403             'info_dict': {
 404                 'id': '33322633',
 405                 'ext': 'mp4',
 406                 'title': 'Ump changes call to ball',
 407                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
 408                 'duration': 48,
 409                 'timestamp': 1401537900,
 410                 'upload_date': '20140531',
 411                 'thumbnail': 're:^https?://.*\.jpg$',
 412             },
 413         },
 414         # Wistia embed
 415         {
 416             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
 417             'md5': '8788b683c777a5cf25621eaf286d0c23',
 418             'info_dict': {
 419                 'id': '1cfaf6b7ea',
 420                 'ext': 'mov',
 421                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
 422                 'duration': 643.0,
 423                 'filesize': 182808282,
 424                 'uploader': 'education-portal.com',
 425             },
 426         },
 427         {
 428             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
 429             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
 430             'info_dict': {
 431                 'id': 'uxjb0lwrcz',
 432                 'ext': 'mp4',
 433                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
 434                 'duration': 1715.0,
 435                 'uploader': 'thoughtworks.wistia.com',
 436             },
 437         },
 438         # Direct download with broken HEAD
 439         {
 440             'url': 'http://ai-radio.org:8000/radio.opus',
 441             'info_dict': {
 442                 'id': 'radio',
 443                 'ext': 'opus',
 444                 'title': 'radio',
 445             },
 446             'params': {
 447                 'skip_download': True,  # infinite live stream
 448             },
 449             'expected_warnings': [
 450                 r'501.*Not Implemented'
 451             ],
 452         },
 453         # Soundcloud embed
 454         {
 455             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
 456             'info_dict': {
 457                 'id': '174391317',
 458                 'ext': 'mp3',
 459                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
 460                 'uploader': 'Sophos Security',
 461                 'title': 'Chet Chat 171 - Oct 29, 2014',
 462                 'upload_date': '20141029',
 463             }
 464         },
 465         # Livestream embed
 466         {
 467             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
 468             'info_dict': {
 469                 'id': '67864563',
 470                 'ext': 'flv',
 471                 'upload_date': '20141112',
 472                 'title': 'Rosetta #CometLanding webcast HL 10',
 473             }
 474         },
 475         # LazyYT
 476         {
 477             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
 478             'info_dict': {
 479                 'id': '1986',
 480                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
 481             },
 482             'playlist_mincount': 2,
 483         },
 484         # Direct link with incorrect MIME type
 485         {
 486             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
 487             'md5': '4ccbebe5f36706d85221f204d7eb5913',
 488             'info_dict': {
 489                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
 490                 'id': '5_Lennart_Poettering_-_Systemd',
 491                 'ext': 'webm',
 492                 'title': '5_Lennart_Poettering_-_Systemd',
 493                 'upload_date': '20141120',
 494             },
 495             'expected_warnings': [
 496                 'URL could be a direct video link, returning it as such.'
 497             ]
 498         },
 499         # Cinchcast embed
 500         {
 501             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
 502             'info_dict': {
 503                 'id': '7141703',
 504                 'ext': 'mp3',
 505                 'upload_date': '20141126',
 506                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
 507             }
 508         },
 509         # Cinerama player
 510         {
 511             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
 512             'info_dict': {
 513                 'id': '730m_DandD_1901_512k',
 514                 'ext': 'mp4',
 515                 'uploader': 'www.abc.net.au',
 516                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
 517             }
 518         },
 519         # embedded viddler video
 520         {
 521             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
 522             'info_dict': {
 523                 'id': '4d03aad9',
 524                 'ext': 'mp4',
 525                 'uploader': 'deadspin',
 526                 'title': 'WALL-TO-GORTAT',
 527                 'timestamp': 1422285291,
 528                 'upload_date': '20150126',
 529             },
 530             'add_ie': ['Viddler'],
 531         },
 532         # Libsyn embed
 533         {
 534             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
 535             'info_dict': {
 536                 'id': '3377616',
 537                 'ext': 'mp3',
 538                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
 539                 'description': 'md5:601cb790edd05908957dae8aaa866465',
 540                 'upload_date': '20150220',
 541             },
 542         },
 543         # jwplayer YouTube
 544         {
 545             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
 546             'info_dict': {
 547                 'id': 'Mrj4DVp2zeA',
 548                 'ext': 'mp4',
 549                 'upload_date': '20150212',
 550                 'uploader': 'The National Archives UK',
 551                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
 552                 'uploader_id': 'NationalArchives08',
 553                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
 554             },
 555         },
 556         # rtl.nl embed
 557         {
 558             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
 559             'playlist_mincount': 5,
 560             'info_dict': {
 561                 'id': 'aanslagen-kopenhagen',
 562                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
 563             }
 564         },
 565         # Zapiks embed
 566         {
 567             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
 568             'info_dict': {
 569                 'id': '118046',
 570                 'ext': 'mp4',
 571                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
 572             }
 573         },
 574         # Kaltura embed
 575         {
 576             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
 577             'info_dict': {
 578                 'id': '1_eergr3h1',
 579                 'ext': 'mp4',
 580                 'upload_date': '20150226',
 581                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
 582                 'timestamp': int,
 583                 'title': 'John Carlson Postgame 2/25/15',
 584             },
 585         },
 586         # Eagle.Platform embed (generic URL)
 587         {
 588             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
 589             'info_dict': {
 590                 'id': '227304',
 591                 'ext': 'mp4',
 592                 'title': 'Навальный вышел на свободу',
 593                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
 594                 'thumbnail': 're:^https?://.*\.jpg$',
 595                 'duration': 87,
 596                 'view_count': int,
 597                 'age_limit': 0,
 598             },
 599         },
 600         # ClipYou (Eagle.Platform) embed (custom URL)
 601         {
 602             'url': 'http://muz-tv.ru/play/7129/',
 603             'info_dict': {
 604                 'id': '12820',
 605                 'ext': 'mp4',
 606                 'title': "'O Sole Mio",
 607                 'thumbnail': 're:^https?://.*\.jpg$',
 608                 'duration': 216,
 609                 'view_count': int,
 610             },
 611         },
 612         # Pladform embed
 613         {
 614             'url': 'http://muz-tv.ru/kinozal/view/7400/',
 615             'info_dict': {
 616                 'id': '100183293',
 617                 'ext': 'mp4',
 618                 'title': 'Тайны перевала Дятлова • Тайна перевала Дятлова 1 серия 2 часть',
 619                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
 620                 'thumbnail': 're:^https?://.*\.jpg$',
 621                 'duration': 694,
 622                 'age_limit': 0,
 623             },
 624         },
 625         # 5min embed
 626         {
 627             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
 628             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
 629             'info_dict': {
 630                 'id': '518726732',
 631                 'ext': 'mp4',
 632                 'title': 'Facebook Creates "On This Day" | Crunch Report',
 633             },
 634         },
 635         # RSS feed with enclosure
 636         {
 637             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
 638             'info_dict': {
 639                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 640                 'ext': 'm4v',
 641                 'upload_date': '20150228',
 642                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 643             }
 644         },
 645         # Crooks and Liars embed
 646         {
 647             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
 648             'info_dict': {
 649                 'id': '8RUoRhRi',
 650                 'ext': 'mp4',
 651                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
 652                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
 653                 'timestamp': 1428207000,
 654                 'upload_date': '20150405',
 655                 'uploader': 'Heather',
 656             },
 657         },
 658         # Crooks and Liars external embed
 659         {
 660             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
 661             'info_dict': {
 662                 'id': 'MTE3MjUtMzQ2MzA',
 663                 'ext': 'mp4',
 664                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
 665                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
 666                 'timestamp': 1265032391,
 667                 'upload_date': '20100201',
 668                 'uploader': 'Heather',
 669             },
 670         },
 671         # NBC Sports vplayer embed
 672         {
 673             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
 674             'info_dict': {
 675                 'id': 'ln7x1qSThw4k',
 676                 'ext': 'flv',
 677                 'title': "PFT Live: New leader in the 'new-look' defense",
 678                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
 679             },
 680         },
 681         # UDN embed
 682         {
 683             'url': 'http://www.udn.com/news/story/7314/822787',
 684             'md5': 'fd2060e988c326991037b9aff9df21a6',
 685             'info_dict': {
 686                 'id': '300346',
 687                 'ext': 'mp4',
 688                 'title': '中一中男師變性 全校師生力挺',
 689                 'thumbnail': 're:^https?://.*\.jpg$',
 690             }
 691         },
 692         # Ooyala embed
 693         {
 694             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
 695             'info_dict': {
 696                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
 697                 'ext': 'mp4',
 698                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
 699                 'title': 'This is what separates the Excel masters from the wannabes',
 700             },
 701             'params': {
 702                 # m3u8 downloads
 703                 'skip_download': True,
 704             }
 705         }
 706     ]
 707
 708     def report_following_redirect(self, new_url):
 709         """Report information extraction."""
 710         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 711
 712     def _extract_rss(self, url, video_id, doc):
 713         playlist_title = doc.find('./channel/title').text
 714         playlist_desc_el = doc.find('./channel/description')
 715         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 716
 717         entries = []
 718         for it in doc.findall('./channel/item'):
 719             next_url = xpath_text(it, 'link', fatal=False)
 720             if not next_url:
 721                 enclosure_nodes = it.findall('./enclosure')
 722                 for e in enclosure_nodes:
 723                     next_url = e.attrib.get('url')
 724                     if next_url:
 725                         break
 726
 727             if not next_url:
 728                 continue
 729
 730             entries.append({
 731                 '_type': 'url',
 732                 'url': next_url,
 733                 'title': it.find('title').text,
 734             })
 735
 736         return {
 737             '_type': 'playlist',
 738             'id': url,
 739             'title': playlist_title,
 740             'description': playlist_desc,
 741             'entries': entries,
 742         }
 743
 744     def _extract_camtasia(self, url, video_id, webpage):
 745         """ Returns None if no camtasia video can be found. """
 746
 747         camtasia_cfg = self._search_regex(
 748             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
 749             webpage, 'camtasia configuration file', default=None)
 750         if camtasia_cfg is None:
 751             return None
 752
 753         title = self._html_search_meta('DC.title', webpage, fatal=True)
 754
 755         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
 756         camtasia_cfg = self._download_xml(
 757             camtasia_url, video_id,
 758             note='Downloading camtasia configuration',
 759             errnote='Failed to download camtasia configuration')
 760         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
 761
 762         entries = []
 763         for n in fileset_node.getchildren():
 764             url_n = n.find('./uri')
 765             if url_n is None:
 766                 continue
 767
 768             entries.append({
 769                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
 770                 'title': '%s - %s' % (title, n.tag),
 771                 'url': compat_urlparse.urljoin(url, url_n.text),
 772                 'duration': float_or_none(n.find('./duration').text),
 773             })
 774
 775         return {
 776             '_type': 'playlist',
 777             'entries': entries,
 778             'title': title,
 779         }
 780
    def _real_extract(self, url):
        """Extract media from an arbitrary URL using a cascade of heuristics.

        The stages run in this order and the first one that succeeds
        returns:

        1. Protocol-relative URLs ('//...') are prefixed with
           self.http_scheme() and handed back as a 'url' result.
        2. Inputs without a scheme are handled according to the
           'default_search' downloader parameter (retry with http://,
           YouTube search, custom search prefix, or an error).
        3. A HEAD request is sent; a redirect is followed by returning a
           url result for the new location.  If the HEAD request fails a
           regular request is made instead.
        4. An audio/video (or application/ogg) Content-Type, or a body whose
           first 512 bytes are not HTML, is returned as a direct link.
        5. The page is tried as an RSS feed and as a Camtasia project.
        6. The percent-decoded page is probed for a long list of known
           embedded players; each match delegates to another extractor.
        7. Generic player configurations (JW Player, Flowplayer, Twitter
           cards, Open Graph, HTML5 <video>) and meta/HTTP refresh
           redirects are tried last.

        Returns an info dict: a single video, a 'url'/'url_transparent'
        reference or a 'playlist'.

        Raises ExtractorError for scheme-less input that may not be
        searched for, and UnsupportedError when nothing is found.
        """
        if url.startswith('//'):
            return {
                '_type': 'url',
                'url': self.http_scheme() + url,
            }

        parsed_url = compat_urlparse.urlparse(url)
        if not parsed_url.scheme:
            # No scheme: the input is either a bare host/path or a search term
            default_search = self._downloader.params.get('default_search')
            if default_search is None:
                default_search = 'fixup_error'

            if default_search in ('auto', 'auto_warning', 'fixup_error'):
                if '/' in url:
                    # Contains a slash, so treat it as a URL missing its protocol
                    self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
                    return self.url_result('http://' + url)
                elif default_search != 'fixup_error':
                    if default_search == 'auto_warning':
                        # The literal word "url" means the user copied a usage
                        # example verbatim
                        if re.match(r'^(?:url|URL)$', url):
                            raise ExtractorError(
                                'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
                                expected=True)
                        else:
                            self._downloader.report_warning(
                                'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
                    return self.url_result('ytsearch:' + url)

            if default_search in ('error', 'fixup_error'):
                raise ExtractorError(
                    '%r is not a valid URL. '
                    'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
                    % (url, url), expected=True)
            else:
                # Any other value is used as a search prefix ("prefix:query")
                if ':' not in default_search:
                    default_search += ':'
                return self.url_result(default_search + url)

        url, smuggled_data = unsmuggle_url(url)
        force_videoid = None
        # The 'to_generic' smuggled flag suppresses the fallback warning below
        is_intentional = smuggled_data and smuggled_data.get('to_generic')
        if smuggled_data and 'force_videoid' in smuggled_data:
            force_videoid = smuggled_data['force_videoid']
            video_id = force_videoid
        else:
            # Last path component of the URL without its extension
            video_id = os.path.splitext(url.rstrip('/').split('/')[-1])[0]

        self.to_screen('%s: Requesting header' % video_id)

        head_req = HEADRequest(url)
        head_response = self._request_webpage(
            head_req, video_id,
            note=False, errnote='Could not send HEAD request to %s' % url,
            fatal=False)

        if head_response is not False:
            # Check for redirect
            new_url = head_response.geturl()
            if url != new_url:
                self.report_following_redirect(new_url)
                if force_videoid:
                    # Carry the forced id over to the redirect target
                    new_url = smuggle_url(
                        new_url, {'force_videoid': force_videoid})
                return self.url_result(new_url)

        full_response = None
        if head_response is False:
            # HEAD failed: fall back to a regular request and use its headers
            full_response = self._request_webpage(url, video_id)
            head_response = full_response

        # Check for direct link to a video
        content_type = head_response.headers.get('Content-Type', '')
        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
        if m:
            upload_date = unified_strdate(
                head_response.headers.get('Last-Modified'))
            return {
                'id': video_id,
                'title': os.path.splitext(url_basename(url))[0],
                'direct': True,
                'formats': [{
                    'format_id': m.group('format_id'),
                    'url': url,
                    'vcodec': 'none' if m.group('type') == 'audio' else None
                }],
                'upload_date': upload_date,
            }

        if not self._downloader.params.get('test', False) and not is_intentional:
            self._downloader.report_warning('Falling back on generic information extractor.')

        if not full_response:
            full_response = self._request_webpage(url, video_id)

        # Maybe it's a direct link to a video?
        # Be careful not to download the whole thing!
        first_bytes = full_response.read(512)
        if not is_html(first_bytes):
            self._downloader.report_warning(
                'URL could be a direct video link, returning it as such.')
            upload_date = unified_strdate(
                head_response.headers.get('Last-Modified'))
            return {
                'id': video_id,
                'title': os.path.splitext(url_basename(url))[0],
                'direct': True,
                'url': url,
                'upload_date': upload_date,
            }

        # Read the rest of the body; the 512 bytes already consumed are
        # passed along as prefix
        webpage = self._webpage_read_content(
            full_response, url, video_id, prefix=first_bytes)

        self.report_extraction(video_id)

        # Is it an RSS feed?
        try:
            doc = parse_xml(webpage)
            if doc.tag == 'rss':
                return self._extract_rss(url, video_id, doc)
        except compat_xml_parse_error:
            # Not well-formed XML, so carry on treating the page as HTML
            pass

        # Is it a Camtasia project?
        camtasia_res = self._extract_camtasia(url, video_id, webpage)
        if camtasia_res is not None:
            return camtasia_res

        # Sometimes embedded video player is hidden behind percent encoding
        # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
        # Unescaping the whole page allows to handle those cases in a generic way
        webpage = compat_urllib_parse.unquote(webpage)

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(
            r'(?s)<title>(.*?)</title>', webpage, 'video title',
            default='video')

        # Try to detect age limit automatically
        age_limit = self._rta_search(webpage)
        # And then there are the jokers who advertise that they use RTA,
        # but actually don't.
        AGE_LIMIT_MARKERS = [
            r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
        ]
        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
            age_limit = 18

        # video uploader is domain name
        # NOTE(review): for a URL without any path (e.g. 'http://host') the
        # optional scheme group can be skipped and 'http:' is captured
        # instead of the host - confirm whether that matters to callers.
        video_uploader = self._search_regex(
            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')

        # Helper method: turn regex matches into a playlist of url results.
        # getter (optional) maps a match to its URL, ie (optional) names the
        # extractor to use; duplicates are dropped via orderedSet.
        def _playlist_from_matches(matches, getter=None, ie=None):
            urlrs = orderedSet(
                self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
                for m in matches)
            return self.playlist_result(
                urlrs, playlist_id=video_id, playlist_title=video_title)

        # Look for BrightCove:
        bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
        if bc_urls:
            self.to_screen('Brightcove video detected.')
            # The page URL is smuggled along as Referer
            entries = [{
                '_type': 'url',
                'url': smuggle_url(bc_url, {'Referer': url}),
                'ie_key': 'Brightcove'
            } for bc_url in bc_urls]

            return {
                '_type': 'playlist',
                'title': video_title,
                'id': video_id,
                'entries': entries,
            }

        # Look for embedded rtl.nl player
        matches = re.findall(
            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+video_embed[^"]+)"',
            webpage)
        if matches:
            return _playlist_from_matches(matches, ie='RtlNl')

        # Look for embedded (iframe) Vimeo player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
        if mobj:
            player_url = unescapeHTML(mobj.group('url'))
            surl = smuggle_url(player_url, {'Referer': url})
            return self.url_result(surl)
        # Look for embedded (swf embed) Vimeo player
        mobj = re.search(
            r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
        if mobj:
            return self.url_result(mobj.group(1))

        # Look for embedded YouTube player
        # NOTE(review): the 'embedSWF\(?:\s*' alternative reads as an optional
        # '(' followed by a literal ':'; a non-capturing group may have been
        # intended - confirm before changing.
        # findall yields (quote, url) tuples here, hence m[1] below.
        matches = re.findall(r'''(?x)
            (?:
                <iframe[^>]+?src=|
                data-video-url=|
                <embed[^>]+?src=|
                embedSWF\(?:\s*|
                new\s+SWFObject\(
            )
            (["\'])
                (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
                (?:embed|v|p)/.+?)
            \1''', webpage)
        if matches:
            return _playlist_from_matches(
                matches, lambda m: unescapeHTML(m[1]))

        # Look for lazyYT YouTube embed
        matches = re.findall(
            r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
        if matches:
            return _playlist_from_matches(matches, lambda m: unescapeHTML(m))

        # Look for embedded Dailymotion player
        matches = re.findall(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
        if matches:
            return _playlist_from_matches(
                matches, lambda m: unescapeHTML(m[1]))

        # Look for embedded Dailymotion playlist player (#3822)
        m = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
        if m:
            # The jukebox URL lists its playlists as list[]=/playlist/<id>/
            playlists = re.findall(
                r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
            if playlists:
                return _playlist_from_matches(
                    playlists, lambda p: '//dailymotion.com/playlist/%s' % p)

        # Look for embedded Wistia player
        match = re.search(
            r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
        if match:
            embed_url = self._proto_relative_url(
                unescapeHTML(match.group('url')))
            return {
                '_type': 'url_transparent',
                'url': embed_url,
                'ie_key': 'Wistia',
                'uploader': video_uploader,
                'title': video_title,
                'id': video_id,
            }

        # Wistia embeds that only expose the media id
        match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
        if match:
            return {
                '_type': 'url_transparent',
                'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
                'ie_key': 'Wistia',
                'uploader': video_uploader,
                'title': video_title,
                'id': match.group('id')
            }

        # Look for embedded blip.tv player
        mobj = re.search(r'<meta\s[^>]*https?://api\.blip\.tv/\w+/redirect/\w+/(\d+)', webpage)
        if mobj:
            return self.url_result('http://blip.tv/a/a-' + mobj.group(1), 'BlipTV')
        mobj = re.search(r'<(?:iframe|embed|object)\s[^>]*(https?://(?:\w+\.)?blip\.tv/(?:play/|api\.swf#)[a-zA-Z0-9_]+)', webpage)
        if mobj:
            return self.url_result(mobj.group(1), 'BlipTV')

        # Look for embedded condenast player
        # NOTE(review): the closing double quote sits inside the capture
        # group, so every matched URL ends with '"' - looks unintended,
        # confirm against the CondeNast extractor before changing.
        matches = re.findall(
            r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
            webpage)
        if matches:
            return {
                '_type': 'playlist',
                'entries': [{
                    '_type': 'url',
                    'ie_key': 'CondeNast',
                    'url': ma,
                } for ma in matches],
                'title': video_title,
                'id': video_id,
            }

        # Look for Bandcamp pages with custom domain
        mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
        if mobj is not None:
            burl = unescapeHTML(mobj.group(1))
            # Don't set the extractor because it can be a track url or an album
            return self.url_result(burl)

        # Look for embedded Vevo player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))

        # Look for embedded Viddler player
        mobj = re.search(
            r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))

        # Look for NYTimes player
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))

        # Look for Libsyn player
        mobj = re.search(
            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))

        # Look for Ooyala videos (the patterns are tried in order, first hit wins)
        mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
        if mobj is not None:
            return OoyalaIE._build_url_result(mobj.group('ec'))

        # Look for multiple Ooyala embeds on SBN network websites
        mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
        if mobj is not None:
            # Non-fatal: on a parse failure fall through to the next probes
            embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
            if embeds:
                return _playlist_from_matches(
                    embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')

        # Look for Aparat videos
        mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group(1), 'Aparat')

        # Look for MPORA videos
        mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group(1), 'Mpora')

        # Look for embedded NovaMov-based player
        mobj = re.search(
            r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
                    (?P<url>http://(?:(?:embed|www)\.)?
                        (?:novamov\.com|
                           nowvideo\.(?:ch|sx|eu|at|ag|co)|
                           videoweed\.(?:es|com)|
                           movshare\.(?:net|sx|ag)|
                           divxstage\.(?:eu|net|ch|co|at|ag))
                        /embed\.php.+?)\1''', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))

        # Look for embedded Facebook player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Facebook')

        # Look for embedded VK player
        mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'VK')

        # Look for embedded ivi player
        mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Ivi')

        # Look for embedded Huffington Post player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'HuffPost')

        # Look for embed.ly
        # NOTE(review): '[^>]' (no quantifier) matches exactly one character
        # between the class attribute and href/src in both patterns below;
        # '[^>]*' may have been intended - confirm before changing.
        mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))
        mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
        if mobj is not None:
            return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))

        # Look for funnyordie embed
        matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
        if matches:
            return _playlist_from_matches(
                matches, getter=unescapeHTML, ie='FunnyOrDie')

        # Look for BBC iPlayer embed
        matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
        if matches:
            return _playlist_from_matches(matches, ie='BBCCoUk')

        # Look for embedded RUTV player
        rutv_url = RUTVIE._extract_url(webpage)
        if rutv_url:
            return self.url_result(rutv_url, 'RUTV')

        # Look for embedded TED player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'TED')

        # Look for embedded Ustream videos
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Ustream')

        # Look for embedded arte.tv player
        mobj = re.search(
            r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'ArteTVEmbed')

        # Look for embedded smotri.com player
        smotri_url = SmotriIE._extract_url(webpage)
        if smotri_url:
            return self.url_result(smotri_url, 'Smotri')

        # Look for embedded soundcloud player
        mobj = re.search(
            r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
            webpage)
        if mobj is not None:
            url = unescapeHTML(mobj.group('url'))
            return self.url_result(url)

        # Look for embedded vulture.com player
        mobj = re.search(
            r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
            webpage)
        if mobj is not None:
            url = unescapeHTML(mobj.group('url'))
            return self.url_result(url, ie='Vulture')

        # Look for embedded mtvservices player
        mobj = re.search(
            r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
            webpage)
        if mobj is not None:
            url = unescapeHTML(mobj.group('url'))
            return self.url_result(url, ie='MTVServicesEmbedded')

        # Look for embedded yahoo player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Yahoo')

        # Look for embedded sbs.com.au player
        mobj = re.search(
            r'''(?x)
            (?:
                <meta\s+property="og:video"\s+content=|
                <iframe[^>]+?src=
            )
            (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'SBS')

        # Look for embedded Cinchcast player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Cinchcast')

        # Look for embedded MLB player
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'MLB')

        # Look for CondeNast embeds matching the extractor's own EMBED_URL pattern
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
            webpage)
        if mobj is not None:
            return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')

        # Look for embedded Livestream player
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
            webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Livestream')

        # Look for Zapiks embed
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Zapiks')

        # Look for Kaltura embeds
        mobj = re.search(
            r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
        if mobj is not None:
            return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')

        # Look for Eagle.Platform embeds
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'EaglePlatform')

        # Look for ClipYou (uses Eagle.Platform) embeds
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
        if mobj is not None:
            return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')

        # Look for Pladform embeds
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'), 'Pladform')

        # Look for 5min embeds
        mobj = re.search(
            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
        if mobj is not None:
            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')

        # Look for Crooks and Liars embeds
        mobj = re.search(
            r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
        if mobj is not None:
            return self.url_result(mobj.group('url'))

        # Look for NBC Sports VPlayer embeds
        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
        if nbc_sports_url:
            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')

        # Look for UDN embeds
        mobj = re.search(
            r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
        if mobj is not None:
            return self.url_result(
                compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')

        # Accept a candidate URL if YouTube can handle it, or if its path has
        # an extension that is not a known flash/image/subtitle one
        def check_video(vurl):
            if YoutubeIE.suitable(vurl):
                return True
            vpath = compat_urlparse.urlparse(vurl).path
            vext = determine_ext(vpath)
            return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')

        # Keep only the candidates accepted by check_video
        def filter_video(urls):
            return list(filter(check_video, urls))

        # From here on each probe only runs while 'found' is still empty

        # Start with something easy: JW Player in SWFObject
        found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
        if not found:
            # Look for gorilla-vid style embedding
            found = filter_video(re.findall(r'''(?sx)
                (?:
                    jw_plugins|
                    JWPlayerOptions|
                    jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
                )
                .*?
                ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
        if not found:
            # Broaden the search a little bit
            found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
        if not found:
            # Broaden the findall a little bit: JWPlayer JS loader
            found = filter_video(re.findall(
                r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
        if not found:
            # Flow player
            found = filter_video(re.findall(r'''(?xs)
                flowplayer\("[^"]+",\s*
                    \{[^}]+?\}\s*,
                    \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
                        ["']?url["']?\s*:\s*["']([^"']+)["']
            ''', webpage))
        if not found:
            # Cinerama player (results are not passed through filter_video)
            found = re.findall(
                r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
        if not found:
            # Try to find twitter cards info
            found = filter_video(re.findall(
                r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
        if not found:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            # NOTE(review): re.findall always returns a list, so this check
            # is always true and og:video is consulted regardless of the
            # declared type; an emptiness test was probably intended -
            # confirm before changing.
            if m_video_type is not None:
                found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
        if not found:
            # HTML5 video (results are not passed through filter_video)
            found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
        if not found:
            # Last resort: a <meta http-equiv="refresh"> redirect.  Note that
            # 'found' holds a match object (not a list) inside this branch.
            REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
            found = re.search(
                r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
                r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
                webpage)
            if not found:
                # Look also in Refresh HTTP header
                refresh_header = head_response.headers.get('Refresh')
                if refresh_header:
                    found = re.search(REDIRECT_REGEX, refresh_header)
            if found:
                new_url = found.group(1)
                self.report_following_redirect(new_url)
                return {
                    '_type': 'url',
                    'url': new_url,
                }
        if not found:
            raise UnsupportedError(url)

        entries = []
        for video_url in found:
            # Candidates may be relative to the page URL
            video_url = compat_urlparse.urljoin(url, video_url)
            video_id = compat_urllib_parse.unquote(os.path.basename(video_url))

            # Sometimes, jwplayer extraction will result in a YouTube URL
            if YoutubeIE.suitable(video_url):
                entries.append(self.url_result(video_url, 'Youtube'))
                continue

            # Drop the file extension from the id
            video_id = os.path.splitext(video_id)[0]

            entries.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'title': video_title,
                'age_limit': age_limit,
            })

        if len(entries) == 1:
            return entries[0]
        else:
            # Several candidates: number the titles and return a playlist
            for num, e in enumerate(entries, start=1):
                # 'url' results don't have a title
                if e.get('title') is not None:
                    e['title'] = '%s (%d)' % (e['title'], num)
            return {
                '_type': 'playlist',
                'entries': entries,
            }