git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/generic.py

   1 # encoding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5 import os
   6 import re
   7
   8 from .common import InfoExtractor
   9 from .youtube import YoutubeIE
  10 from ..compat import (
  11     compat_urllib_parse,
  12     compat_urllib_parse_unquote,
  13     compat_urllib_request,
  14     compat_urlparse,
  15     compat_xml_parse_error,
  16 )
  17 from ..utils import (
  18     determine_ext,
  19     ExtractorError,
  20     float_or_none,
  21     HEADRequest,
  22     is_html,
  23     orderedSet,
  24     parse_xml,
  25     smuggle_url,
  26     unescapeHTML,
  27     unified_strdate,
  28     unsmuggle_url,
  29     UnsupportedError,
  30     url_basename,
  31     xpath_text,
  32 )
  33 from .brightcove import BrightcoveIE
  34 from .nbc import NBCSportsVPlayerIE
  35 from .ooyala import OoyalaIE
  36 from .rutv import RUTVIE
  37 from .sportbox import SportBoxEmbedIE
  38 from .smotri import SmotriIE
  39 from .condenast import CondeNastIE
  40 from .udn import UDNEmbedIE
  41 from .senateisvp import SenateISVPIE
  42 from .bliptv import BlipTVIE
  43 from .svt import SVTIE
  44
  45
  46 class GenericIE(InfoExtractor):
  47     IE_DESC = 'Generic downloader that works on some sites'
  48     _VALID_URL = r'.*'
  49     IE_NAME = 'generic'
  50     _TESTS = [
  51         # Direct link to a video
  52         {
  53             'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4',
  54             'md5': '67d406c2bcb6af27fa886f31aa934bbe',
  55             'info_dict': {
  56                 'id': 'trailer',
  57                 'ext': 'mp4',
  58                 'title': 'trailer',
  59                 'upload_date': '20100513',
  60             }
  61         },
  62         # Direct link to media delivered compressed (until Accept-Encoding is *)
  63         {
  64             'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac',
  65             'md5': '128c42e68b13950268b648275386fc74',
  66             'info_dict': {
  67                 'id': 'FictionJunction-Parallel_Hearts',
  68                 'ext': 'flac',
  69                 'title': 'FictionJunction-Parallel_Hearts',
  70                 'upload_date': '20140522',
  71             },
  72             'expected_warnings': [
  73                 'URL could be a direct video link, returning it as such.'
  74             ]
  75         },
  76         # Direct download with broken HEAD
  77         {
  78             'url': 'http://ai-radio.org:8000/radio.opus',
  79             'info_dict': {
  80                 'id': 'radio',
  81                 'ext': 'opus',
  82                 'title': 'radio',
  83             },
  84             'params': {
  85                 'skip_download': True,  # infinite live stream
  86             },
  87             'expected_warnings': [
  88                 r'501.*Not Implemented'
  89             ],
  90         },
  91         # Direct link with incorrect MIME type
  92         {
  93             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
  94             'md5': '4ccbebe5f36706d85221f204d7eb5913',
  95             'info_dict': {
  96                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm',
  97                 'id': '5_Lennart_Poettering_-_Systemd',
  98                 'ext': 'webm',
  99                 'title': '5_Lennart_Poettering_-_Systemd',
 100                 'upload_date': '20141120',
 101             },
 102             'expected_warnings': [
 103                 'URL could be a direct video link, returning it as such.'
 104             ]
 105         },
 106         # RSS feed
 107         {
 108             'url': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 109             'info_dict': {
 110                 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml',
 111                 'title': 'Zero Punctuation',
 112                 'description': 're:.*groundbreaking video review series.*'
 113             },
 114             'playlist_mincount': 11,
 115         },
 116         # RSS feed with enclosure
 117         {
 118             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
 119             'info_dict': {
 120                 'id': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 121                 'ext': 'm4v',
 122                 'upload_date': '20150228',
 123                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
 124             }
 125         },
 126         # google redirect
 127         {
 128             'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
 129             'info_dict': {
 130                 'id': 'cmQHVoWB5FY',
 131                 'ext': 'mp4',
 132                 'upload_date': '20130224',
 133                 'uploader_id': 'TheVerge',
 134                 'description': 're:^Chris Ziegler takes a look at the\.*',
 135                 'uploader': 'The Verge',
 136                 'title': 'First Firefox OS phones side-by-side',
 137             },
 138             'params': {
 139                 'skip_download': False,
 140             }
 141         },
 142         {
 143             'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
 144             'md5': '85b90ccc9d73b4acd9138d3af4c27f89',
 145             'info_dict': {
 146                 'id': '13601338388002',
 147                 'ext': 'mp4',
 148                 'uploader': 'www.hodiho.fr',
 149                 'title': 'R\u00e9gis plante sa Jeep',
 150             }
 151         },
 152         # bandcamp page with custom domain
 153         {
 154             'add_ie': ['Bandcamp'],
 155             'url': 'http://bronyrock.com/track/the-pony-mash',
 156             'info_dict': {
 157                 'id': '3235767654',
 158                 'ext': 'mp3',
 159                 'title': 'The Pony Mash',
 160                 'uploader': 'M_Pallante',
 161             },
 162             'skip': 'There is a limit of 200 free downloads / month for the test song',
 163         },
 164         # embedded brightcove video
 165         # it also tests brightcove videos that need to set the 'Referer' in the
 166         # http requests
 167         {
 168             'add_ie': ['Brightcove'],
 169             'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/',
 170             'info_dict': {
 171                 'id': '2765128793001',
 172                 'ext': 'mp4',
 173                 'title': 'Le cours de bourse : l’analyse technique',
 174                 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9',
 175                 'uploader': 'BFM BUSINESS',
 176             },
 177             'params': {
 178                 'skip_download': True,
 179             },
 180         },
 181         {
 182             # https://github.com/rg3/youtube-dl/issues/2253
 183             'url': 'http://bcove.me/i6nfkrc3',
 184             'md5': '0ba9446db037002366bab3b3eb30c88c',
 185             'info_dict': {
 186                 'id': '3101154703001',
 187                 'ext': 'mp4',
 188                 'title': 'Still no power',
 189                 'uploader': 'thestar.com',
 190                 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.',
 191             },
 192             'add_ie': ['Brightcove'],
 193         },
 194         {
 195             'url': 'http://www.championat.com/video/football/v/87/87499.html',
 196             'md5': 'fb973ecf6e4a78a67453647444222983',
 197             'info_dict': {
 198                 'id': '3414141473001',
 199                 'ext': 'mp4',
 200                 'title': 'Видео. Удаление Дзагоева (ЦСКА)',
 201                 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"',
 202                 'uploader': 'Championat',
 203             },
 204         },
 205         {
 206             # https://github.com/rg3/youtube-dl/issues/3541
 207             'add_ie': ['Brightcove'],
 208             'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1',
 209             'info_dict': {
 210                 'id': '3866516442001',
 211                 'ext': 'mp4',
 212                 'title': 'Leer mij vrouwen kennen: Aflevering 1',
 213                 'description': 'Leer mij vrouwen kennen: Aflevering 1',
 214                 'uploader': 'SBS Broadcasting',
 215             },
 216             'skip': 'Restricted to Netherlands',
 217             'params': {
 218                 'skip_download': True,  # m3u8 download
 219             },
 220         },
 221         # ooyala video
 222         {
 223             'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219',
 224             'md5': '166dd577b433b4d4ebfee10b0824d8ff',
 225             'info_dict': {
 226                 'id': 'BwY2RxaTrTkslxOfcan0UCf0YqyvWysJ',
 227                 'ext': 'mp4',
 228                 'title': '2cc213299525360.mov',  # that's what we get
 229             },
 230             'add_ie': ['Ooyala'],
 231         },
 232         # multiple ooyala embeds on SBN network websites
 233         {
 234             'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 235             'info_dict': {
 236                 'id': 'national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
 237                 'title': '25 lies you will tell yourself on National Signing Day - SBNation.com',
 238             },
 239             'playlist_mincount': 3,
 240             'params': {
 241                 'skip_download': True,
 242             },
 243             'add_ie': ['Ooyala'],
 244         },
 245         # embed.ly video
 246         {
 247             'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
 248             'info_dict': {
 249                 'id': '9ODmcdjQcHQ',
 250                 'ext': 'mp4',
 251                 'title': 'Tested: Grinding Coffee at 2000 Frames Per Second',
 252                 'upload_date': '20140225',
 253                 'description': 'md5:06a40fbf30b220468f1e0957c0f558ff',
 254                 'uploader': 'Tested',
 255                 'uploader_id': 'testedcom',
 256             },
 257             # No need to test YoutubeIE here
 258             'params': {
 259                 'skip_download': True,
 260             },
 261         },
 262         # funnyordie embed
 263         {
 264             'url': 'http://www.theguardian.com/world/2014/mar/11/obama-zach-galifianakis-between-two-ferns',
 265             'info_dict': {
 266                 'id': '18e820ec3f',
 267                 'ext': 'mp4',
 268                 'title': 'Between Two Ferns with Zach Galifianakis: President Barack Obama',
 269                 'description': 'Episode 18: President Barack Obama sits down with Zach Galifianakis for his most memorable interview yet.',
 270             },
 271         },
 272         # BBC iPlayer embeds
 273         {
 274             'url': 'http://www.bbc.co.uk/blogs/adamcurtis/posts/BUGGER',
 275             'info_dict': {
 276                 'title': 'BBC - Blogs -  Adam Curtis - BUGGER',
 277             },
 278             'playlist_mincount': 18,
 279         },
 280         # RUTV embed
 281         {
 282             'url': 'http://www.rg.ru/2014/03/15/reg-dfo/anklav-anons.html',
 283             'info_dict': {
 284                 'id': '776940',
 285                 'ext': 'mp4',
 286                 'title': 'Охотское море стало целиком российским',
 287                 'description': 'md5:5ed62483b14663e2a95ebbe115eb8f43',
 288             },
 289             'params': {
 290                 # m3u8 download
 291                 'skip_download': True,
 292             },
 293         },
 294         # SportBox embed
 295         {
 296             'url': 'http://www.vestifinance.ru/articles/25753',
 297             'info_dict': {
 298                 'id': '25753',
 299                 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"',
 300             },
 301             'playlist': [{
 302                 'info_dict': {
 303                     'id': '370908',
 304                     'title': 'Госзаказ. День 3',
 305                     'ext': 'mp4',
 306                 }
 307             }, {
 308                 'info_dict': {
 309                     'id': '370905',
 310                     'title': 'Госзаказ. День 2',
 311                     'ext': 'mp4',
 312                 }
 313             }, {
 314                 'info_dict': {
 315                     'id': '370902',
 316                     'title': 'Госзаказ. День 1',
 317                     'ext': 'mp4',
 318                 }
 319             }],
 320             'params': {
 321                 # m3u8 download
 322                 'skip_download': True,
 323             },
 324         },
 325         # Embedded TED video
 326         {
 327             'url': 'http://en.support.wordpress.com/videos/ted-talks/',
 328             'md5': '65fdff94098e4a607385a60c5177c638',
 329             'info_dict': {
 330                 'id': '1969',
 331                 'ext': 'mp4',
 332                 'title': 'Hidden miracles of the natural world',
 333                 'uploader': 'Louie Schwartzberg',
 334                 'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
 335             }
 336         },
 337         # Embedded Ustream video
 338         {
 339             'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
 340             'md5': '27b99cdb639c9b12a79bca876a073417',
 341             'info_dict': {
 342                 'id': '45734260',
 343                 'ext': 'flv',
 344                 'uploader': 'AU SPA:  The NSA and Privacy',
 345                 'title': 'NSA and Privacy Forum Debate featuring General Hayden and Barton Gellman'
 346             }
 347         },
 348         # nowvideo embed hidden behind percent encoding
 349         {
 350             'url': 'http://www.waoanime.tv/the-super-dimension-fortress-macross-episode-1/',
 351             'md5': '2baf4ddd70f697d94b1c18cf796d5107',
 352             'info_dict': {
 353                 'id': '06e53103ca9aa',
 354                 'ext': 'flv',
 355                 'title': 'Macross Episode 001  Watch Macross Episode 001 onl',
 356                 'description': 'No description',
 357             },
 358         },
 359         # arte embed
 360         {
 361             'url': 'http://www.tv-replay.fr/redirection/20-03-14/x-enius-arte-10753389.html',
 362             'md5': '7653032cbb25bf6c80d80f217055fa43',
 363             'info_dict': {
 364                 'id': '048195-004_PLUS7-F',
 365                 'ext': 'flv',
 366                 'title': 'X:enius',
 367                 'description': 'md5:d5fdf32ef6613cdbfd516ae658abf168',
 368                 'upload_date': '20140320',
 369             },
 370             'params': {
 371                 'skip_download': 'Requires rtmpdump'
 372             }
 373         },
 374         # Condé Nast embed
 375         {
 376             'url': 'http://www.wired.com/2014/04/honda-asimo/',
 377             'md5': 'ba0dfe966fa007657bd1443ee672db0f',
 378             'info_dict': {
 379                 'id': '53501be369702d3275860000',
 380                 'ext': 'mp4',
 381                 'title': 'Honda’s  New Asimo Robot Is More Human Than Ever',
 382             }
 383         },
 384         # Dailymotion embed
 385         {
 386             'url': 'http://www.spi0n.com/zap-spi0n-com-n216/',
 387             'md5': '441aeeb82eb72c422c7f14ec533999cd',
 388             'info_dict': {
 389                 'id': 'k2mm4bCdJ6CQ2i7c8o2',
 390                 'ext': 'mp4',
 391                 'title': 'Le Zap de Spi0n n°216 - Zapping du Web',
 392                 'uploader': 'Spi0n',
 393             },
 394             'add_ie': ['Dailymotion'],
 395         },
 396         # YouTube embed
 397         {
 398             'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
 399             'info_dict': {
 400                 'id': 'FXRb4ykk4S0',
 401                 'ext': 'mp4',
 402                 'title': 'The NBL Auction 2014',
 403                 'uploader': 'BADMINTON England',
 404                 'uploader_id': 'BADMINTONEvents',
 405                 'upload_date': '20140603',
 406                 'description': 'md5:9ef128a69f1e262a700ed83edb163a73',
 407             },
 408             'add_ie': ['Youtube'],
 409             'params': {
 410                 'skip_download': True,
 411             }
 412         },
 413         # MTVServices embed
 414         {
 415             'url': 'http://www.gametrailers.com/news-post/76093/north-america-europe-is-getting-that-mario-kart-8-mercedes-dlc-too',
 416             'md5': '35727f82f58c76d996fc188f9755b0d5',
 417             'info_dict': {
 418                 'id': '0306a69b-8adf-4fb5-aace-75f8e8cbfca9',
 419                 'ext': 'mp4',
 420                 'title': 'Review',
 421                 'description': 'Mario\'s life in the fast lane has never looked so good.',
 422             },
 423         },
 424         # YouTube embed via <data-embed-url="">
 425         {
 426             'url': 'https://play.google.com/store/apps/details?id=com.gameloft.android.ANMP.GloftA8HM',
 427             'info_dict': {
 428                 'id': '4vAffPZIT44',
 429                 'ext': 'mp4',
 430                 'title': 'Asphalt 8: Airborne - Update - Welcome to Dubai!',
 431                 'uploader': 'Gameloft',
 432                 'uploader_id': 'gameloft',
 433                 'upload_date': '20140828',
 434                 'description': 'md5:c80da9ed3d83ae6d1876c834de03e1c4',
 435             },
 436             'params': {
 437                 'skip_download': True,
 438             }
 439         },
 440         # Camtasia studio
 441         {
 442             'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
 443             'playlist': [{
 444                 'md5': '0c5e352edabf715d762b0ad4e6d9ee67',
 445                 'info_dict': {
 446                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 447                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1',
 448                     'ext': 'flv',
 449                     'duration': 2235.90,
 450                 }
 451             }, {
 452                 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63',
 453                 'info_dict': {
 454                     'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP',
 455                     'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip',
 456                     'ext': 'flv',
 457                     'duration': 2235.93,
 458                 }
 459             }],
 460             'info_dict': {
 461                 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final',
 462             }
 463         },
 464         # Flowplayer
 465         {
 466             'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html',
 467             'md5': '9d65602bf31c6e20014319c7d07fba27',
 468             'info_dict': {
 469                 'id': '5123ea6d5e5a7',
 470                 'ext': 'mp4',
 471                 'age_limit': 18,
 472                 'uploader': 'www.handjobhub.com',
 473                 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com',
 474             }
 475         },
 476         # Multiple brightcove videos
 477         # https://github.com/rg3/youtube-dl/issues/2283
 478         {
 479             'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html',
 480             'info_dict': {
 481                 'id': 'always-never',
 482                 'title': 'Always / Never - The New Yorker',
 483             },
 484             'playlist_count': 3,
 485             'params': {
 486                 'extract_flat': False,
 487                 'skip_download': True,
 488             }
 489         },
 490         # MLB embed
 491         {
 492             'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/',
 493             'md5': '96f09a37e44da40dd083e12d9a683327',
 494             'info_dict': {
 495                 'id': '33322633',
 496                 'ext': 'mp4',
 497                 'title': 'Ump changes call to ball',
 498                 'description': 'md5:71c11215384298a172a6dcb4c2e20685',
 499                 'duration': 48,
 500                 'timestamp': 1401537900,
 501                 'upload_date': '20140531',
 502                 'thumbnail': 're:^https?://.*\.jpg$',
 503             },
 504         },
 505         # Wistia embed
 506         {
 507             'url': 'http://education-portal.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson',
 508             'md5': '8788b683c777a5cf25621eaf286d0c23',
 509             'info_dict': {
 510                 'id': '1cfaf6b7ea',
 511                 'ext': 'mov',
 512                 'title': 'md5:51364a8d3d009997ba99656004b5e20d',
 513                 'duration': 643.0,
 514                 'filesize': 182808282,
 515                 'uploader': 'education-portal.com',
 516             },
 517         },
 518         {
 519             'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz',
 520             'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4',
 521             'info_dict': {
 522                 'id': 'uxjb0lwrcz',
 523                 'ext': 'mp4',
 524                 'title': 'Conversation about Hexagonal Rails Part 1 - ThoughtWorks',
 525                 'duration': 1715.0,
 526                 'uploader': 'thoughtworks.wistia.com',
 527             },
 528         },
 529         # Soundcloud embed
 530         {
 531             'url': 'http://nakedsecurity.sophos.com/2014/10/29/sscc-171-are-you-sure-that-1234-is-a-bad-password-podcast/',
 532             'info_dict': {
 533                 'id': '174391317',
 534                 'ext': 'mp3',
 535                 'description': 'md5:ff867d6b555488ad3c52572bb33d432c',
 536                 'uploader': 'Sophos Security',
 537                 'title': 'Chet Chat 171 - Oct 29, 2014',
 538                 'upload_date': '20141029',
 539             }
 540         },
 541         # Livestream embed
 542         {
 543             'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast',
 544             'info_dict': {
 545                 'id': '67864563',
 546                 'ext': 'flv',
 547                 'upload_date': '20141112',
 548                 'title': 'Rosetta #CometLanding webcast HL 10',
 549             }
 550         },
 551         # LazyYT
 552         {
 553             'url': 'http://discourse.ubuntu.com/t/unity-8-desktop-mode-windows-on-mir/1986',
 554             'info_dict': {
 555                 'id': '1986',
 556                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse',
 557             },
 558             'playlist_mincount': 2,
 559         },
 560         # Cinchcast embed
 561         {
 562             'url': 'http://undergroundwellness.com/podcasts/306-5-steps-to-permanent-gut-healing/',
 563             'info_dict': {
 564                 'id': '7141703',
 565                 'ext': 'mp3',
 566                 'upload_date': '20141126',
 567                 'title': 'Jack Tips: 5 Steps to Permanent Gut Healing',
 568             }
 569         },
 570         # Cinerama player
 571         {
 572             'url': 'http://www.abc.net.au/7.30/content/2015/s4164797.htm',
 573             'info_dict': {
 574                 'id': '730m_DandD_1901_512k',
 575                 'ext': 'mp4',
 576                 'uploader': 'www.abc.net.au',
 577                 'title': 'Game of Thrones with dice - Dungeons and Dragons fantasy role-playing game gets new life - 19/01/2015',
 578             }
 579         },
 580         # embedded viddler video
 581         {
 582             'url': 'http://deadspin.com/i-cant-stop-watching-john-wall-chop-the-nuggets-with-th-1681801597',
 583             'info_dict': {
 584                 'id': '4d03aad9',
 585                 'ext': 'mp4',
 586                 'uploader': 'deadspin',
 587                 'title': 'WALL-TO-GORTAT',
 588                 'timestamp': 1422285291,
 589                 'upload_date': '20150126',
 590             },
 591             'add_ie': ['Viddler'],
 592         },
 593         # Libsyn embed
 594         {
 595             'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
 596             'info_dict': {
 597                 'id': '3377616',
 598                 'ext': 'mp3',
 599                 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
 600                 'description': 'md5:601cb790edd05908957dae8aaa866465',
 601                 'upload_date': '20150220',
 602             },
 603         },
 604         # jwplayer YouTube
 605         {
 606             'url': 'http://media.nationalarchives.gov.uk/index.php/webinar-using-discovery-national-archives-online-catalogue/',
 607             'info_dict': {
 608                 'id': 'Mrj4DVp2zeA',
 609                 'ext': 'mp4',
 610                 'upload_date': '20150212',
 611                 'uploader': 'The National Archives UK',
 612                 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
 613                 'uploader_id': 'NationalArchives08',
 614                 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
 615             },
 616         },
 617         # rtl.nl embed
 618         {
 619             'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
 620             'playlist_mincount': 5,
 621             'info_dict': {
 622                 'id': 'aanslagen-kopenhagen',
 623                 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
 624             }
 625         },
 626         # Zapiks embed
 627         {
 628             'url': 'http://www.skipass.com/news/116090-bon-appetit-s5ep3-baqueira-mi-cor.html',
 629             'info_dict': {
 630                 'id': '118046',
 631                 'ext': 'mp4',
 632                 'title': 'EP3S5 - Bon Appétit - Baqueira Mi Corazon !',
 633             }
 634         },
 635         # Kaltura embed
 636         {
 637             'url': 'http://www.monumentalnetwork.com/videos/john-carlson-postgame-2-25-15',
 638             'info_dict': {
 639                 'id': '1_eergr3h1',
 640                 'ext': 'mp4',
 641                 'upload_date': '20150226',
 642                 'uploader_id': 'MonumentalSports-Kaltura@perfectsensedigital.com',
 643                 'timestamp': int,
 644                 'title': 'John Carlson Postgame 2/25/15',
 645             },
 646         },
 647         # Eagle.Platform embed (generic URL)
 648         {
 649             'url': 'http://lenta.ru/news/2015/03/06/navalny/',
 650             'info_dict': {
 651                 'id': '227304',
 652                 'ext': 'mp4',
 653                 'title': 'Навальный вышел на свободу',
 654                 'description': 'md5:d97861ac9ae77377f3f20eaf9d04b4f5',
 655                 'thumbnail': 're:^https?://.*\.jpg$',
 656                 'duration': 87,
 657                 'view_count': int,
 658                 'age_limit': 0,
 659             },
 660         },
 661         # ClipYou (Eagle.Platform) embed (custom URL)
 662         {
 663             'url': 'http://muz-tv.ru/play/7129/',
 664             'info_dict': {
 665                 'id': '12820',
 666                 'ext': 'mp4',
 667                 'title': "'O Sole Mio",
 668                 'thumbnail': 're:^https?://.*\.jpg$',
 669                 'duration': 216,
 670                 'view_count': int,
 671             },
 672         },
 673         # Pladform embed
 674         {
 675             'url': 'http://muz-tv.ru/kinozal/view/7400/',
 676             'info_dict': {
 677                 'id': '100183293',
 678                 'ext': 'mp4',
 679                 'title': 'Тайны перевала Дятлова • 1 серия 2 часть',
 680                 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века',
 681                 'thumbnail': 're:^https?://.*\.jpg$',
 682                 'duration': 694,
 683                 'age_limit': 0,
 684             },
 685         },
 686         # Playwire embed
 687         {
 688             'url': 'http://www.cinemablend.com/new/First-Joe-Dirt-2-Trailer-Teaser-Stupid-Greatness-70874.html',
 689             'info_dict': {
 690                 'id': '3519514',
 691                 'ext': 'mp4',
 692                 'title': 'Joe Dirt 2 Beautiful Loser Teaser Trailer',
 693                 'thumbnail': 're:^https?://.*\.png$',
 694                 'duration': 45.115,
 695             },
 696         },
 697         # 5min embed
 698         {
 699             'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
 700             'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
 701             'info_dict': {
 702                 'id': '518726732',
 703                 'ext': 'mp4',
 704                 'title': 'Facebook Creates "On This Day" | Crunch Report',
 705             },
 706         },
 707         # SVT embed
 708         {
 709             'url': 'http://www.svt.se/sport/ishockey/jagr-tacklar-giroux-under-intervjun',
 710             'info_dict': {
 711                 'id': '2900353',
 712                 'ext': 'flv',
 713                 'title': 'Här trycker Jagr till Giroux (under SVT-intervjun)',
 714                 'duration': 27,
 715                 'age_limit': 0,
 716             },
 717         },
 718         # Crooks and Liars embed
 719         {
 720             'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists',
 721             'info_dict': {
 722                 'id': '8RUoRhRi',
 723                 'ext': 'mp4',
 724                 'title': "Fox & Friends Says Protecting Atheists From Discrimination Is Anti-Christian!",
 725                 'description': 'md5:e1a46ad1650e3a5ec7196d432799127f',
 726                 'timestamp': 1428207000,
 727                 'upload_date': '20150405',
 728                 'uploader': 'Heather',
 729             },
 730         },
 731         # Crooks and Liars external embed
 732         {
 733             'url': 'http://theothermccain.com/2010/02/02/video-proves-that-bill-kristol-has-been-watching-glenn-beck/comment-page-1/',
 734             'info_dict': {
 735                 'id': 'MTE3MjUtMzQ2MzA',
 736                 'ext': 'mp4',
 737                 'title': 'md5:5e3662a81a4014d24c250d76d41a08d5',
 738                 'description': 'md5:9b8e9542d6c3c5de42d6451b7d780cec',
 739                 'timestamp': 1265032391,
 740                 'upload_date': '20100201',
 741                 'uploader': 'Heather',
 742             },
 743         },
 744         # NBC Sports vplayer embed
 745         {
 746             'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
 747             'info_dict': {
 748                 'id': 'ln7x1qSThw4k',
 749                 'ext': 'flv',
 750                 'title': "PFT Live: New leader in the 'new-look' defense",
 751                 'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
 752             },
 753         },
 754         # UDN embed
 755         {
 756             'url': 'http://www.udn.com/news/story/7314/822787',
 757             'md5': 'fd2060e988c326991037b9aff9df21a6',
 758             'info_dict': {
 759                 'id': '300346',
 760                 'ext': 'mp4',
 761                 'title': '中一中男師變性 全校師生力挺',
 762                 'thumbnail': 're:^https?://.*\.jpg$',
 763             }
 764         },
 765         # Ooyala embed
 766         {
 767             'url': 'http://www.businessinsider.com/excel-index-match-vlookup-video-how-to-2015-2?IR=T',
 768             'info_dict': {
 769                 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
 770                 'ext': 'mp4',
 771                 'description': 'VIDEO: Index/Match versus VLOOKUP.',
 772                 'title': 'This is what separates the Excel masters from the wannabes',
 773             },
 774             'params': {
 775                 # m3u8 downloads
 776                 'skip_download': True,
 777             }
 778         },
 779         # Contains a SMIL manifest
 780         {
 781             'url': 'http://www.telewebion.com/fa/1263668/%D9%82%D8%B1%D8%B9%D9%87%E2%80%8C%DA%A9%D8%B4%DB%8C-%D9%84%DB%8C%DA%AF-%D9%82%D9%87%D8%B1%D9%85%D8%A7%D9%86%D8%A7%D9%86-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7/%2B-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84.html',
 782             'info_dict': {
 783                 'id': 'file',
 784                 'ext': 'flv',
 785                 'title': '+ Football: Lottery Champions League Europe',
 786                 'uploader': 'www.telewebion.com',
 787             },
 788             'params': {
 789                 # rtmpe downloads
 790                 'skip_download': True,
 791             }
 792         },
 793         # Brightcove URL in single quotes
 794         {
 795             'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/',
 796             'md5': '4ae374f1f8b91c889c4b9203c8c752af',
 797             'info_dict': {
 798                 'id': '4255764656001',
 799                 'ext': 'mp4',
 800                 'title': 'SN Presents: Russell Martin, World Citizen',
 801                 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.',
 802                 'uploader': 'Rogers Sportsnet',
 803             },
 804         }
 805     ]
 806
 807     def report_following_redirect(self, new_url):
 808         """Report information extraction."""
 809         self._downloader.to_screen('[redirect] Following redirect to %s' % new_url)
 810
 811     def _extract_rss(self, url, video_id, doc):
 812         playlist_title = doc.find('./channel/title').text
 813         playlist_desc_el = doc.find('./channel/description')
 814         playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text
 815
 816         entries = []
 817         for it in doc.findall('./channel/item'):
 818             next_url = xpath_text(it, 'link', fatal=False)
 819             if not next_url:
 820                 enclosure_nodes = it.findall('./enclosure')
 821                 for e in enclosure_nodes:
 822                     next_url = e.attrib.get('url')
 823                     if next_url:
 824                         break
 825
 826             if not next_url:
 827                 continue
 828
 829             entries.append({
 830                 '_type': 'url',
 831                 'url': next_url,
 832                 'title': it.find('title').text,
 833             })
 834
 835         return {
 836             '_type': 'playlist',
 837             'id': url,
 838             'title': playlist_title,
 839             'description': playlist_desc,
 840             'entries': entries,
 841         }
 842
 843     def _extract_camtasia(self, url, video_id, webpage):
 844         """ Returns None if no camtasia video can be found. """
 845
 846         camtasia_cfg = self._search_regex(
 847             r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);',
 848             webpage, 'camtasia configuration file', default=None)
 849         if camtasia_cfg is None:
 850             return None
 851
 852         title = self._html_search_meta('DC.title', webpage, fatal=True)
 853
 854         camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg)
 855         camtasia_cfg = self._download_xml(
 856             camtasia_url, video_id,
 857             note='Downloading camtasia configuration',
 858             errnote='Failed to download camtasia configuration')
 859         fileset_node = camtasia_cfg.find('./playlist/array/fileset')
 860
 861         entries = []
 862         for n in fileset_node.getchildren():
 863             url_n = n.find('./uri')
 864             if url_n is None:
 865                 continue
 866
 867             entries.append({
 868                 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0],
 869                 'title': '%s - %s' % (title, n.tag),
 870                 'url': compat_urlparse.urljoin(url, url_n.text),
 871                 'duration': float_or_none(n.find('./duration').text),
 872             })
 873
 874         return {
 875             '_type': 'playlist',
 876             'entries': entries,
 877             'title': title,
 878         }
 879
 880     def _real_extract(self, url):
 881         if url.startswith('//'):
 882             return {
 883                 '_type': 'url',
 884                 'url': self.http_scheme() + url,
 885             }
 886
 887         parsed_url = compat_urlparse.urlparse(url)
 888         if not parsed_url.scheme:
 889             default_search = self._downloader.params.get('default_search')
 890             if default_search is None:
 891                 default_search = 'fixup_error'
 892
 893             if default_search in ('auto', 'auto_warning', 'fixup_error'):
 894                 if '/' in url:
 895                     self._downloader.report_warning('The url doesn\'t specify the protocol, trying with http')
 896                     return self.url_result('http://' + url)
 897                 elif default_search != 'fixup_error':
 898                     if default_search == 'auto_warning':
 899                         if re.match(r'^(?:url|URL)$', url):
 900                             raise ExtractorError(
 901                                 'Invalid URL:  %r . Call youtube-dl like this:  youtube-dl -v "https://www.youtube.com/watch?v=BaW_jenozKc"  ' % url,
 902                                 expected=True)
 903                         else:
 904                             self._downloader.report_warning(
 905                                 'Falling back to youtube search for  %s . Set --default-search "auto" to suppress this warning.' % url)
 906                     return self.url_result('ytsearch:' + url)
 907
 908             if default_search in ('error', 'fixup_error'):
 909                 raise ExtractorError(
 910                     '%r is not a valid URL. '
 911                     'Set --default-search "ytsearch" (or run  youtube-dl "ytsearch:%s" ) to search YouTube'
 912                     % (url, url), expected=True)
 913             else:
 914                 if ':' not in default_search:
 915                     default_search += ':'
 916                 return self.url_result(default_search + url)
 917
 918         url, smuggled_data = unsmuggle_url(url)
 919         force_videoid = None
 920         is_intentional = smuggled_data and smuggled_data.get('to_generic')
 921         if smuggled_data and 'force_videoid' in smuggled_data:
 922             force_videoid = smuggled_data['force_videoid']
 923             video_id = force_videoid
 924         else:
 925             video_id = compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
 926
 927         self.to_screen('%s: Requesting header' % video_id)
 928
 929         head_req = HEADRequest(url)
 930         head_response = self._request_webpage(
 931             head_req, video_id,
 932             note=False, errnote='Could not send HEAD request to %s' % url,
 933             fatal=False)
 934
 935         if head_response is not False:
 936             # Check for redirect
 937             new_url = head_response.geturl()
 938             if url != new_url:
 939                 self.report_following_redirect(new_url)
 940                 if force_videoid:
 941                     new_url = smuggle_url(
 942                         new_url, {'force_videoid': force_videoid})
 943                 return self.url_result(new_url)
 944
 945         full_response = None
 946         if head_response is False:
 947             request = compat_urllib_request.Request(url)
 948             request.add_header('Accept-Encoding', '*')
 949             full_response = self._request_webpage(request, video_id)
 950             head_response = full_response
 951
 952         # Check for direct link to a video
 953         content_type = head_response.headers.get('Content-Type', '')
 954         m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
 955         if m:
 956             upload_date = unified_strdate(
 957                 head_response.headers.get('Last-Modified'))
 958             return {
 959                 'id': video_id,
 960                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
 961                 'direct': True,
 962                 'formats': [{
 963                     'format_id': m.group('format_id'),
 964                     'url': url,
 965                     'vcodec': 'none' if m.group('type') == 'audio' else None
 966                 }],
 967                 'upload_date': upload_date,
 968             }
 969
 970         if not self._downloader.params.get('test', False) and not is_intentional:
 971             self._downloader.report_warning('Falling back on generic information extractor.')
 972
 973         if not full_response:
 974             request = compat_urllib_request.Request(url)
 975             # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
 976             # making it impossible to download only chunk of the file (yet we need only 512kB to
 977             # test whether it's HTML or not). According to youtube-dl default Accept-Encoding
 978             # that will always result in downloading the whole file that is not desirable.
 979             # Therefore for extraction pass we have to override Accept-Encoding to any in order
 980             # to accept raw bytes and being able to download only a chunk.
 981             # It may probably better to solve this by checking Content-Type for application/octet-stream
 982             # after HEAD request finishes, but not sure if we can rely on this.
 983             request.add_header('Accept-Encoding', '*')
 984             full_response = self._request_webpage(request, video_id)
 985
 986         # Maybe it's a direct link to a video?
 987         # Be careful not to download the whole thing!
 988         first_bytes = full_response.read(512)
 989         if not is_html(first_bytes):
 990             self._downloader.report_warning(
 991                 'URL could be a direct video link, returning it as such.')
 992             upload_date = unified_strdate(
 993                 head_response.headers.get('Last-Modified'))
 994             return {
 995                 'id': video_id,
 996                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
 997                 'direct': True,
 998                 'url': url,
 999                 'upload_date': upload_date,
1000             }
1001
1002         webpage = self._webpage_read_content(
1003             full_response, url, video_id, prefix=first_bytes)
1004
1005         self.report_extraction(video_id)
1006
1007         # Is it an RSS feed?
1008         try:
1009             doc = parse_xml(webpage)
1010             if doc.tag == 'rss':
1011                 return self._extract_rss(url, video_id, doc)
1012         except compat_xml_parse_error:
1013             pass
1014
1015         # Is it a Camtasia project?
1016         camtasia_res = self._extract_camtasia(url, video_id, webpage)
1017         if camtasia_res is not None:
1018             return camtasia_res
1019
1020         # Sometimes embedded video player is hidden behind percent encoding
1021         # (e.g. https://github.com/rg3/youtube-dl/issues/2448)
1022         # Unescaping the whole page allows to handle those cases in a generic way
1023         webpage = compat_urllib_parse.unquote(webpage)
1024
1025         # it's tempting to parse this further, but you would
1026         # have to take into account all the variations like
1027         #   Video Title - Site Name
1028         #   Site Name | Video Title
1029         #   Video Title - Tagline | Site Name
1030         # and so on and so forth; it's just not practical
1031         video_title = self._html_search_regex(
1032             r'(?s)<title>(.*?)</title>', webpage, 'video title',
1033             default='video')
1034
1035         # Try to detect age limit automatically
1036         age_limit = self._rta_search(webpage)
1037         # And then there are the jokers who advertise that they use RTA,
1038         # but actually don't.
1039         AGE_LIMIT_MARKERS = [
1040             r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
1041         ]
1042         if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
1043             age_limit = 18
1044
1045         # video uploader is domain name
1046         video_uploader = self._search_regex(
1047             r'^(?:https?://)?([^/]*)/.*', url, 'video uploader')
1048
1049         # Helper method
1050         def _playlist_from_matches(matches, getter=None, ie=None):
1051             urlrs = orderedSet(
1052                 self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
1053                 for m in matches)
1054             return self.playlist_result(
1055                 urlrs, playlist_id=video_id, playlist_title=video_title)
1056
1057         # Look for BrightCove:
1058         bc_urls = BrightcoveIE._extract_brightcove_urls(webpage)
1059         if bc_urls:
1060             self.to_screen('Brightcove video detected.')
1061             entries = [{
1062                 '_type': 'url',
1063                 'url': smuggle_url(bc_url, {'Referer': url}),
1064                 'ie_key': 'Brightcove'
1065             } for bc_url in bc_urls]
1066
1067             return {
1068                 '_type': 'playlist',
1069                 'title': video_title,
1070                 'id': video_id,
1071                 'entries': entries,
1072             }
1073
1074         # Look for embedded rtl.nl player
1075         matches = re.findall(
1076             r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
1077             webpage)
1078         if matches:
1079             return _playlist_from_matches(matches, ie='RtlNl')
1080
1081         # Look for embedded (iframe) Vimeo player
1082         mobj = re.search(
1083             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage)
1084         if mobj:
1085             player_url = unescapeHTML(mobj.group('url'))
1086             surl = smuggle_url(player_url, {'Referer': url})
1087             return self.url_result(surl)
1088         # Look for embedded (swf embed) Vimeo player
1089         mobj = re.search(
1090             r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage)
1091         if mobj:
1092             return self.url_result(mobj.group(1))
1093
1094         # Look for embedded YouTube player
1095         matches = re.findall(r'''(?x)
1096             (?:
1097                 <iframe[^>]+?src=|
1098                 data-video-url=|
1099                 <embed[^>]+?src=|
1100                 embedSWF\(?:\s*|
1101                 new\s+SWFObject\(
1102             )
1103             (["\'])
1104                 (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
1105                 (?:embed|v|p)/.+?)
1106             \1''', webpage)
1107         if matches:
1108             return _playlist_from_matches(
1109                 matches, lambda m: unescapeHTML(m[1]))
1110
1111         # Look for lazyYT YouTube embed
1112         matches = re.findall(
1113             r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
1114         if matches:
1115             return _playlist_from_matches(matches, lambda m: unescapeHTML(m))
1116
1117         # Look for embedded Dailymotion player
1118         matches = re.findall(
1119             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
1120         if matches:
1121             return _playlist_from_matches(
1122                 matches, lambda m: unescapeHTML(m[1]))
1123
1124         # Look for embedded Dailymotion playlist player (#3822)
1125         m = re.search(
1126             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage)
1127         if m:
1128             playlists = re.findall(
1129                 r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url')))
1130             if playlists:
1131                 return _playlist_from_matches(
1132                     playlists, lambda p: '//dailymotion.com/playlist/%s' % p)
1133
1134         # Look for embedded Wistia player
1135         match = re.search(
1136             r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
1137         if match:
1138             embed_url = self._proto_relative_url(
1139                 unescapeHTML(match.group('url')))
1140             return {
1141                 '_type': 'url_transparent',
1142                 'url': embed_url,
1143                 'ie_key': 'Wistia',
1144                 'uploader': video_uploader,
1145                 'title': video_title,
1146                 'id': video_id,
1147             }
1148
1149         match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
1150         if match:
1151             return {
1152                 '_type': 'url_transparent',
1153                 'url': 'http://fast.wistia.net/embed/iframe/{0:}'.format(match.group('id')),
1154                 'ie_key': 'Wistia',
1155                 'uploader': video_uploader,
1156                 'title': video_title,
1157                 'id': match.group('id')
1158             }
1159
1160         # Look for embedded blip.tv player
1161         bliptv_url = BlipTVIE._extract_url(webpage)
1162         if bliptv_url:
1163             return self.url_result(bliptv_url, 'BlipTV')
1164
1165         # Look for SVT player
1166         svt_url = SVTIE._extract_url(webpage)
1167         if svt_url:
1168             return self.url_result(svt_url, 'SVT')
1169
1170         # Look for embedded condenast player
1171         matches = re.findall(
1172             r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
1173             webpage)
1174         if matches:
1175             return {
1176                 '_type': 'playlist',
1177                 'entries': [{
1178                     '_type': 'url',
1179                     'ie_key': 'CondeNast',
1180                     'url': ma,
1181                 } for ma in matches],
1182                 'title': video_title,
1183                 'id': video_id,
1184             }
1185
1186         # Look for Bandcamp pages with custom domain
1187         mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
1188         if mobj is not None:
1189             burl = unescapeHTML(mobj.group(1))
1190             # Don't set the extractor because it can be a track url or an album
1191             return self.url_result(burl)
1192
1193         # Look for embedded Vevo player
1194         mobj = re.search(
1195             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage)
1196         if mobj is not None:
1197             return self.url_result(mobj.group('url'))
1198
1199         # Look for embedded Viddler player
1200         mobj = re.search(
1201             r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1',
1202             webpage)
1203         if mobj is not None:
1204             return self.url_result(mobj.group('url'))
1205
1206         # Look for NYTimes player
1207         mobj = re.search(
1208             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>',
1209             webpage)
1210         if mobj is not None:
1211             return self.url_result(mobj.group('url'))
1212
1213         # Look for Libsyn player
1214         mobj = re.search(
1215             r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage)
1216         if mobj is not None:
1217             return self.url_result(mobj.group('url'))
1218
1219         # Look for Ooyala videos
1220         mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
1221                 re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
1222                 re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
1223                 re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
1224         if mobj is not None:
1225             return OoyalaIE._build_url_result(mobj.group('ec'))
1226
1227         # Look for multiple Ooyala embeds on SBN network websites
1228         mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage)
1229         if mobj is not None:
1230             embeds = self._parse_json(mobj.group(1), video_id, fatal=False)
1231             if embeds:
1232                 return _playlist_from_matches(
1233                     embeds, getter=lambda v: OoyalaIE._url_for_embed_code(v['provider_video_id']), ie='Ooyala')
1234
1235         # Look for Aparat videos
1236         mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage)
1237         if mobj is not None:
1238             return self.url_result(mobj.group(1), 'Aparat')
1239
1240         # Look for MPORA videos
1241         mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage)
1242         if mobj is not None:
1243             return self.url_result(mobj.group(1), 'Mpora')
1244
1245         # Look for embedded NovaMov-based player
1246         mobj = re.search(
1247             r'''(?x)<(?:pagespeed_)?iframe[^>]+?src=(["\'])
1248                     (?P<url>http://(?:(?:embed|www)\.)?
1249                         (?:novamov\.com|
1250                            nowvideo\.(?:ch|sx|eu|at|ag|co)|
1251                            videoweed\.(?:es|com)|
1252                            movshare\.(?:net|sx|ag)|
1253                            divxstage\.(?:eu|net|ch|co|at|ag))
1254                         /embed\.php.+?)\1''', webpage)
1255         if mobj is not None:
1256             return self.url_result(mobj.group('url'))
1257
1258         # Look for embedded Facebook player
1259         mobj = re.search(
1260             r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage)
1261         if mobj is not None:
1262             return self.url_result(mobj.group('url'), 'Facebook')
1263
1264         # Look for embedded VK player
1265         mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage)
1266         if mobj is not None:
1267             return self.url_result(mobj.group('url'), 'VK')
1268
1269         # Look for embedded ivi player
1270         mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
1271         if mobj is not None:
1272             return self.url_result(mobj.group('url'), 'Ivi')
1273
1274         # Look for embedded Huffington Post player
1275         mobj = re.search(
1276             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage)
1277         if mobj is not None:
1278             return self.url_result(mobj.group('url'), 'HuffPost')
1279
1280         # Look for embed.ly
1281         mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage)
1282         if mobj is not None:
1283             return self.url_result(mobj.group('url'))
1284         mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage)
1285         if mobj is not None:
1286             return self.url_result(compat_urllib_parse.unquote(mobj.group('url')))
1287
1288         # Look for funnyordie embed
1289         matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage)
1290         if matches:
1291             return _playlist_from_matches(
1292                 matches, getter=unescapeHTML, ie='FunnyOrDie')
1293
1294         # Look for BBC iPlayer embed
1295         matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage)
1296         if matches:
1297             return _playlist_from_matches(matches, ie='BBCCoUk')
1298
1299         # Look for embedded RUTV player
1300         rutv_url = RUTVIE._extract_url(webpage)
1301         if rutv_url:
1302             return self.url_result(rutv_url, 'RUTV')
1303
1304         # Look for embedded SportBox player
1305         sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
1306         if sportbox_urls:
1307             return _playlist_from_matches(sportbox_urls, ie='SportBoxEmbed')
1308
1309         # Look for embedded TED player
1310         mobj = re.search(
1311             r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed(?:-ssl)?\.ted\.com/.+?)\1', webpage)
1312         if mobj is not None:
1313             return self.url_result(mobj.group('url'), 'TED')
1314
1315         # Look for embedded Ustream videos
1316         mobj = re.search(
1317             r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
1318         if mobj is not None:
1319             return self.url_result(mobj.group('url'), 'Ustream')
1320
1321         # Look for embedded arte.tv player
1322         mobj = re.search(
1323             r'<script [^>]*?src="(?P<url>http://www\.arte\.tv/playerv2/embed[^"]+)"',
1324             webpage)
1325         if mobj is not None:
1326             return self.url_result(mobj.group('url'), 'ArteTVEmbed')
1327
1328         # Look for embedded smotri.com player
1329         smotri_url = SmotriIE._extract_url(webpage)
1330         if smotri_url:
1331             return self.url_result(smotri_url, 'Smotri')
1332
1333         # Look for embeded soundcloud player
1334         mobj = re.search(
1335             r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
1336             webpage)
1337         if mobj is not None:
1338             url = unescapeHTML(mobj.group('url'))
1339             return self.url_result(url)
1340
1341         # Look for embedded vulture.com player
1342         mobj = re.search(
1343             r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"',
1344             webpage)
1345         if mobj is not None:
1346             url = unescapeHTML(mobj.group('url'))
1347             return self.url_result(url, ie='Vulture')
1348
1349         # Look for embedded mtvservices player
1350         mobj = re.search(
1351             r'<iframe src="(?P<url>https?://media\.mtvnservices\.com/embed/[^"]+)"',
1352             webpage)
1353         if mobj is not None:
1354             url = unescapeHTML(mobj.group('url'))
1355             return self.url_result(url, ie='MTVServicesEmbedded')
1356
1357         # Look for embedded yahoo player
1358         mobj = re.search(
1359             r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1',
1360             webpage)
1361         if mobj is not None:
1362             return self.url_result(mobj.group('url'), 'Yahoo')
1363
1364         # Look for embedded sbs.com.au player
1365         mobj = re.search(
1366             r'''(?x)
1367             (?:
1368                 <meta\s+property="og:video"\s+content=|
1369                 <iframe[^>]+?src=
1370             )
1371             (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''',
1372             webpage)
1373         if mobj is not None:
1374             return self.url_result(mobj.group('url'), 'SBS')
1375
1376         # Look for embedded Cinchcast player
1377         mobj = re.search(
1378             r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1',
1379             webpage)
1380         if mobj is not None:
1381             return self.url_result(mobj.group('url'), 'Cinchcast')
1382
1383         mobj = re.search(
1384             r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1',
1385             webpage)
1386         if not mobj:
1387             mobj = re.search(
1388                 r'data-video-link=["\'](?P<url>http://m.mlb.com/video/[^"\']+)',
1389                 webpage)
1390         if mobj is not None:
1391             return self.url_result(mobj.group('url'), 'MLB')
1392
1393         mobj = re.search(
1394             r'<iframe[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL,
1395             webpage)
1396         if mobj is not None:
1397             return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast')
1398
1399         mobj = re.search(
1400             r'<iframe[^>]+src="(?P<url>https?://new\.livestream\.com/[^"]+/player[^"]+)"',
1401             webpage)
1402         if mobj is not None:
1403             return self.url_result(mobj.group('url'), 'Livestream')
1404
1405         # Look for Zapiks embed
1406         mobj = re.search(
1407             r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage)
1408         if mobj is not None:
1409             return self.url_result(mobj.group('url'), 'Zapiks')
1410
1411         # Look for Kaltura embeds
1412         mobj = re.search(
1413             r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?'wid'\s*:\s*'_?(?P<partner_id>[^']+)',.*?'entry_id'\s*:\s*'(?P<id>[^']+)',", webpage)
1414         if mobj is not None:
1415             return self.url_result('kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), 'Kaltura')
1416
1417         # Look for Eagle.Platform embeds
1418         mobj = re.search(
1419             r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage)
1420         if mobj is not None:
1421             return self.url_result(mobj.group('url'), 'EaglePlatform')
1422
1423         # Look for ClipYou (uses Eagle.Platform) embeds
1424         mobj = re.search(
1425             r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
1426         if mobj is not None:
1427             return self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform')
1428
1429         # Look for Pladform embeds
1430         mobj = re.search(
1431             r'<iframe[^>]+src="(?P<url>https?://out\.pladform\.ru/player\?.+?)"', webpage)
1432         if mobj is not None:
1433             return self.url_result(mobj.group('url'), 'Pladform')
1434
1435         # Look for Playwire embeds
1436         mobj = re.search(
1437             r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
1438         if mobj is not None:
1439             return self.url_result(mobj.group('url'))
1440
1441         # Look for 5min embeds
1442         mobj = re.search(
1443             r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
1444         if mobj is not None:
1445             return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
1446
1447         # Look for Crooks and Liars embeds
1448         mobj = re.search(
1449             r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage)
1450         if mobj is not None:
1451             return self.url_result(mobj.group('url'))
1452
1453         # Look for NBC Sports VPlayer embeds
1454         nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
1455         if nbc_sports_url:
1456             return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
1457
1458         # Look for UDN embeds
1459         mobj = re.search(
1460             r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._VALID_URL, webpage)
1461         if mobj is not None:
1462             return self.url_result(
1463                 compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
1464
1465         # Look for Senate ISVP iframe
1466         senate_isvp_url = SenateISVPIE._search_iframe_url(webpage)
1467         if senate_isvp_url:
1468             return self.url_result(senate_isvp_url, 'SenateISVP')
1469
1470         def check_video(vurl):
1471             if YoutubeIE.suitable(vurl):
1472                 return True
1473             vpath = compat_urlparse.urlparse(vurl).path
1474             vext = determine_ext(vpath)
1475             return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml')
1476
1477         def filter_video(urls):
1478             return list(filter(check_video, urls))
1479
1480         # Start with something easy: JW Player in SWFObject
1481         found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage))
1482         if not found:
1483             # Look for gorilla-vid style embedding
1484             found = filter_video(re.findall(r'''(?sx)
1485                 (?:
1486                     jw_plugins|
1487                     JWPlayerOptions|
1488                     jwplayer\s*\(\s*["'][^'"]+["']\s*\)\s*\.setup
1489                 )
1490                 .*?
1491                 ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage))
1492         if not found:
1493             # Broaden the search a little bit
1494             found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
1495         if not found:
1496             # Broaden the findall a little bit: JWPlayer JS loader
1497             found = filter_video(re.findall(
1498                 r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
1499         if not found:
1500             # Flow player
1501             found = filter_video(re.findall(r'''(?xs)
1502                 flowplayer\("[^"]+",\s*
1503                     \{[^}]+?\}\s*,
1504                     \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s*
1505                         ["']?url["']?\s*:\s*["']([^"']+)["']
1506             ''', webpage))
1507         if not found:
1508             # Cinerama player
1509             found = re.findall(
1510                 r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage)
1511         if not found:
1512             # Try to find twitter cards info
1513             found = filter_video(re.findall(
1514                 r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage))
1515         if not found:
1516             # We look for Open Graph info:
1517             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
1518             m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
1519             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
1520             if m_video_type is not None:
1521                 found = filter_video(re.findall(r'<meta.*?property="og:video".*?content="(.*?)"', webpage))
1522         if not found:
1523             # HTML5 video
1524             found = re.findall(r'(?s)<video[^<]*(?:>.*?<source[^>]*)?\s+src=["\'](.*?)["\']', webpage)
1525         if not found:
1526             REDIRECT_REGEX = r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)'
1527             found = re.search(
1528                 r'(?i)<meta\s+(?=(?:[a-z-]+="[^"]+"\s+)*http-equiv="refresh")'
1529                 r'(?:[a-z-]+="[^"]+"\s+)*?content="%s' % REDIRECT_REGEX,
1530                 webpage)
1531             if not found:
1532                 # Look also in Refresh HTTP header
1533                 refresh_header = head_response.headers.get('Refresh')
1534                 if refresh_header:
1535                     found = re.search(REDIRECT_REGEX, refresh_header)
1536             if found:
1537                 new_url = compat_urlparse.urljoin(url, found.group(1))
1538                 self.report_following_redirect(new_url)
1539                 return {
1540                     '_type': 'url',
1541                     'url': new_url,
1542                 }
1543         if not found:
1544             raise UnsupportedError(url)
1545
1546         entries = []
1547         for video_url in found:
1548             video_url = compat_urlparse.urljoin(url, video_url)
1549             video_id = compat_urllib_parse.unquote(os.path.basename(video_url))
1550
1551             # Sometimes, jwplayer extraction will result in a YouTube URL
1552             if YoutubeIE.suitable(video_url):
1553                 entries.append(self.url_result(video_url, 'Youtube'))
1554                 continue
1555
1556             # here's a fun little line of code for you:
1557             video_id = os.path.splitext(video_id)[0]
1558
1559             if determine_ext(video_url) == 'smil':
1560                 entries.append({
1561                     'id': video_id,
1562                     'formats': self._extract_smil_formats(video_url, video_id),
1563                     'uploader': video_uploader,
1564                     'title': video_title,
1565                     'age_limit': age_limit,
1566                 })
1567             else:
1568                 entries.append({
1569                     'id': video_id,
1570                     'url': video_url,
1571                     'uploader': video_uploader,
1572                     'title': video_title,
1573                     'age_limit': age_limit,
1574                 })
1575
1576         if len(entries) == 1:
1577             return entries[0]
1578         else:
1579             for num, e in enumerate(entries, start=1):
1580                 # 'url' results don't have a title
1581                 if e.get('title') is not None:
1582                     e['title'] = '%s (%d)' % (e['title'], num)
1583             return {
1584                 '_type': 'playlist',
1585                 'entries': entries,
1586             }