10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.dailymotion import DailymotionIE
25 from .extractor.gametrailers import GametrailersIE
26 from .extractor.generic import GenericIE
27 from .extractor.metacafe import MetacafeIE
28 from .extractor.myvideo import MyVideoIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.vimeo import VimeoIE
32 from .extractor.yahoo import YahooIE, YahooSearchIE
33 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
34 from .extractor.zdf import ZDFIE
# NOTE(review): legacy regex-scraping extractor. This excerpt is line-sampled
# (gaps in the embedded numbering) — e.g. the `try:` opener before line 68 and
# the `return`/dict wrapper around lines 91-96 are not visible here.
54 class DepositFilesIE(InfoExtractor):
55     """Information extractor for depositfiles.com"""
57 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
59 def _real_extract(self, url):
# Last path segment is treated as the file id.
60 file_id = url.split('/')[-1]
61 # Rebuild url in english locale
62 url = 'http://depositfiles.com/en/files/' + file_id
64 # Retrieve file webpage with 'Free download' button pressed
65 free_download_indication = { 'gateway_result' : '1' }
# POST body simulates pressing the "Free download" button.
66 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
68 self.report_download_webpage(file_id)
69 webpage = compat_urllib_request.urlopen(request).read()
70 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
71 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
73 # Search for the real file URL
74 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
75 if (mobj is None) or (mobj.group(1) is None):
76 # Try to figure out reason of the error.
77 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
78 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site-provided restriction message.
79 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
80 raise ExtractorError(u'%s' % restriction_message)
82 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
84 file_url = mobj.group(1)
# Extension without the leading dot.
85 file_extension = os.path.splitext(file_url)[1][1:]
87 # Search for file title
88 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') on these values implies Python 2 byte
# strings — confirm the target runtime before reusing this code.
91 'id': file_id.decode('utf-8'),
92 'url': file_url.decode('utf-8'),
96 'ext': file_extension.decode('utf-8'),
# NOTE(review): excerpt is line-sampled — several statements (try openers,
# else branches, the login_form construction near line 145, the return dict
# wrapper) are missing from view. Code left byte-identical.
100 class FacebookIE(InfoExtractor):
101 """Information Extractor for Facebook"""
103 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
104 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
105 _NETRC_MACHINE = 'facebook'
106 IE_NAME = u'facebook'
108 def report_login(self):
109 """Report attempt to log in."""
110 self.to_screen(u'Logging in')
# Optional login step: credentials come from CLI options or ~/.netrc.
112 def _real_initialize(self):
113 if self._downloader is None:
118 downloader_params = self._downloader.params
120 # Attempt to use provided username and password or .netrc data
121 if downloader_params.get('username', None) is not None:
122 useremail = downloader_params['username']
123 password = downloader_params['password']
124 elif downloader_params.get('usenetrc', False):
126 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
131 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
132 except (IOError, netrc.NetrcParseError) as err:
# Login is best-effort: failures are warnings, not fatal errors.
133 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
136 if useremail is None:
145 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
148 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
149 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
150 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
152 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
153 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
156 def _real_extract(self, url):
157 mobj = re.match(self._VALID_URL, url)
159 raise ExtractorError(u'Invalid URL: %s' % url)
160 video_id = mobj.group('ID')
162 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
163 webpage = self._download_webpage(url, video_id)
# The video parameters are embedded in inline JS between these two markers.
165 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
166 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
167 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
169 raise ExtractorError(u'Cannot parse data')
170 data = dict(json.loads(m.group(1)))
171 params_raw = compat_urllib_parse.unquote(data['params'])
172 params = json.loads(params_raw)
173 video_data = params['video_data'][0]
# Prefer HD, fall back to SD (fallback branch partially out of view).
174 video_url = video_data.get('hd_src')
176 video_url = video_data['sd_src']
178 raise ExtractorError(u'Cannot find video URL')
179 video_duration = int(video_data['video_duration'])
180 thumbnail = video_data['thumbnail_src']
182 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
187 'title': video_title,
190 'duration': video_duration,
191 'thumbnail': thumbnail,
# NOTE(review): excerpt is line-sampled — the _video_extensions /
# _video_dimensions dict bodies, several if/else openers, the turls
# accumulator, and the results/return plumbing are missing from view.
199 class ComedyCentralIE(InfoExtractor):
200 """Information extractor for The Daily Show and Colbert Report """
202 # urls can be abbreviations like :thedailyshow or :colbert
203 # urls for episodes like:
204 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
205 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
206 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
207 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
208 |(https?://)?(www\.)?
209 (?P<showname>thedailyshow|colbertnation)\.com/
210 (full-episodes/(?P<episode>.*)|
212 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
213 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates, lowest to highest.
216 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
218 _video_extensions = {
226 _video_dimensions = {
# _VALID_URL is a verbose-mode regex, so the default suitable() (which
# matches without re.VERBOSE) is overridden here.
236 def suitable(cls, url):
237 """Receives a URL and returns True if suitable for this IE."""
238 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
240 def _print_formats(self, formats):
241 print('Available formats:')
243 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
246 def _real_extract(self, url):
247 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
249 raise ExtractorError(u'Invalid URL: %s' % url)
# ":tds" / ":colbert" style shortcuts are expanded to full-episode URLs.
251 if mobj.group('shortname'):
252 if mobj.group('shortname') in ('tds', 'thedailyshow'):
253 url = u'http://www.thedailyshow.com/full-episodes/'
255 url = u'http://www.colbertnation.com/full-episodes/'
256 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
257 assert mobj is not None
259 if mobj.group('clip'):
260 if mobj.group('showname') == 'thedailyshow':
261 epTitle = mobj.group('tdstitle')
263 epTitle = mobj.group('cntitle')
266 dlNewest = not mobj.group('episode')
268 epTitle = mobj.group('showname')
270 epTitle = mobj.group('episode')
272 self.report_extraction(epTitle)
273 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# Follow the server redirect to a specific episode page.
275 url = htmlHandle.geturl()
276 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
278 raise ExtractorError(u'Invalid redirected URL: ' + url)
279 if mobj.group('episode') == '':
280 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
281 epTitle = mobj.group('episode')
283 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
285 if len(mMovieParams) == 0:
286 # The Colbert Report embeds the information in a without
287 # a URL prefix; so extract the alternate reference
288 # and then add the URL prefix manually.
290 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
291 if len(altMovieParams) == 0:
292 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
294 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
296 uri = mMovieParams[0][1]
297 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
298 indexXml = self._download_webpage(indexUrl, epTitle,
299 u'Downloading show index',
300 u'unable to download episode index')
# One RSS <item> per episode part; each part is downloaded separately.
304 idoc = xml.etree.ElementTree.fromstring(indexXml)
305 itemEls = idoc.findall('.//item')
306 for partNum,itemEl in enumerate(itemEls):
307 mediaId = itemEl.findall('./guid')[0].text
308 shortMediaId = mediaId.split(':')[-1]
309 showId = mediaId.split(':')[-2].replace('.com', '')
310 officialTitle = itemEl.findall('./title')[0].text
311 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
313 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
314 compat_urllib_parse.urlencode({'uri': mediaId}))
315 configXml = self._download_webpage(configUrl, epTitle,
316 u'Downloading configuration for %s' % shortMediaId)
318 cdoc = xml.etree.ElementTree.fromstring(configXml)
320 for rendition in cdoc.findall('.//rendition'):
# (bitrate, rtmp-url) pairs; accumulation list is out of view.
321 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
325 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
328 if self._downloader.params.get('listformats', None):
329 self._print_formats([i[0] for i in turls])
332 # For now, just pick the highest bitrate
333 format,rtmp_video_url = turls[-1]
335 # Get the format arg from the arg stream
336 req_format = self._downloader.params.get('format', None)
338 # Select format if we can find one
341 format, rtmp_video_url = f, v
# Rewrite the RTMP URL into a plain HTTP mp4 URL on the CDN.
344 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
346 raise ExtractorError(u'Cannot transform RTMP url')
347 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
348 video_url = base + m.group('finalid')
350 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
355 'upload_date': officialDate,
360 'description': officialTitle,
# NOTE(review): excerpt is line-sampled; the return-dict wrapper and some
# intermediate lines are missing from view. Code left byte-identical.
367 class EscapistIE(InfoExtractor):
368 """Information extractor for The Escapist """
370 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
371 IE_NAME = u'escapist'
373 def _real_extract(self, url):
374 mobj = re.match(self._VALID_URL, url)
376 raise ExtractorError(u'Invalid URL: %s' % url)
377 showName = mobj.group('showname')
378 videoId = mobj.group('episode')
380 self.report_extraction(videoId)
381 webpage = self._download_webpage(url, videoId)
# Metadata comes from <meta> tags; all optional except the player URL.
383 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
384 webpage, u'description', fatal=False)
386 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
387 webpage, u'thumbnail', fatal=False)
389 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
390 webpage, u'player url')
# NOTE(review): the label u'player url' below looks like a copy-paste slip —
# this regex extracts the title (it keeps the last ' : '-separated segment).
392 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
393 webpage, u'player url').split(' : ')[-1]
# The player URL carries a percent-encoded config URL in its query string.
395 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
396 configUrl = compat_urllib_parse.unquote(configUrl)
398 configJSON = self._download_webpage(configUrl, videoId,
399 u'Downloading configuration',
400 u'unable to download configuration')
402 # Technically, it's JavaScript, not JSON
# Crude single-to-double-quote fix-up so json.loads can parse it.
403 configJSON = configJSON.replace("'", '"')
406 config = json.loads(configJSON)
407 except (ValueError,) as err:
408 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
410 playlist = config['playlist']
# Index 1 is assumed to be the actual video entry — TODO confirm.
411 videoUrl = playlist[1]['url']
416 'uploader': showName,
421 'description': videoDesc,
422 'player_url': playerUrl,
# NOTE(review): excerpt is line-sampled — the info-dict initialization,
# try openers, and the return statement are missing from view.
427 class CollegeHumorIE(InfoExtractor):
428 """Information extractor for collegehumor.com"""
431 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
432 IE_NAME = u'collegehumor'
434 def report_manifest(self, video_id):
435 """Report information extraction."""
436 self.to_screen(u'%s: Downloading XML manifest' % video_id)
438 def _real_extract(self, url):
439 mobj = re.match(self._VALID_URL, url)
441 raise ExtractorError(u'Invalid URL: %s' % url)
442 video_id = mobj.group('videoid')
450 self.report_extraction(video_id)
# Step 1: per-video metadata XML.
451 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
453 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
454 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
455 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
457 mdoc = xml.etree.ElementTree.fromstring(metaXml)
459 videoNode = mdoc.findall('./video')[0]
460 info['description'] = videoNode.findall('./description')[0].text
461 info['title'] = videoNode.findall('./caption')[0].text
462 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
463 manifest_url = videoNode.findall('./file')[0].text
465 raise ExtractorError(u'Invalid metadata XML file')
# Step 2: Adobe HDS (f4m) manifest; hdcore param required by the server.
467 manifest_url += '?hdcore=2.10.3'
468 self.report_manifest(video_id)
470 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
472 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
474 adoc = xml.etree.ElementTree.fromstring(manifestXml)
476 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
477 node_id = media_node.attrib['url']
478 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
479 except IndexError as err:
480 raise ExtractorError(u'Invalid manifest file')
# Rebuild the first-fragment URL from the manifest's own location.
482 url_pr = compat_urllib_parse_urlparse(manifest_url)
483 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): excerpt is line-sampled; the return-dict wrapper is out of view.
490 class XVideosIE(InfoExtractor):
491 """Information extractor for xvideos.com"""
493 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
496 def _real_extract(self, url):
497 mobj = re.match(self._VALID_URL, url)
499 raise ExtractorError(u'Invalid URL: %s' % url)
500 video_id = mobj.group(1)
502 webpage = self._download_webpage(url, video_id)
504 self.report_extraction(video_id)
# Video URL is percent-encoded in a flv_url= page parameter.
507 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
508 webpage, u'video URL'))
510 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
514 # Extract video thumbnail
515 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
516 webpage, u'thumbnail', fatal=False)
523 'title': video_title,
525 'thumbnail': video_thumbnail,
# NOTE(review): excerpt is line-sampled — the docstring terminator and the
# return-dict wrapper are out of view. Code left byte-identical.
532 class SoundcloudIE(InfoExtractor):
533 """Information extractor for soundcloud.com
534        To access the media, the uid of the song and a stream token
535        must be extracted from the page source and the script must make
536        a request to media.soundcloud.com/crossdomain.xml. Then
537        the media can be grabbed by requesting from an url composed
538        of the stream token and uid
541 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
542 IE_NAME = u'soundcloud'
544 def report_resolve(self, video_id):
545 """Report information extraction."""
546 self.to_screen(u'%s: Resolving id' % video_id)
548 def _real_extract(self, url):
549 mobj = re.match(self._VALID_URL, url)
551 raise ExtractorError(u'Invalid URL: %s' % url)
553 # extract uploader (which is in the url)
554 uploader = mobj.group(1)
555 # extract simple title (uploader + slug of song title)
556 slug_title = mobj.group(2)
557 simple_title = uploader + u'-' + slug_title
558 full_title = '%s/%s' % (uploader, slug_title)
560 self.report_resolve(full_title)
# Resolve the vanity URL to a numeric track id via the public API.
562 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
563 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
564 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
566 info = json.loads(info_json)
567 video_id = info['id']
568 self.report_extraction(full_title)
# Second API call returns per-format stream URLs for the track.
570 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
571 stream_json = self._download_webpage(streams_url, full_title,
572 u'Downloading stream definitions',
573 u'unable to download stream definitions')
575 streams = json.loads(stream_json)
576 mediaURL = streams['http_mp3_128_url']
577 upload_date = unified_strdate(info['created_at'])
582 'uploader': info['user']['username'],
583 'upload_date': upload_date,
584 'title': info['title'],
586 'description': info['description'],
# NOTE(review): set (playlist) variant of SoundcloudIE; excerpt is
# line-sampled — the docstring terminator, try openers, the per-track
# results accumulation and the return are out of view.
589 class SoundcloudSetIE(InfoExtractor):
590 """Information extractor for soundcloud.com sets
591        To access the media, the uid of the song and a stream token
592        must be extracted from the page source and the script must make
593        a request to media.soundcloud.com/crossdomain.xml. Then
594        the media can be grabbed by requesting from an url composed
595        of the stream token and uid
598 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
599 IE_NAME = u'soundcloud:set'
601 def report_resolve(self, video_id):
602 """Report information extraction."""
603 self.to_screen(u'%s: Resolving id' % video_id)
605 def _real_extract(self, url):
606 mobj = re.match(self._VALID_URL, url)
608 raise ExtractorError(u'Invalid URL: %s' % url)
610 # extract uploader (which is in the url)
611 uploader = mobj.group(1)
612 # extract simple title (uploader + slug of song title)
613 slug_title = mobj.group(2)
614 simple_title = uploader + u'-' + slug_title
615 full_title = '%s/sets/%s' % (uploader, slug_title)
617 self.report_resolve(full_title)
# Resolve the set URL to its JSON description (list of tracks).
619 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
620 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
621 info_json = self._download_webpage(resolv_url, full_title)
624 info = json.loads(info_json)
# API-level errors are reported per entry in an 'errors' list.
626 for err in info['errors']:
627 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
630 self.report_extraction(full_title)
631 for track in info['tracks']:
632 video_id = track['id']
634 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
635 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
637 self.report_extraction(video_id)
638 streams = json.loads(stream_json)
639 mediaURL = streams['http_mp3_128_url']
644 'uploader': track['user']['username'],
645 'upload_date': unified_strdate(track['created_at']),
646 'title': track['title'],
648 'description': track['description'],
# NOTE(review): excerpt is line-sampled; the return-dict wrapper is out of view.
653 class InfoQIE(InfoExtractor):
654 """Information extractor for infoq.com"""
655 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
657 def _real_extract(self, url):
658 mobj = re.match(self._VALID_URL, url)
660 raise ExtractorError(u'Invalid URL: %s' % url)
# No separate id in the URL pattern, so the URL itself is used as the id.
662 webpage = self._download_webpage(url, video_id=url)
663 self.report_extraction(url)
666 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
668 raise ExtractorError(u'Unable to extract video url')
# The real media path is base64- and percent-encoded in the page JS.
669 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
670 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
673 video_title = self._search_regex(r'contentTitle = "(.*?)";',
676 # Extract description
677 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
678 webpage, u'description', fatal=False)
# Derive id and extension from the media file name.
680 video_filename = video_url.split('/')[-1]
681 video_id, extension = video_filename.split('.')
688 'title': video_title,
689 'ext': extension, # Extension is always(?) mp4, but seems to be flv
691 'description': video_description,
# NOTE(review): extractor is disabled (_WORKING = False). Excerpt is
# line-sampled — try openers, returns, and break statements are out of view.
696 class MixcloudIE(InfoExtractor):
697 """Information extractor for www.mixcloud.com"""
699 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
700 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
701 IE_NAME = u'mixcloud'
703 def report_download_json(self, file_id):
704 """Report JSON download."""
705 self.to_screen(u'Downloading json')
707 def get_urls(self, jsonData, fmt, bitrate='best'):
708 """Get urls from 'audio_formats' section in json"""
711 bitrate_list = jsonData[fmt]
# 'best' or an unknown bitrate falls back to the highest available one.
712 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
713 bitrate = max(bitrate_list) # select highest
715 url_list = jsonData[fmt][bitrate]
716 except TypeError: # we have no bitrate info.
717 url_list = jsonData[fmt]
720 def check_urls(self, url_list):
721 """Returns 1st active url from list"""
# Probes each candidate URL and keeps the first that opens successfully.
724 compat_urllib_request.urlopen(url)
726 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
731 def _print_formats(self, formats):
732 print('Available formats:')
733 for fmt in formats.keys():
734 for b in formats[fmt]:
736 ext = formats[fmt][b][0]
737 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
738 except TypeError: # we have no bitrate info
739 ext = formats[fmt][0]
740 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
743 def _real_extract(self, url):
744 mobj = re.match(self._VALID_URL, url)
746 raise ExtractorError(u'Invalid URL: %s' % url)
747 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups implies Python 2 bytes.
748 uploader = mobj.group(1).decode('utf-8')
749 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
751 # construct API request
752 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
753 # retrieve .json file with links to files
754 request = compat_urllib_request.Request(file_url)
756 self.report_download_json(file_url)
757 jsonData = compat_urllib_request.urlopen(request).read()
758 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
759 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
762 json_data = json.loads(jsonData)
763 player_url = json_data['player_swf_url']
764 formats = dict(json_data['audio_formats'])
766 req_format = self._downloader.params.get('format', None)
769 if self._downloader.params.get('listformats', None):
770 self._print_formats(formats)
# Default: scan all formats and take the first with a working URL.
773 if req_format is None or req_format == 'best':
774 for format_param in formats.keys():
775 url_list = self.get_urls(formats, format_param)
777 file_url = self.check_urls(url_list)
778 if file_url is not None:
781 if req_format not in formats:
782 raise ExtractorError(u'Format is not available')
784 url_list = self.get_urls(formats, req_format)
785 file_url = self.check_urls(url_list)
786 format_param = req_format
789 'id': file_id.decode('utf-8'),
790 'url': file_url.decode('utf-8'),
791 'uploader': uploader.decode('utf-8'),
793 'title': json_data['name'],
794 'ext': file_url.split('.')[-1].decode('utf-8'),
795 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
796 'thumbnail': json_data['thumbnail_url'],
797 'description': json_data['description'],
798 'player_url': player_url.decode('utf-8'),
# NOTE(review): three-way dispatch (video page / course page / root page);
# excerpt is line-sampled — info-dict initializers, list-building loops and
# return statements are out of view. Code left byte-identical.
801 class StanfordOpenClassroomIE(InfoExtractor):
802 """Information extractor for Stanford's Open ClassRoom"""
804 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
805 IE_NAME = u'stanfordoc'
807 def _real_extract(self, url):
808 mobj = re.match(self._VALID_URL, url)
810 raise ExtractorError(u'Invalid URL: %s' % url)
812 if mobj.group('course') and mobj.group('video'): # A specific video
813 course = mobj.group('course')
814 video = mobj.group('video')
816 'id': course + '_' + video,
821 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course videos.
822 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
823 xmlUrl = baseUrl + video + '.xml'
825 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
826 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
827 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
828 mdoc = xml.etree.ElementTree.fromstring(metaXml)
830 info['title'] = mdoc.findall('./title')[0].text
831 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
833 raise ExtractorError(u'Invalid metadata XML file')
834 info['ext'] = info['url'].rpartition('.')[2]
836 elif mobj.group('course'): # A course page
837 course = mobj.group('course')
845 coursepage = self._download_webpage(url, info['id'],
846 note='Downloading course info page',
847 errnote='Unable to download course info page')
849 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
851 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
852 coursepage, u'description', fatal=False)
# Collect links to every video page of the course, de-duplicated in order.
854 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
858 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract each referenced video page.
862 for entry in info['list']:
863 assert entry['type'] == 'reference'
864 results += self.extract(entry['url'])
868 'id': 'Stanford OpenClassroom',
874 self.report_download_webpage(info['id'])
875 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
877 rootpage = compat_urllib_request.urlopen(rootURL).read()
878 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
879 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
881 info['title'] = info['id']
883 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
887 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
892 for entry in info['list']:
893 assert entry['type'] == 'reference'
894 results += self.extract(entry['url'])
# NOTE(review): excerpt is line-sampled — the performer extraction (used at
# line 949) and the return-dict wrapper are out of view.
897 class MTVIE(InfoExtractor):
898 """Information extractor for MTV.com"""
900 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
903 def _real_extract(self, url):
904 mobj = re.match(self._VALID_URL, url)
906 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL, so normalize to http:// here.
907 if not mobj.group('proto'):
908 url = 'http://' + url
909 video_id = mobj.group('videoid')
911 webpage = self._download_webpage(url, video_id)
913 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
914 webpage, u'song name', fatal=False)
916 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
919 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
920 webpage, u'mtvn_uri', fatal=False)
922 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
923 webpage, u'content id', fatal=False)
# mediaGen feed resolves the page metadata into actual stream renditions.
925 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
926 self.report_extraction(video_id)
927 request = compat_urllib_request.Request(videogen_url)
929 metadataXml = compat_urllib_request.urlopen(request).read()
930 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
931 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
933 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
934 renditions = mdoc.findall('.//rendition')
936 # For now, always pick the highest quality.
937 rendition = renditions[-1]
# Format label: ext-WxH_bitrate, derived from the rendition attributes.
940 _,_,ext = rendition.attrib['type'].partition('/')
941 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
942 video_url = rendition.find('./src').text
944 raise ExtractorError('Invalid rendition field.')
949 'uploader': performer,
951 'title': video_title,
# NOTE(review): excerpt is line-sampled — the _gen_sid def line, several
# format-selection branches, try openers and the final return are out of
# view. Code left byte-identical.
959 class YoukuIE(InfoExtractor):
960 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers.
963 nowTime = int(time.time() * 1000)
964 random1 = random.randint(1000,1998)
965 random2 = random.randint(1000,9999)
967 return "%d%d%d" %(nowTime,random1,random2)
969 def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of this alphabet, keyed by the
# server-provided seed; used to decode the obfuscated file id.
971 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
973 for i in range(len(source)):
974 seed = (seed * 211 + 30031 ) % 65536
975 index = math.floor(seed / 65536 * len(source) )
976 mixed.append(source[int(index)])
977 source.remove(source[int(index)])
978 #return ''.join(mixed)
981 def _get_file_id(self, fileId, seed):
# Each '*'-separated token is an index into the mixed alphabet.
982 mixed = self._get_file_ID_mix_string(seed)
983 ids = fileId.split('*')
987 realId.append(mixed[int(ch)])
988 return ''.join(realId)
990 def _real_extract(self, url):
991 mobj = re.match(self._VALID_URL, url)
993 raise ExtractorError(u'Invalid URL: %s' % url)
994 video_id = mobj.group('ID')
996 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
998 jsondata = self._download_webpage(info_url, video_id)
1000 self.report_extraction(video_id)
1002 config = json.loads(jsondata)
1004 video_title = config['data'][0]['title']
1005 seed = config['data'][0]['seed']
1007 format = self._downloader.params.get('format', None)
1008 supported_format = list(config['data'][0]['streamfileids'].keys())
# Quality selection: prefer hd2 for 'best' (other branches out of view).
1010 if format is None or format == 'best':
1011 if 'hd2' in supported_format:
1016 elif format == 'worst':
1024 fileid = config['data'][0]['streamfileids'][format]
1025 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1026 except (UnicodeDecodeError, ValueError, KeyError):
1027 raise ExtractorError(u'Unable to extract info section')
1030 sid = self._gen_sid()
1031 fileid = self._get_file_id(fileid, seed)
1033 #column 8,9 of fileid represent the segment number
1034 #fileid[7:9] should be changed
# One download URL per segment; segment number is hex-encoded into the id.
1035 for index, key in enumerate(keys):
1037 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1038 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1041 'id': '%s_part%02d' % (video_id, index),
1042 'url': download_url,
1044 'upload_date': None,
1045 'title': video_title,
1048 files_info.append(info)
# NOTE(review): excerpt is line-sampled; the return-dict wrapper is out of view.
1053 class XNXXIE(InfoExtractor):
1054 """Information extractor for xnxx.com"""
1056 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns kept as class constants.
1058 VIDEO_URL_RE = r'flv_url=(.*?)&'
1059 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1060 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1062 def _real_extract(self, url):
1063 mobj = re.match(self._VALID_URL, url)
1065 raise ExtractorError(u'Invalid URL: %s' % url)
1066 video_id = mobj.group(1)
1068 # Get webpage content
1069 webpage = self._download_webpage(url, video_id)
# flv_url parameter is percent-encoded in the page.
1071 video_url = self._search_regex(self.VIDEO_URL_RE,
1072 webpage, u'video URL')
1073 video_url = compat_urllib_parse.unquote(video_url)
1075 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1078 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1079 webpage, u'thumbnail', fatal=False)
1085 'upload_date': None,
1086 'title': video_title,
1088 'thumbnail': video_thumbnail,
1089 'description': None,
# NOTE(review): excerpt is line-sampled — the empty-findall check near line
# 1142, the try opener before line 1153 and the return-dict wrapper are out
# of view. Code left byte-identical.
1093 class GooglePlusIE(InfoExtractor):
1094 """Information extractor for plus.google.com."""
1096 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1097 IE_NAME = u'plus.google'
1099 def _real_extract(self, url):
1100 # Extract id from URL
1101 mobj = re.match(self._VALID_URL, url)
1103 raise ExtractorError(u'Invalid URL: %s' % url)
1105 post_url = mobj.group(0)
1106 video_id = mobj.group(1)
1108 video_extension = 'flv'
1110 # Step 1, Retrieve post webpage to extract further information
1111 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1113 self.report_extraction(video_id)
1115 # Extract update date
1116 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1117 webpage, u'upload date', fatal=False)
1119 # Convert timestring to a format suitable for filename
# NOTE(review): strptime will raise if upload_date is None (fatal=False
# above can return None) — guard is not visible in this excerpt.
1120 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1121 upload_date = upload_date.strftime('%Y%m%d')
1124 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1125 webpage, u'uploader', fatal=False)
1128 # Get the first line for title
1129 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1130 webpage, 'title', default=u'NA')
1132 # Step 2, Stimulate clicking the image box to launch video
1133 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1134 webpage, u'video page URL')
1135 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1137 # Extract video links on video page
1138 """Extract video links of all sizes"""
1139 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1140 mobj = re.findall(pattern, webpage)
1142 raise ExtractorError(u'Unable to extract video links')
1144 # Sort in resolution
1145 links = sorted(mobj)
1147 # Choose the lowest of the sort, i.e. highest resolution
1148 video_url = links[-1]
1149 # Only get the url. The resolution part in the tuple has no use anymore
1150 video_url = video_url[-1]
1151 # Treat escaped \u0026 style hex
1153 video_url = video_url.decode("unicode_escape")
1154 except AttributeError: # Python 3
1155 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
1161 'uploader': uploader,
1162 'upload_date': upload_date,
1163 'title': video_title,
1164 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is built directly from the page path; no page scraping needed for it.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Last path component doubles as the short id and the title fallback.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'id': shortened_video_id,
        # 'uploader_date': uploader_date,
        'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a whole channel, a single broadcast (/b/), or a chapter (/c/).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    # Maximum number of videos the justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert each clip into an info dict."""
        webpage = self._download_webpage(url, video_id,
                u'Downloading video info JSON',
                u'unable to download video info JSON')

        response = json.loads(webpage)
        # On error the API returns a dict instead of a list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-like; strip dashes from the date part for YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # Fields of the per-clip info dict (enclosing literal partly elided here).
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        # Pick the API endpoint depending on which named group matched.
        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The archive id is only available in the chapter's HTML page.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                note=u'Downloading chapter information',
                errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element whose id matches the page's archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the newer Kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                note='Downloading chapter metadata',
                errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            # Fields of the chapter info dict (enclosing literal partly elided here).
            'id': u'c' + chapter_id,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
            # Fallback branch: a single broadcast id (/b/ URL).
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API, _JUSTIN_PAGE_LIMIT clips at a time, until a short page.
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The real stream URL sits in the second <source> of the <video> tag.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player heading first, then fall back to the <title> tag.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        # Field of the returned info dict (enclosing literal partly elided here).
        'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
        (?P<urltype>video|app)/ #If the page is only for videos or for a game
        (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Age-gate bypass: submits a fixed 1970 birth date via query parameters.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose (?x-style) mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of all trailers found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # If the store interposes an age check, re-request through the bypass URL.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
            webpage, 'game title')

        # Three parallel scans over the page: movie entries, titles, thumbnails.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        # zip() pairs the three scans positionally; assumes they stay in sync — TODO confirm.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            # Missing-URL guard (condition line elided in this view).
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
            # Fields of the per-video info dict (enclosing literal partly elided here).
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The stream URL is derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
        # Uploader and thumbnail are optional (fatal=False).
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'uploader': uploader,
        'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com (and WSHH candy) videos."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The flash player receives the file URL via so.addVariable("file", ...).
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        # Candy pages carry the title in a different element; prefer it when present.
        _title = r"""candytitles.*>(.*)</span>"""
        mobj = re.search(_title, webpage_src)
        if mobj is not None:
            video_title = mobj.group(1)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title' : video_title,
        'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in a JS assignment: window.gon ... gon.show={...};
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force 256 kbps delivery on the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the entry whose 'format' field equals req_format (loop header elided here).
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Pretend the age gate has already been passed.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        params = json.loads(json_params)
        raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)

        video_title = params['title']
        upload_date = unified_strdate(params['release_date_f'])
        video_description = params['description']
        video_uploader = params['submitted_by']
        thumbnail = params['thumbnails'][0]['image']
        raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Path segment 4 looks like "480p_370k_<id>"; keep resolution and bitrate.
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        # title = u'%s-%s-%s' % (video_title, size, bitrate)

        # Fields of each format's info dict (enclosing literal partly elided here).
        'uploader': video_uploader,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': thumbnail,
        'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Map the requested format spec onto the (best-first) formats list.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
            # Missing-format guard (condition line elided in this view).
            raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken from the URL itself, not the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL appears percent-encoded in the player config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        # Normalize to YYYYMMDD only when a date was actually found.
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                # Remaining fields (some lines of this literal elided in this view).
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        # Missing-embed guard (condition line elided in this view).
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The numeric embed id supersedes the slug-based id from the URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed player passes the file URL through encodeURIComponent.
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                # Remaining fields (some lines of this literal elided in this view).
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment: PAGE.mix = {...};
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Random session id required by the play API.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): mix_id is defined on a line not shown here — presumably data['id'].
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the play/next API until the service reports the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # Fields of the per-track info dict (enclosing literal partly elided here).
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both media URLs are derived directly from the video id on the CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
        # Uploader is optional (fatal=False).
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode pattern.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on the URL shape: a single talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Body of the video_RE verbose regex (its opening line is elided in this view).
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Each playlist entry is deferred to this same IE via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        # Talk metadata is embedded as a JS object literal: var talkDetails = {...}
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
        # Fields of the returned info dict (enclosing literal partly elided here);
        # htmlStreams is ordered so the last entry is the one picked.
        'url': info['htmlStreams'][-1]['file'],
        'thumbnail': thumbnail,
        'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de; metadata comes from an XML API."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the metadata XML document for this video id.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
        format = format_id_el.text
        # description and thumbnail are optional elements in the XML.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # Fields of the returned info dict (enclosing literal partly elided here).
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (metadata via flash XML)."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
        # Per-video XML descriptor listing the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child element is the best/preferred encoding.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Player config holds the direct file URL.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'description': video_description,
        'uploader': video_uploader
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL appears inside escaped JS (\x22 quotes) in the page.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # Missing-video guard (condition line elided in this view).
        raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        # Unescape the backslash-escaped thumbnail URL when one was found.
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 # Remaining fields (some lines of this literal elided in this view).
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for bandcamp.com free track downloads."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Track id is read out of the embedded TralbumData JS object.
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      # Remaining fields (some lines of this literal elided in this view).
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # Invalid-URL guard (condition line elided in this view).
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Direct mp4 URL from the HTML5 <source> tag.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
        # Fields of the returned info dict (enclosing literal partly elided here).
        'ext': video_extension,
        'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata is served as an MRSS feed keyed on the video id.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # Title is wrapped in a CDATA section inside the feed.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
        # Fields of the returned info dict (enclosing literal partly elided here).
        'ext': video_extension,
        'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Rebuild a canonical page URL from the id before downloading.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # Meta tags may use either double or single quotes; both alternations are tried.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)
        # Fields of the returned info dict (enclosing literal partly elided here).
        'title': video_title,
        'description': video_description,
        'thumbnail': thumbnail,
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # Always fetch over HTTPS regardless of the scheme the user gave.
        page = self._download_webpage('https://vine.co/v/%s' % video_id, video_id)
        self.report_extraction(video_id)

        # Stream URL and metadata are exposed through twitter/og meta tags.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        # The photo page carries the per-video "secret" needed by the API.
        page_url = 'http://www.flickr.com/photos/%s/%s' % (video_uploader_id, video_id)
        webpage = self._download_webpage(page_url, video_id)
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # First API round-trip: resolve the internal node id for the video.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self' % (video_id, secret)
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Second round-trip: the playlist XML with the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s&rd=video.yahoo.com&noad=1' % (node_id, secret)
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # FULLPATH is HTML-escaped inside the XML attribute.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # og: meta tags may use either quote style.
        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')

        # The URL only carries a slug; the numeric id is on the page itself.
        webpage = self._download_webpage(url, url_title)
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Stream URL comes from a separate per-video XML document.
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # BUGFIX: "www." previously used an unescaped dot, so e.g. "wwwX" also
    # matched; escape it to match only the literal "www." prefix.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL, title, upload date, uploader and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The inline player config holds a 'srv'/'file' pair; an empty server
        # means 'file' is already a full (percent-encoded) URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only present as a tooltip hint (YYYY-MM-DD hh:mm:ss TZ).
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Append ax/ts query parameters and keep the session cookie the
        # server hands back -- the /serve endpoint below requires it.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds its track list as JSON in a <script> island.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request resolves the actual media URL for (id, key).
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page is a JS redirect stub; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Page title looks like "Title / extra" -- keep only the first part.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the response is a flat k=v&k=v string
        # carrying the media URL and the thumbnail URL.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3':'1','vid':video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': 'flv',
            'title': title,
            'thumbnail': thumbnail_url,
        }]
# NOTE(review): this span is a garbled/elided paste -- literal line numbers
# are baked into the text, the interior of the extractor list (dozens of
# entries between the visible ones) and the closing bracket/return are
# missing. Left byte-identical; restore from the original file before use.
# Purpose (from the visible docstring): build the ordered list of extractor
# instances -- order matters, the first extractor whose URL pattern matches
# handles the download.
2407 def gen_extractors():
2408     """ Return a list of an instance of every supported extractor.
2409     The order does matter; the first extractor matched is the one handling the URL.
2412         YoutubePlaylistIE(),
2437         StanfordOpenClassroomIE(),
2447         WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    Looks up the module-level class named '<ie_name>IE' (e.g. 'Youtube'
    resolves to YoutubeIE); raises KeyError when no such class exists.
    """
    return globals()['%sIE' % ie_name]