2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.metacafe import MetacafeIE
31 from .extractor.statigram import StatigramIE
32 from .extractor.photobucket import PhotobucketIE
33 from .extractor.vimeo import VimeoIE
34 from .extractor.yahoo import YahooIE
35 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
36 from .extractor.zdf import ZDFIE
# Last-resort extractor: first follows URL-shortener redirects via a HEAD
# request, then scans the raw page HTML for common embed patterns
# (JW Player flashvars, file=/source= params, JWPlayer JS loader,
# twitter:player:stream meta, og:video meta).
# NOTE(review): this extract is elided — several original lines are missing
# from view (e.g. the `try:` matching the `except ValueError` below, the
# HeadRequest body, and the mobj None-checks between the regex fallbacks).
# Confirm against the upstream file before changing any logic here.
48 class GenericIE(InfoExtractor):
49     """Generic last-resort information extractor."""
54     def report_download_webpage(self, video_id):
55         """Report webpage download."""
# Warn the user (outside test mode) that no specific extractor matched.
56         if not self._downloader.params.get('test', False):
57             self._downloader.report_warning(u'Falling back on generic information extractor.')
58         super(GenericIE, self).report_download_webpage(video_id)
60     def report_following_redirect(self, new_url):
61         """Report information extraction."""
62         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
64     def _test_redirect(self, url):
65         """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass forcing the HEAD method (body elided in this view).
66         class HeadRequest(compat_urllib_request.Request):
70         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
72             Subclass the HTTPRedirectHandler to make it use our
73             HeadRequest also on the redirected URL
75             def redirect_request(self, req, fp, code, msg, headers, newurl):
76                 if code in (301, 302, 303, 307):
# Spaces are illegal in URLs; percent-encode them before re-requesting.
77                     newurl = newurl.replace(' ', '%20')
# Strip body-describing headers: a HEAD re-request carries no body.
78                     newheaders = dict((k,v) for k,v in req.headers.items()
79                                       if k.lower() not in ("content-length", "content-type"))
80                     return HeadRequest(newurl,
82                                        origin_req_host=req.get_origin_req_host(),
# Any non-3xx redirect code reaching here is an error.
85                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
87         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
89             Fallback to GET if HEAD is not allowed (405 HTTP error)
91             def http_error_405(self, req, fp, code, msg, headers):
95                 newheaders = dict((k,v) for k,v in req.headers.items()
96                                   if k.lower() not in ("content-length", "content-type"))
97                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
99                                                  origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers we need; order matters here
# (fallback/redirect handlers before the error processor).
103         opener = compat_urllib_request.OpenerDirector()
104         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
105                         HTTPMethodFallback, HEADRedirectHandler,
106                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
107             opener.add_handler(handler())
109         response = opener.open(HeadRequest(url))
# NOTE(review): the condition guarding this raise is not visible in this
# extract — presumably a None/scheme check on the response.
111             raise ExtractorError(u'Invalid URL protocol')
112         new_url = response.geturl()
117         self.report_following_redirect(new_url)
120     def _real_extract(self, url):
# If the URL was a shortener, delegate to the resolved target instead.
121         new_url = self._test_redirect(url)
122         if new_url: return [self.url_result(new_url)]
# Crude video id: last path component of the URL.
124         video_id = url.split('/')[-1]
# NOTE(review): the `try:` for this except is missing from this extract.
126             webpage = self._download_webpage(url, video_id)
127         except ValueError as err:
128             # since this is the last-resort InfoExtractor, if
129             # this error is thrown, it'll be thrown here
130             raise ExtractorError(u'Invalid URL: %s' % url)
132         self.report_extraction(video_id)
133         # Start with something easy: JW Player in SWFObject
134         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
136             # Broaden the search a little bit
137             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
139             # Broaden the search a little bit: JWPlayer JS loader
140             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
142             # Try to find twitter cards info
143             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
145             # We look for Open Graph info:
146             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
147             m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
148             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
149             if m_video_type is not None:
150                 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
152             raise ExtractorError(u'Invalid URL: %s' % url)
154         # It's possible that one of the regexes
155         # matched, but returned an empty group:
156         if mobj.group(1) is None:
157             raise ExtractorError(u'Invalid URL: %s' % url)
159         video_url = compat_urllib_parse.unquote(mobj.group(1))
160         video_id = os.path.basename(video_url)
162         # here's a fun little line of code for you:
# Derive extension and id from the URL basename (extension sans the dot).
163         video_extension = os.path.splitext(video_id)[1][1:]
164         video_id = os.path.splitext(video_id)[0]
166         # it's tempting to parse this further, but you would
167         # have to take into account all the variations like
168         #   Video Title - Site Name
169         #   Site Name | Video Title
170         #   Video Title - Tagline  | Site Name
171         # and so on and so forth; it's just not practical
172         video_title = self._html_search_regex(r'<title>(.*)</title>',
173             webpage, u'video title')
175         # video uploader is domain name
176         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
177             url, u'video uploader')
# NOTE(review): the returned info-dict literal is partially elided here.
182             'uploader': video_uploader,
184             'title': video_title,
185             'ext': video_extension,
# Search extractor for Google Video (gvsearch: prefix). Pages through the
# HTML results 10 at a time until either n results are collected or the
# "next page" marker disappears.
# NOTE(review): the lines initializing `res` (the playlist result dict with
# an 'entries' list) are missing from this extract.
190 class GoogleSearchIE(SearchInfoExtractor):
191     """Information Extractor for Google Video search queries."""
# Presence of this id/class pair in the HTML signals more result pages.
192     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
194     IE_NAME = u'video.google:search'
195     _SEARCH_KEY = 'gvsearch'
197     def _get_n_results(self, query, n):
198         """Get a specified number of results for a query"""
# Google paginates with start=0,10,20,... — pagenum*10 is the offset.
206         for pagenum in itertools.count(1):
207             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
208             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
209                                              note='Downloading result page ' + str(pagenum))
# Each result link lives in an <h3 class="r"> anchor.
211             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
216                 res['entries'].append(e)
# Stop once we have enough results or there is no next-page marker.
218             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# Search extractor for Yahoo! Screen (yvsearch: prefix). Queries the JSON
# search endpoint 30 results at a time and emits url_result entries that
# the 'Yahoo' extractor will resolve.
# NOTE(review): elided extract — the `res` initialization is missing, and
# line 250 references `m` which is never assigned in this view (upstream
# binds m = info[u'm'] from the response); confirm before editing.
221 class YahooSearchIE(SearchInfoExtractor):
222     """Information Extractor for Yahoo! Video search queries."""
225     IE_NAME = u'screen.yahoo:search'
226     _SEARCH_KEY = 'yvsearch'
228     def _get_n_results(self, query, n):
229         """Get a specified number of results for a query"""
# b= is the 0-based result offset; 30 results per page.
236         for pagenum in itertools.count(0):
237             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
238             webpage = self._download_webpage(result_url, query,
239                                              note='Downloading results page '+str(pagenum+1))
# The endpoint returns JSON despite being fetched as a "webpage".
240             info = json.loads(webpage)
242             results = info[u'results']
244             for (i, r) in enumerate(results):
# Stop appending once n results have been gathered.
245                 if (pagenum * 30) +i >= n:
247                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
248                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
249                 res['entries'].append(e)
250             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# Playlist extractor for a blip.tv user page: resolves the numeric user id
# from the mobile page, then pages through the Ajax episode list collecting
# per-video URLs until a short (non-full) page signals the end.
# NOTE(review): elided extract — the loop header driving `pagenum` and the
# `ids_in_page = []` reset are not visible here.
256 class BlipTVUserIE(InfoExtractor):
257     """Information Extractor for blip.tv users."""
259     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
261     IE_NAME = u'blip.tv:user'
263     def _real_extract(self, url):
265         mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard line is missing from view.
267             raise ExtractorError(u'Invalid URL: %s' % url)
269         username = mobj.group(1)
# %s placeholder is filled with the numeric users_id scraped below.
271         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
273         page = self._download_webpage(url, username, u'Downloading user page')
274         mobj = re.search(r'data-users-id="([^"]+)"', page)
275         page_base = page_base % mobj.group(1)
278         # Download video ids using BlipTV Ajax calls. Result size per
279         # query is limited (currently to 12 videos) so we need to query
280         # page by page until there are no video ids - it means we got
287             url = page_base + "&page=" + str(pagenum)
288             page = self._download_webpage(url, username,
289                                           u'Downloading video ids from page %d' % pagenum)
291             # Extract video identifiers
# Every href on the listing page is treated as a candidate video path;
# duplicates within a page are skipped.
294             for mobj in re.finditer(r'href="/([^"]+)"', page):
295                 if mobj.group(1) not in ids_in_page:
296                     ids_in_page.append(unescapeHTML(mobj.group(1)))
298             video_ids.extend(ids_in_page)
300             # A little optimization - if current page is not
301             # "full", ie. does not contain PAGE_SIZE video ids then
302             # we can assume that this page is the last one - there
303             # are no more ids on further pages - no need to query
306             if len(ids_in_page) < self._PAGE_SIZE:
# Hand each collected id to the BlipTV extractor via url_result entries.
311         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
312         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
313         return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com: POSTs the "Free download" form and scrapes
# the real fileshare URL (or a human-readable restriction message) from the
# response.
# NOTE(review): `.decode('utf-8')` on these values only works when they are
# bytes (Python 2 str) — this block predates the py3 port; confirm the
# surrounding compat layer before touching it.
316 class DepositFilesIE(InfoExtractor):
317     """Information extractor for depositfiles.com"""
319     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
321     def _real_extract(self, url):
322         file_id = url.split('/')[-1]
323         # Rebuild url in english locale
324         url = 'http://depositfiles.com/en/files/' + file_id
326         # Retrieve file webpage with 'Free download' button pressed
# gateway_result=1 is the form field the site sets when the free-download
# button is clicked.
327         free_download_indication = { 'gateway_result' : '1' }
328         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): the `try:` for the except below is missing from this view.
330             self.report_download_webpage(file_id)
331             webpage = compat_urllib_request.urlopen(request).read()
332         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
333             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
335         # Search for the real file URL
336         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
337         if (mobj is None) or (mobj.group(1) is None):
338             # Try to figure out reason of the error.
# The site explains refusals (rate limits etc.) in a <strong> block.
339             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
340             if (mobj is not None) and (mobj.group(1) is not None):
341                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
342                 raise ExtractorError(u'%s' % restriction_message)
344                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
346         file_url = mobj.group(1)
347         file_extension = os.path.splitext(file_url)[1][1:]
349         # Search for file title
350         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): the returned info-dict literal is partially elided here.
353             'id': file_id.decode('utf-8'),
354             'url': file_url.decode('utf-8'),
358             'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Optionally logs in (credentials from
# --username/--password or .netrc), then parses the swf.addVariable JSON
# blob embedded in the video page to get hd_src/sd_src stream URLs.
# NOTE(review): elided extract — the login_form construction and several
# guard lines (`if mobj is None:`, `else:` branches) are not visible here.
362 class FacebookIE(InfoExtractor):
363     """Information Extractor for Facebook"""
365     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
366     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
367     _NETRC_MACHINE = 'facebook'
368     IE_NAME = u'facebook'
370     def report_login(self):
371         """Report attempt to log in."""
372         self.to_screen(u'Logging in')
374     def _real_initialize(self):
375         if self._downloader is None:
380         downloader_params = self._downloader.params
382         # Attempt to use provided username and password or .netrc data
383         if downloader_params.get('username', None) is not None:
384             useremail = downloader_params['username']
385             password = downloader_params['password']
386         elif downloader_params.get('usenetrc', False):
# NOTE(review): the `try:` for the NetrcParseError handler is elided.
388                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394             except (IOError, netrc.NetrcParseError) as err:
395                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: stay anonymous (early return elided).
398         if useremail is None:
407         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
410             login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means login failed.
411             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
412                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
418     def _real_extract(self, url):
419         mobj = re.match(self._VALID_URL, url)
421             raise ExtractorError(u'Invalid URL: %s' % url)
422         video_id = mobj.group('ID')
424         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
425         webpage = self._download_webpage(url, video_id)
# The player params are a JSON array sandwiched between these two literal
# script fragments; re.escape keeps the regex safe.
427         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
428         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
429         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
431             raise ExtractorError(u'Cannot parse data')
432         data = dict(json.loads(m.group(1)))
# 'params' is itself URL-encoded JSON holding the stream descriptors.
433         params_raw = compat_urllib_parse.unquote(data['params'])
434         params = json.loads(params_raw)
435         video_data = params['video_data'][0]
# Prefer HD, fall back to SD (fallback branch partially elided below).
436         video_url = video_data.get('hd_src')
438             video_url = video_data['sd_src']
440             raise ExtractorError(u'Cannot find video URL')
441         video_duration = int(video_data['video_duration'])
442         thumbnail = video_data['thumbnail_src']
444         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# NOTE(review): the returned info-dict literal is partially elided here.
449             'title': video_title,
452             'duration': video_duration,
453             'thumbnail': thumbnail,
# Extractor for blip.tv. Handles three URL shapes: the api.swf# form (mapped
# to a /play/ URL), /play/ URLs (resolved via redirect fragment to an
# /a/a-<id> URL and re-extracted), and regular pages queried through the
# skin=json API with an iTunes User-Agent.
# NOTE(review): elided extract — the cchar selection, the direct-download
# `info` dict, and several try:/if guards are missing from this view.
458 class BlipTVIE(InfoExtractor):
459     """Information extractor for blip.tv"""
461     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the extension off a media URL.
462     _URL_EXT = r'^.*\.([a-z0-9]+)$'
465     def report_direct_download(self, title):
466         """Report information extraction."""
467         self.to_screen(u'%s: Direct download detected' % title)
469     def _real_extract(self, url):
470         mobj = re.match(self._VALID_URL, url)
472             raise ExtractorError(u'Invalid URL: %s' % url)
474         # See https://github.com/rg3/youtube-dl/issues/857
475         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
476         if api_mobj is not None:
477             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
478         urlp = compat_urllib_parse_urlparse(url)
479         if urlp.path.startswith('/play/'):
# /play/ URLs redirect; the real file id rides in the redirect fragment.
480             request = compat_urllib_request.Request(url)
481             response = compat_urllib_request.urlopen(request)
482             redirecturl = response.geturl()
483             rurlp = compat_urllib_parse_urlparse(redirecturl)
484             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
485             url = 'http://blip.tv/a/a-' + file_id
# Recurse once with the canonical /a/a- URL.
486             return self._real_extract(url)
# NOTE(review): the `cchar` ('?' vs '&') selection lines are elided here.
493         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
494         request = compat_urllib_request.Request(json_url)
# blip.tv serves a cleaner JSON payload to the iTunes client.
495         request.add_header('User-Agent', 'iTunes/10.6.1')
496         self.report_extraction(mobj.group(1))
499             urlh = compat_urllib_request.urlopen(request)
500             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
501                 basename = url.split('/')[-1]
502                 title,ext = os.path.splitext(basename)
503                 title = title.decode('UTF-8')
504                 ext = ext.replace('.', '')
505                 self.report_direct_download(title)
515         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
516             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
517         if info is None: # Regular URL
519                 json_code_bytes = urlh.read()
520                 json_code = json_code_bytes.decode('utf-8')
521             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
522                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
525                 json_data = json.loads(json_code)
# The API wraps the record in a 'Post' key in some responses.
526                 if 'Post' in json_data:
527                     data = json_data['Post']
# Timestamps arrive as e.g. '05-31-13 08:15PM'; normalize to YYYYMMDD.
531                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
532                 video_url = data['media']['url']
533                 umobj = re.match(self._URL_EXT, video_url)
535                     raise ValueError('Can not determine filename extension')
539                     'id': data['item_id'],
541                     'uploader': data['display_name'],
542                     'upload_date': upload_date,
543                     'title': data['title'],
545                     'format': data['media']['mimeType'],
546                     'thumbnail': data['thumbnailUrl'],
547                     'description': data['description'],
548                     'player_url': data['embedUrl'],
# Downloader must reuse the same UA or blip.tv may serve different media.
549                     'user_agent': 'iTunes/10.6.1',
551             except (ValueError,KeyError) as err:
552                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de. Tries a plain <source src> first; otherwise
# decrypts the flashvars-referenced XML (RC4 with an MD5-derived key built
# from a base64-encoded constant GK plus the video id) to recover RTMP or
# HTTP/HLS stream parameters.
# NOTE(review): heavily elided extract — the RC4 x/y/out initializations,
# the __md5 helper definition, the `params`/`sec` setup, and several
# if/else guards are missing from this view; confirm against upstream.
557 class MyVideoIE(InfoExtractor):
558     """Information Extractor for myvideo.de."""
560     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
563     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
564     # Released into the Public Domain by Tristan Fischer on 2013-05-19
565     # https://github.com/rg3/youtube-dl/pull/842
566     def __rc4crypt(self,data, key):
# Standard RC4: key-scheduling pass over a 256-entry box...
568         box = list(range(256))
569         for i in list(range(256)):
570             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
571             box[i], box[x] = box[x], box[i]
# ...then the PRGA keystream XORed against each input byte.
577             y = (y + box[x]) % 256
578             box[x], box[y] = box[y], box[x]
579             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
583         return hashlib.md5(s).hexdigest().encode()
585     def _real_extract(self,url):
586         mobj = re.match(self._VALID_URL, url)
588             raise ExtractorError(u'invalid URL: %s' % url)
590         video_id = mobj.group(1)
# GK: obfuscated (doubly base64-encoded) site constant used as RC4 key
# material together with the video id.
593             b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
594             b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
595             b'TnpsbA0KTVRkbU1tSTRNdz09'
599         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
600         webpage = self._download_webpage(webpage_url, video_id)
# Cheap path: a plain HTML5 <source> tag means no decryption is needed.
602         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
604             self.report_extraction(video_id)
605             video_url = mobj.group(1) + '.flv'
607             video_title = self._html_search_regex('<title>([^<]+)</title>',
610             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
617                 'title': video_title,
# Hard path: pull the flashvars blob and decrypt the player XML.
622         mobj = re.search('var flashvars={(.+?)}', webpage)
624             raise ExtractorError(u'Unable to extract video')
629         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
# '_encxml' holds the encrypted-XML endpoint; everything else is a
# pass-through query parameter.
630             if not a == '_encxml':
633                 encxml = compat_urllib_parse.unquote(b)
634         if not params.get('domain'):
635             params['domain'] = 'www.myvideo.de'
636         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is not supported; force the D player instead.
637         if 'flash_playertype=MTV' in xmldata_url:
638             self._downloader.report_warning(u'avoiding MTV player')
640                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
641                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response looks like 'enc=<hex>'; keep only the hex payload.
645         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
646         enc_data_b = binascii.unhexlify(enc_data)
648             base64.b64decode(base64.b64decode(GK)) +
650             str(video_id).encode('utf-8')
653         dec_data = self.__rc4crypt(enc_data_b, sk)
656         self.report_extraction(video_id)
# RTMP case: connectionurl present in the decrypted XML.
659         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
661             video_url = compat_urllib_parse.unquote(mobj.group(1))
662             if 'myvideo2flash' in video_url:
# rtmpe handshakes fail for these hosts; plain rtmpt works.
663                 self._downloader.report_warning(u'forcing RTMPT ...')
664                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
667             # extract non rtmp videos
668             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
670                 raise ExtractorError(u'unable to extract url')
671             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
673         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
674         video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests are converted to an m3u8 HLS playlist URL; other files
# become an rtmp play path of the form 'ext:path'.
676         if not video_file.endswith('f4m'):
677             ppath, prefix = video_file.split('.')
678             video_playpath = '%s:%s' % (prefix, ppath)
679             video_hls_playlist = ''
682             video_hls_playlist = (
683                 video_filepath + video_file
684             ).replace('.f4m', '.m3u8')
686         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
687         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
689         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
698             'title': video_title,
700             'play_path': video_playpath,
701             'video_file': video_file,
702             'video_hls_playlist': video_hls_playlist,
703             'player_url': video_swfobj,
# Extractor for The Daily Show / The Colbert Report. Resolves shortname or
# show URLs to an episode page, finds the mtvnservices media URI, downloads
# the MRSS index, then per-part fetches the mediaGen config and rewrites the
# chosen RTMP rendition into a direct llnwd.net HTTP URL.
# NOTE(review): elided extract — the _video_extensions/_video_dimensions
# dict bodies, the turls accumulation, the format-selection loop, and the
# results-list assembly are missing from this view.
707 class ComedyCentralIE(InfoExtractor):
708     """Information extractor for The Daily Show and Colbert Report """
710     # urls can be abbreviations like :thedailyshow or :colbert
711     # urls for episodes like:
712     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
713     # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
714     # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
715     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
716                       |(https?://)?(www\.)?
717                           (?P<showname>thedailyshow|colbertnation)\.com/
718                          (full-episodes/(?P<episode>.*)|
720                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
721                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest preference last... (order used when
# picking turls[-1] below, i.e. highest bitrate wins by default).
724     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
726     _video_extensions = {
734     _video_dimensions = {
# _VALID_URL is written with re.VERBOSE whitespace, so suitable() must
# pass the flag explicitly — hence the override.
744     def suitable(cls, url):
745         """Receives a URL and returns True if suitable for this IE."""
746         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
748     def _print_formats(self, formats):
749         print('Available formats:')
751             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
754     def _real_extract(self, url):
755         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
757             raise ExtractorError(u'Invalid URL: %s' % url)
# Expand :tds/:colbert shortcuts into full-episodes listing URLs and
# re-match so the named groups are populated.
759         if mobj.group('shortname'):
760             if mobj.group('shortname') in ('tds', 'thedailyshow'):
761                 url = u'http://www.thedailyshow.com/full-episodes/'
763                 url = u'http://www.colbertnation.com/full-episodes/'
764             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
765             assert mobj is not None
767         if mobj.group('clip'):
768             if mobj.group('showname') == 'thedailyshow':
769                 epTitle = mobj.group('tdstitle')
771                 epTitle = mobj.group('cntitle')
774             dlNewest = not mobj.group('episode')
776                 epTitle = mobj.group('showname')
778                 epTitle = mobj.group('episode')
780         self.report_extraction(epTitle)
# Use the handle to detect where the listing page redirected us.
781         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
783             url = htmlHandle.geturl()
784             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
786                 raise ExtractorError(u'Invalid redirected URL: ' + url)
787             if mobj.group('episode') == '':
788                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
789             epTitle = mobj.group('episode')
791         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
793         if len(mMovieParams) == 0:
794             # The Colbert Report embeds the information in a without
795             # a URL prefix; so extract the alternate reference
796             # and then add the URL prefix manually.
798             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
799             if len(altMovieParams) == 0:
800                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
802                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
804         uri = mMovieParams[0][1]
805         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
806         indexXml = self._download_webpage(indexUrl, epTitle,
807                                           u'Downloading show index',
808                                           u'unable to download episode index')
812         idoc = xml.etree.ElementTree.fromstring(indexXml)
# One <item> per episode part (act); each has its own media config.
813         itemEls = idoc.findall('.//item')
814         for partNum,itemEl in enumerate(itemEls):
815             mediaId = itemEl.findall('./guid')[0].text
816             shortMediaId = mediaId.split(':')[-1]
817             showId = mediaId.split(':')[-2].replace('.com', '')
818             officialTitle = itemEl.findall('./title')[0].text
819             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
821             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
822                         compat_urllib_parse.urlencode({'uri': mediaId}))
823             configXml = self._download_webpage(configUrl, epTitle,
824                                                u'Downloading configuration for %s' % shortMediaId)
826             cdoc = xml.etree.ElementTree.fromstring(configXml)
828             for rendition in cdoc.findall('.//rendition'):
# (bitrate, rtmp_url) pairs; accumulation into turls is elided here.
829                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
833                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
836             if self._downloader.params.get('listformats', None):
837                 self._print_formats([i[0] for i in turls])
840             # For now, just pick the highest bitrate
841             format,rtmp_video_url = turls[-1]
843             # Get the format arg from the arg stream
844             req_format = self._downloader.params.get('format', None)
846             # Select format if we can find one
849                     format, rtmp_video_url = f, v
# The RTMP path embeds a gsp.comedystor asset id that maps 1:1 onto a
# direct llnwd.net HTTP mirror — rewrite rather than speak RTMP.
852             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
854                 raise ExtractorError(u'Cannot transform RTMP url')
855             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
856             video_url = base + m.group('finalid')
858             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
863                 'upload_date': officialDate,
868                 'description': officialTitle,
# Extractor for escapistmagazine.com videos: reads meta tags for
# description/thumbnail/player URL, then fetches the player's config
# (JS masquerading as JSON) to get the playlist media URL.
# NOTE(review): elided extract — the returned info dict is partially
# missing. Also note line 901 labels the title regex u'player url',
# apparently a copy-paste slip for u'title' (only affects the error
# message); left unchanged since this view is incomplete.
875 class EscapistIE(InfoExtractor):
876     """Information extractor for The Escapist """
878     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
879     IE_NAME = u'escapist'
881     def _real_extract(self, url):
882         mobj = re.match(self._VALID_URL, url)
884             raise ExtractorError(u'Invalid URL: %s' % url)
885         showName = mobj.group('showname')
886         videoId = mobj.group('episode')
888         self.report_extraction(videoId)
889         webpage = self._download_webpage(url, videoId)
891         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
892             webpage, u'description', fatal=False)
894         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
895             webpage, u'thumbnail', fatal=False)
897         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
898             webpage, u'player url')
# Page titles look like 'Show : Episode'; keep the part after ' : '.
900         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
901             webpage, u'player url').split(' : ')[-1]
# The config URL rides URL-encoded in the player URL's query string.
903         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
904         configUrl = compat_urllib_parse.unquote(configUrl)
906         configJSON = self._download_webpage(configUrl, videoId,
907                                             u'Downloading configuration',
908                                             u'unable to download configuration')
910         # Technically, it's JavaScript, not JSON
# Single→double quote swap makes the JS object literal json-parseable.
911         configJSON = configJSON.replace("'", '"')
914             config = json.loads(configJSON)
915         except (ValueError,) as err:
916             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
918         playlist = config['playlist']
# Index 1 holds the actual media entry (index 0 is presumably an intro —
# TODO confirm against a live config).
919         videoUrl = playlist[1]['url']
924             'uploader': showName,
929             'description': videoDesc,
930             'player_url': playerUrl,
# Extractor for collegehumor.com: downloads the moogaloop metadata XML,
# then the f4m manifest it points to, and reconstructs a direct segment
# URL (.../z<id>/<node>Seg1-Frag1) from the manifest's media node.
# NOTE(review): elided extract — the `info` dict initialization, the try:
# lines for several except clauses, and the final return are not visible.
935 class CollegeHumorIE(InfoExtractor):
936     """Information extractor for collegehumor.com"""
939     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
940     IE_NAME = u'collegehumor'
942     def report_manifest(self, video_id):
943         """Report information extraction."""
944         self.to_screen(u'%s: Downloading XML manifest' % video_id)
946     def _real_extract(self, url):
947         mobj = re.match(self._VALID_URL, url)
949             raise ExtractorError(u'Invalid URL: %s' % url)
950         video_id = mobj.group('videoid')
958         self.report_extraction(video_id)
959         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
961             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
962         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
963             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
965         mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on malformed XML, caught below (the
# try: line is elided from this view).
967             videoNode = mdoc.findall('./video')[0]
968             info['description'] = videoNode.findall('./description')[0].text
969             info['title'] = videoNode.findall('./caption')[0].text
970             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
971             manifest_url = videoNode.findall('./file')[0].text
973             raise ExtractorError(u'Invalid metadata XML file')
# hdcore param is required for Adobe HDS manifest delivery.
975         manifest_url += '?hdcore=2.10.3'
976         self.report_manifest(video_id)
978             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
979         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
980             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
982         adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
984             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
985             node_id = media_node.attrib['url']
986             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
987         except IndexError as err:
988             raise ExtractorError(u'Invalid manifest file')
990         url_pr = compat_urllib_parse_urlparse(manifest_url)
# Rebuild the direct fragment URL on the manifest's host; video_id[:-2]
# drops a 2-char suffix — presumably a quality marker (TODO confirm).
991         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes flv_url, the page title, and the
# thumbnail straight out of the watch-page HTML.
# NOTE(review): elided extract — the returned info-dict literal is only
# partially visible (no opening brace / id / url keys in view).
998 class XVideosIE(InfoExtractor):
999     """Information extractor for xvideos.com"""
1001     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1002     IE_NAME = u'xvideos'
1004     def _real_extract(self, url):
1005         mobj = re.match(self._VALID_URL, url)
1007             raise ExtractorError(u'Invalid URL: %s' % url)
1008         video_id = mobj.group(1)
1010         webpage = self._download_webpage(url, video_id)
1012         self.report_extraction(video_id)
# The player receives the stream as a percent-encoded flv_url flashvar.
1015         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1016             webpage, u'video URL'))
# Titles end in ' - XVIDEOS...'; capture only the part before it.
1019         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1022         # Extract video thumbnail
1023         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1024             webpage, u'thumbnail', fatal=False)
1030             'upload_date': None,
1031             'title': video_title,
1033             'thumbnail': video_thumbnail,
1034             'description': None,
# Extractor for a single soundcloud.com track: resolves the human URL to a
# track id via the resolve.json API, then reads the stream-definitions
# endpoint for the 128kbps MP3 URL. Uses a hard-coded public client_id.
# NOTE(review): elided extract — the returned info-dict is only partially
# visible here.
1040 class SoundcloudIE(InfoExtractor):
1041     """Information extractor for soundcloud.com
1042        To access the media, the uid of the song and a stream token
1043        must be extracted from the page source and the script must make
1044        a request to media.soundcloud.com/crossdomain.xml. Then
1045        the media can be grabbed by requesting from an url composed
1046        of the stream token and uid
1049     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1050     IE_NAME = u'soundcloud'
1052     def report_resolve(self, video_id):
1053         """Report information extraction."""
1054         self.to_screen(u'%s: Resolving id' % video_id)
1056     def _real_extract(self, url):
1057         mobj = re.match(self._VALID_URL, url)
1059             raise ExtractorError(u'Invalid URL: %s' % url)
1061         # extract uploader (which is in the url)
1062         uploader = mobj.group(1)
1063         # extract simple title (uploader + slug of song title)
1064         slug_title = mobj.group(2)
1065         simple_title = uploader + u'-' + slug_title
1066         full_title = '%s/%s' % (uploader, slug_title)
1068         self.report_resolve(full_title)
# resolve.json maps the canonical page URL to the API track record.
1070         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1071         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1072         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1074         info = json.loads(info_json)
1075         video_id = info['id']
1076         self.report_extraction(full_title)
1078         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1079         stream_json = self._download_webpage(streams_url, full_title,
1080                                              u'Downloading stream definitions',
1081                                              u'unable to download stream definitions')
1083         streams = json.loads(stream_json)
# Fixed choice: the 128kbps MP3 HTTP stream.
1084         mediaURL = streams['http_mp3_128_url']
1085         upload_date = unified_strdate(info['created_at'])
1090             'uploader': info['user']['username'],
1091             'upload_date': upload_date,
1092             'title': info['title'],
1094             'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL via
# resolve.json, then fetches the 128kbps MP3 stream URL for each track in
# the set, building one info dict per track.
# NOTE(review): elided extract — the per-track info-dict literal and the
# results accumulation/return are only partially visible here.
1097 class SoundcloudSetIE(InfoExtractor):
1098     """Information extractor for soundcloud.com sets
1099        To access the media, the uid of the song and a stream token
1100        must be extracted from the page source and the script must make
1101        a request to media.soundcloud.com/crossdomain.xml. Then
1102        the media can be grabbed by requesting from an url composed
1103        of the stream token and uid
1106     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1107     IE_NAME = u'soundcloud:set'
1109     def report_resolve(self, video_id):
1110         """Report information extraction."""
1111         self.to_screen(u'%s: Resolving id' % video_id)
1113     def _real_extract(self, url):
1114         mobj = re.match(self._VALID_URL, url)
1116             raise ExtractorError(u'Invalid URL: %s' % url)
1118         # extract uploader (which is in the url)
1119         uploader = mobj.group(1)
1120         # extract simple title (uploader + slug of song title)
1121         slug_title = mobj.group(2)
1122         simple_title = uploader + u'-' + slug_title
1123         full_title = '%s/sets/%s' % (uploader, slug_title)
1125         self.report_resolve(full_title)
1127         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1128         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1129         info_json = self._download_webpage(resolv_url, full_title)
1132         info = json.loads(info_json)
# The resolve API reports failures as an 'errors' list rather than HTTP
# error codes; surface each message to the user.
1133         if 'errors' in info:
1134             for err in info['errors']:
1135                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1138         self.report_extraction(full_title)
1139         for track in info['tracks']:
1140             video_id = track['id']
1142             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1143             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1145             self.report_extraction(video_id)
1146             streams = json.loads(stream_json)
# Same fixed stream choice as SoundcloudIE: 128kbps MP3 over HTTP.
1147             mediaURL = streams['http_mp3_128_url']
1152                 'uploader': track['user']['username'],
1153                 'upload_date':  unified_strdate(track['created_at']),
1154                 'title': track['title'],
1156                 'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 'jsclassref' value into an RTMPE stream path."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds it base64-encoded in 'jsclassref'.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id/extension from the last path component of the stream URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open means the mirror is alive.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead mirror; try the next one.
                pass
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch cloudcast JSON, pick a working mirror for the requested format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one yields a live mirror.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page, or the site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Each linked VideoPage is re-dispatched through this extractor.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page: enumerate every course and recurse.
            info = {
                'id': 'Stanford OpenClassroom',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Read meta tags from the page, then fetch the mediaGen XML for renditions."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # NOTE(review): 'mtv_an' is the artist name meta tag; the original listing
        # referenced an undefined `performer` in the info dict below — bind it here.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id: millisecond timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive Youku's character-shuffle table from the numeric seed (their PRNG)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step, then pick-and-remove from the alphabet.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and emit one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # All three values are URL-encoded parameters embedded in the page source.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Scrape flv URL, title and thumbnail straight out of the watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Two-step extraction: post page for metadata, then the photo/video page for links."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com — the mp4 URL is built from the page path."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is deterministic: path component + fixed suffix.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page; skip clips with an empty video_file_url."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # Error responses are dicts carrying an 'error' message.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Handle the three URL shapes: channel archive (paged), chapter, single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we've reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Two title candidates: the player heading, falling back to <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose-regex mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of every trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age-gated game: re-fetch through the age-check URL with a fixed DOB.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order, so zip pairs them up correctly.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv lives at a deterministic CDN path derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container from the URL itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is assigned to window.gon in an inline script.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Parse the embedded Video(...) JSON, then enumerate the download list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site serves the full page only with the age-verification cookie set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken from the URL slug, not the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
# Extractor for youjizz.com: resolves the embed page, then reads the file URL from
# a Flash `so.addVariable` call.
# NOTE(review): stale line-number column and elided lines (numbering gaps); code
# kept byte-identical, comments only.
2175 class YouJizzIE(InfoExtractor):
2176 """Information extractor for youjizz.com."""
2177 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
2179 def _real_extract(self, url):
2180 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2182 raise ExtractorError(u'Invalid URL: %s' % url)
2184 video_id = mobj.group('videoid')
2186 # Get webpage content
2187 webpage = self._download_webpage(url, video_id)
2189 # Get the video title
2190 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
2191 webpage, u'title').strip()
2193 # Get the embed page
2194 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
# Raise reached only when the embed URL was not found (guard line elided above).
2196 raise ExtractorError(u'ERROR: unable to extract embed page')
2198 embed_page_url = result.group(0).strip()
# video_id is rebound to the numeric id from the embed URL.
2199 video_id = result.group('videoid')
# Second fetch: the real file URL lives on the embed page, not the landing page.
2201 webpage = self._download_webpage(embed_page_url, video_id)
2204 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
2205 webpage, u'video URL')
# Result dict is partial — url/ext keys are on elided lines.
2207 info = {'id': video_id,
2209 'title': video_title,
2212 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: walks the playlist API track by track until
# `at_last_track` is set.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only. `mix_id` is used below but never assigned in the visible lines —
# presumably set from `data` on an elided line; verify against the full source.
2216 class EightTracksIE(InfoExtractor):
2218 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2220 def _real_extract(self, url):
2221 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2223 raise ExtractorError(u'Invalid URL: %s' % url)
2224 playlist_id = mobj.group('id')
2226 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as a `PAGE.mix = {...};` JS assignment.
2228 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2229 data = json.loads(json_like)
# Random session token sent as the `sets` id in the play/next API URLs.
2231 session = str(random.randint(0, 1000000000))
2233 track_count = data['tracks_count']
2234 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2235 next_url = first_url
# Unbounded counter: the loop is terminated by the `at_last_track` flag below
# (break statement elided).
2237 for i in itertools.count():
2238 api_json = self._download_webpage(next_url, playlist_id,
2239 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2240 errnote=u'Failed to download song information')
2241 api_data = json.loads(api_json)
2242 track_data = api_data[u'set']['track']
# Per-track info dict fragment — its opening and the append to the result list
# are on elided lines.
2244 'id': track_data['id'],
2245 'url': track_data['track_file_stream_url'],
2246 'title': track_data['performer'] + u' - ' + track_data['name'],
2247 'raw_title': track_data['name'],
2248 'uploader_id': data['user']['login'],
2252 if api_data['set']['at_last_track']:
# Advance to the next track using the previous track's id.
2254 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are derived directly from the
# video id on cdn.keek.com; title/uploader are scraped from the page.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2257 class KeekIE(InfoExtractor):
2258 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2261 def _real_extract(self, url):
2262 m = re.match(self._VALID_URL, url)
2263 video_id = m.group('videoID')
# CDN URLs are constructed from the id alone — no page parsing needed for them.
2265 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2266 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2267 webpage = self._download_webpage(url, video_id)
# Continuation line(s) of this call are elided (numbering gap after 2269).
2269 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2272 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2273 webpage, u'uploader', fatal=False)
# Result dict fragment — its opening `return [{` and url/ext keys are elided.
2279 'title': video_title,
2280 'thumbnail': thumbnail,
2281 'uploader': uploader
# Extractor for ted.com: handles both single talks and playlists. The verbose
# _VALID_URL distinguishes them via the `type_playlist` / `type_talk` groups.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2285 class TEDIE(InfoExtractor):
2286 _VALID_URL=r'''http://www\.ted\.com/
2288 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2290 ((?P<type_talk>talks)) # We have a simple talk
2292 (/lang/(.*?))? # The url may contain the language
2293 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the default suitable(): the pattern above needs re.VERBOSE.
2297 def suitable(cls, url):
2298 """Receives a URL and returns True if suitable for this IE."""
2299 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2301 def _real_extract(self, url):
2302 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Dispatch: single talk vs. playlist, decided by which alternative matched.
2303 if m.group('type_talk'):
2304 return [self._talk_info(url)]
2306 playlist_id=m.group('playlist_id')
2307 name=m.group('name')
2308 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2309 return [self._playlist_videos_info(url,name,playlist_id)]
2311 def _playlist_videos_info(self,url,name,playlist_id=0):
2312 '''Returns the videos of the playlist'''
# Multi-line verbose regex for one playlist entry (its opening line is elided).
2314 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2315 ([.\s]*?)data-playlist_item_id="(\d+)"
2316 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2318 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2319 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2320 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2321 m_names=re.finditer(video_name_RE,webpage)
2323 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2324 webpage, 'playlist title')
# Each entry is delegated back to this extractor ('TED') as a url_result.
2326 playlist_entries = []
2327 for m_video, m_name in zip(m_videos,m_names):
2328 video_id=m_video.group('video_id')
2329 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2330 playlist_entries.append(self.url_result(talk_url, 'TED'))
2331 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2333 def _talk_info(self, url, video_id=0):
2334 """Return the video for the talk in the url"""
2335 m = re.match(self._VALID_URL, url,re.VERBOSE)
2336 video_name = m.group('name')
2337 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2338 self.report_extraction(video_name)
2339 # If the url includes the language we get the title translated
# Continuation line of this call is elided (gap after 2340).
2340 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
# Talk metadata is embedded as a `var talkDetails = {...}` script blob.
2342 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2343 webpage, 'json data')
2344 info = json.loads(json_data)
2345 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2346 webpage, 'description', flags = re.DOTALL)
2348 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2349 webpage, 'thumbnail')
# Result dict fragment — opening and id/title keys are elided. [-1] picks the
# last stream variant listed in htmlStreams.
2352 'url': info['htmlStreams'][-1]['file'],
2355 'thumbnail': thumbnail,
2356 'description': desc,
# Extractor for myspass.de: all metadata comes from a server-side XML endpoint
# keyed by the video id taken from the URL path.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2360 class MySpassIE(InfoExtractor):
2361 _VALID_URL = r'http://www.myspass.de/.*'
2363 def _real_extract(self, url):
2364 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2366 # video id is the last path element of the URL
2367 # usually there is a trailing slash, so also try the second but last
2368 url_path = compat_urllib_parse_urlparse(url).path
2369 url_parent_path, video_id = os.path.split(url_path)
# Fallback for trailing-slash URLs (its guarding condition is elided).
2371 _, video_id = os.path.split(url_parent_path)
2374 metadata_url = META_DATA_URL_TEMPLATE % video_id
2375 metadata_text = self._download_webpage(metadata_url, video_id)
# ElementTree needs bytes here; the downloaded page is a text (unicode) string.
2376 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2378 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are optional.
2379 url_flv_el = metadata.find('url_flv')
2380 if url_flv_el is None:
2381 raise ExtractorError(u'Unable to extract download url')
2382 video_url = url_flv_el.text
2383 extension = os.path.splitext(video_url)[1][1:]
2384 title_el = metadata.find('title')
2385 if title_el is None:
2386 raise ExtractorError(u'Unable to extract title')
2387 title = title_el.text
2388 format_id_el = metadata.find('format_id')
# The None-branch body for format_id is elided (gap 2390-2391).
2389 if format_id_el is None:
2392 format = format_id_el.text
2393 description_el = metadata.find('description')
2394 if description_el is not None:
2395 description = description_el.text
2398 imagePreview_el = metadata.find('imagePreview')
2399 if imagePreview_el is not None:
2400 thumbnail = imagePreview_el.text
# Result dict fragment — opening and id/url/title keys are elided.
2409 'thumbnail': thumbnail,
2410 'description': description
# Extractor for spiegel.de videos: the stream list lives in a per-video XML file
# on video2.spiegel.de; the last <type> entry is used.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2414 class SpiegelIE(InfoExtractor):
2415 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2417 def _real_extract(self, url):
2418 m = re.match(self._VALID_URL, url)
2419 video_id = m.group('videoID')
2421 webpage = self._download_webpage(url, video_id)
# Continuation line of this call is elided (gap after 2423).
2423 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2426 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2427 xml_code = self._download_webpage(xml_url, video_id,
2428 note=u'Downloading XML', errnote=u'Failed to download XML')
2430 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: pick the last format/type element in the XML document.
2431 last_type = idoc[-1]
2432 filename = last_type.findall('./filename')[0].text
2433 duration = float(last_type.findall('./duration')[0].text)
2435 video_url = 'http://video2.spiegel.de/flash/' + filename
2436 video_ext = filename.rpartition('.')[2]
# Result dict fragment — opening and id/url/ext keys are elided.
2441 'title': video_title,
2442 'duration': duration,
# Extractor for liveleak.com: file URL from a JS `file: "..."` assignment,
# title/description from Open Graph meta tags.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2446 class LiveLeakIE(InfoExtractor):
2448 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2449 IE_NAME = u'liveleak'
2451 def _real_extract(self, url):
2452 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2454 raise ExtractorError(u'Invalid URL: %s' % url)
2456 video_id = mobj.group('video_id')
2458 webpage = self._download_webpage(url, video_id)
2460 video_url = self._search_regex(r'file: "(.*?)",',
2461 webpage, u'video URL')
# The og:title carries a "LiveLeak.com -" prefix that is stripped off.
2463 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2464 webpage, u'title').replace('LiveLeak.com -', '').strip()
2466 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2467 webpage, u'description', fatal=False)
2469 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2470 webpage, u'uploader', fatal=False)
# Result dict fragment — opening and id/url/ext keys are elided.
2476 'title': video_title,
2477 'description': video_description,
2478 'uploader': video_uploader
# Extractor for tumblr.com video posts: the media URL is embedded hex-escaped
# (\x22 quotes) in inline JS on the canonical post page.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2485 class TumblrIE(InfoExtractor):
2486 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2488 def _real_extract(self, url):
2489 m_url = re.match(self._VALID_URL, url)
2490 video_id = m_url.group('id')
2491 blog = m_url.group('blog_name')
# Rebuild the canonical /post/ URL regardless of which form was given.
2493 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2494 webpage = self._download_webpage(url, video_id)
# \\x22 in the pattern matches the literal backslash-escaped quote in the page JS.
2496 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2497 video = re.search(re_video, webpage)
# Raise reached only when no embedded video matched (guard line elided above).
2499 raise ExtractorError(u'Unable to extract video')
2500 video_url = video.group('video_url')
2501 ext = video.group('ext')
2503 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2504 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the JS backslash escapes from the captured thumbnail URL.
2505 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2507 # The only place where you can get a title, it's not complete,
2508 # but searching in other places doesn't work for all videos
2509 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2510 webpage, u'title', flags=re.DOTALL)
# Result dict — url/ext lines are elided between id and title.
2512 return [{'id': video_id,
2514 'title': video_title,
2515 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: follows the free-download page, then
# rebuilds the statdownload URL to obtain a working (non-expired) track URL.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only. `id` shadows the builtin — preserved as-is.
2519 class BandcampIE(InfoExtractor):
2520 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
2524 title = mobj.group('title')
2525 webpage = self._download_webpage(url, title)
2526 # We get the link to the free download page
2527 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2528 if m_download is None:
# Only tracks offered as free downloads are supported.
2529 raise ExtractorError(u'No free songs found')
2531 download_link = m_download.group(1)
# Track id scraped from the inline `TralbumData` JS object.
2532 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2533 webpage, re.MULTILINE|re.DOTALL).group('id')
2535 download_webpage = self._download_webpage(download_link, id,
2536 'Downloading free downloads page')
2537 # We get the dictionary of the track from some javascrip code
2538 info = re.search(r'items: (.*?),$',
2539 download_webpage, re.MULTILINE).group(1)
2540 info = json.loads(info)[0]
2541 # We pick mp3-320 for now, until format selection can be easily implemented.
2542 mp3_info = info[u'downloads'][u'mp3-320']
2543 # If we try to use this url it says the link has expired
2544 initial_url = mp3_info[u'url']
# Tear the expired URL apart to recover server/fsig/ts for the rebuilt request.
2545 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2546 m_url = re.match(re_url, initial_url)
2547 #We build the url we will use to get the final track url
2548 # This url is build in Bandcamp in the script download_bunde_*.js
2549 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2550 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2551 # If we could correctly generate the .rand field the url would be
2552 #in the "download_url" key
2553 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# Result dict fragment — url/ext keys are on elided lines.
2555 track_info = {'id':id,
2556 'title' : info[u'title'],
2559 'thumbnail' : info[u'thumb_url'],
2560 'uploader' : info[u'artist']
# Extractor for redtube.com: mp4 source URL and title scraped straight from the
# watch page.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2565 class RedTubeIE(InfoExtractor):
2566 """Information Extractor for redtube"""
2567 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2569 def _real_extract(self,url):
2570 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2572 raise ExtractorError(u'Invalid URL: %s' % url)
2574 video_id = mobj.group('id')
# Extension is fixed: the page serves a <source type="video/mp4"> element.
2575 video_extension = 'mp4'
2576 webpage = self._download_webpage(url, video_id)
2578 self.report_extraction(video_id)
2580 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2581 webpage, u'video URL')
# Continuation line of this call is elided (gap after 2583).
2583 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Result dict fragment — opening and id/url keys are elided.
2589 'ext': video_extension,
2590 'title': video_title,
# Extractor for ina.fr: metadata comes from the player's MRSS feed rather than
# the HTML page itself.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2593 class InaIE(InfoExtractor):
2594 """Information Extractor for Ina.fr"""
2595 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2597 def _real_extract(self,url):
2598 mobj = re.match(self._VALID_URL, url)
2600 video_id = mobj.group('id')
# The MRSS feed for the notice is fetched instead of the watch page.
2601 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2602 video_extension = 'mp4'
2603 webpage = self._download_webpage(mrss_url, video_id)
2605 self.report_extraction(video_id)
2607 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2608 webpage, u'video URL')
# Title is wrapped in CDATA inside the feed; continuation line elided after 2610.
2610 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Result dict fragment — opening and id/url keys are elided.
2616 'ext': video_extension,
2617 'title': video_title,
# Extractor for howcast.com: mobile mp4 URL from inline JS, metadata from
# meta tags (note the site's attribute order: content before property/name).
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2620 class HowcastIE(InfoExtractor):
2621 """Information Extractor for Howcast.com"""
2622 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2624 def _real_extract(self, url):
2625 mobj = re.match(self._VALID_URL, url)
2627 video_id = mobj.group('id')
# Canonical watch URL is rebuilt from the id.
2628 webpage_url = 'http://www.howcast.com/videos/' + video_id
2629 webpage = self._download_webpage(webpage_url, video_id)
2631 self.report_extraction(video_id)
2633 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2634 webpage, u'video URL')
# Continuation line of this call is elided (gap after 2636).
2636 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
2639 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2640 webpage, u'description', fatal=False)
2642 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2643 webpage, u'thumbnail', fatal=False)
# Result dict fragment — opening and id/url/ext keys are elided.
2649 'title': video_title,
2650 'description': video_description,
2651 'thumbnail': thumbnail,
# Extractor for vine.co: stream URL from the twitter:player:stream meta tag,
# remaining metadata from OG tags and the user block.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2654 class VineIE(InfoExtractor):
2655 """Information Extractor for Vine.co"""
2656 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2658 def _real_extract(self, url):
2659 mobj = re.match(self._VALID_URL, url)
2661 video_id = mobj.group('id')
# Canonical https URL rebuilt from the id.
2662 webpage_url = 'https://vine.co/v/' + video_id
2663 webpage = self._download_webpage(webpage_url, video_id)
2665 self.report_extraction(video_id)
2667 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2668 webpage, u'video URL')
# Continuation line of this call is elided (gap after 2670).
2670 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string, if any, is excluded from the captured thumbnail URL.
2673 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2674 webpage, u'thumbnail', fatal=False)
2676 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2677 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Result dict fragment — opening and id/url/ext keys are elided.
2683 'title': video_title,
2684 'thumbnail': thumbnail,
2685 'uploader': uploader,
# Extractor for Flickr videos: two-step XML API dance — first fetch the node id
# using the page's photo_secret, then fetch the playlist XML carrying the
# STREAM APP/FULLPATH pair that forms the final URL.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2688 class FlickrIE(InfoExtractor):
2689 """Information Extractor for Flickr videos"""
2690 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2692 def _real_extract(self, url):
2693 mobj = re.match(self._VALID_URL, url)
2695 video_id = mobj.group('id')
2696 video_uploader_id = mobj.group('uploader_id')
2697 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2698 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo secret is required by both XML endpoints below.
2700 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2702 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2703 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2705 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2706 first_xml, u'node_id')
2708 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2709 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2711 self.report_extraction(video_id)
2713 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# Raise reached only when the STREAM element was missing (guard line elided).
2715 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
2716 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2718 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2719 webpage, u'video title')
2721 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2722 webpage, u'description', fatal=False)
2724 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2725 webpage, u'thumbnail', fatal=False)
# Result dict fragment — opening and id/url/ext keys are elided.
2731 'title': video_title,
2732 'description': video_description,
2733 'thumbnail': thumbnail,
2734 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: the numeric id is scraped from the article markup,
# then the cvp XML endpoint supplies the high-quality file URL.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2737 class TeamcocoIE(InfoExtractor):
2738 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2740 def _real_extract(self, url):
2741 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2743 raise ExtractorError(u'Invalid URL: %s' % url)
2744 url_title = mobj.group('url_title')
2745 webpage = self._download_webpage(url, url_title)
2747 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2748 webpage, u'video id')
2750 self.report_extraction(video_id)
# Continuation line of this call is elided (gap after 2752).
2752 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2755 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2756 webpage, u'thumbnail', fatal=False)
2758 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2759 webpage, u'description', fatal=False)
# Second fetch: per-video XML document holding the actual file URLs.
2761 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2762 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Continuation line(s) of this call are elided (gap after 2764).
2764 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Result dict fragment — opening and id/url/ext keys are elided.
2771 'title': video_title,
2772 'thumbnail': thumbnail,
2773 'description': video_description,
# Extractor for xhamster.com: server/file pair from inline player JS; the file
# URL is either direct (empty server) or server + '/key=' + file.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2776 class XHamsterIE(InfoExtractor):
2777 """Information Extractor for xHamster"""
2778 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2780 def _real_extract(self,url):
2781 mobj = re.match(self._VALID_URL, url)
2783 video_id = mobj.group('id')
# Canonical movie URL rebuilt from the id before fetching.
2784 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2785 webpage = self._download_webpage(mrss_url, video_id)
2787 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# Raise reached only when srv/file pair was not found (guard line elided above).
2789 raise ExtractorError(u'Unable to extract media URL')
2790 if len(mobj.group('server')) == 0:
# Empty server: the file value is already a full (percent-encoded) URL.
2791 video_url = compat_urllib_parse.unquote(mobj.group('file'))
# else-branch (server present): join server and file with '/key=' (the `else:`
# line itself is elided).
2793 video_url = mobj.group('server')+'/key='+mobj.group('file')
2794 video_extension = video_url.split('.')[-1]
# Continuation line of this call is elided (gap after 2796).
2796 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2799 # Can't see the description anywhere in the UI
2800 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2801 # webpage, u'description', fatal=False)
2802 # if video_description: video_description = unescapeHTML(video_description)
# Upload date parsed from a tooltip hint attribute; optional.
2804 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2806 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2808 video_upload_date = None
2809 self._downloader.report_warning(u'Unable to extract upload date')
2811 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2812 webpage, u'uploader id', default=u'anonymous')
2814 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2815 webpage, u'thumbnail', fatal=False)
# Result dict fragment — opening and id/url keys are elided.
2820 'ext': video_extension,
2821 'title': video_title,
2822 # 'description': video_description,
2823 'upload_date': video_upload_date,
2824 'uploader_id': video_uploader_id,
2825 'thumbnail': video_thumbnail
# Extractor for hypem.com tracks: scrapes the displayList JSON, then calls the
# serve/source API, forwarding the page's Set-Cookie header.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only. `key` is used below but its assignment (presumably from `track`)
# is on an elided line — verify against the full source.
2828 class HypemIE(InfoExtractor):
2829 """Information Extractor for hypem"""
2830 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2832 def _real_extract(self, url):
2833 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2835 raise ExtractorError(u'Invalid URL: %s' % url)
2836 track_id = mobj.group(1)
# ax/ts query parameters appended to the request, as the site's JS does.
2838 data = { 'ax': 1, 'ts': time.time() }
2839 data_encoded = compat_urllib_parse.urlencode(data)
2840 complete_url = url + "?" + data_encoded
2841 request = compat_urllib_request.Request(complete_url)
# The response handle is kept so the session cookie can be replayed below.
2842 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
2843 cookie = urlh.headers.get('Set-Cookie', '')
2845 self.report_extraction(track_id)
2847 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2848 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# This json.loads is inside a try whose opening is elided; the except body
# is the raise two lines down.
2850 track_list = json.loads(html_tracks)
2851 track = track_list[u'tracks'][0]
2853 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2856 track_id = track[u"id"]
2857 artist = track[u"artist"]
2858 title = track[u"song"]
2860 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# Empty POST body ("") makes this a POST request; cookie replays the session.
2861 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2862 request.add_header('cookie', cookie)
2863 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
# Same elided try/except pattern around this json.loads.
2865 song_data = json.loads(song_data_json)
2867 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2868 final_url = song_data[u"url"]
# Extractor for vbox7.com: follows the JS window.location redirect, then POSTs
# to the magare.do endpoint to get the final media and thumbnail URLs.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2878 class Vbox7IE(InfoExtractor):
2879 """Information Extractor for Vbox7"""
2880 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2882 def _real_extract(self,url):
2883 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2885 raise ExtractorError(u'Invalid URL: %s' % url)
2886 video_id = mobj.group(1)
# The play page redirects via JS, not HTTP — the target is scraped from the body
# and resolved against the URL we actually landed on.
2888 redirect_page, urlh = self._download_webpage_handle(url, video_id)
2889 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2890 redirect_url = urlh.geturl() + new_location
2891 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the part of <title> before the first '/'.
2893 title = self._html_search_regex(r'<title>(.*)</title>',
2894 webpage, u'title').split('/')[0].strip()
2897 info_url = "http://vbox7.com/play/magare.do"
# Form-encoded POST body; the endpoint answers with 'k=v&k=v' pairs.
2898 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2899 info_request = compat_urllib_request.Request(info_url, data)
2900 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2901 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2902 if info_response is None:
2903 raise ExtractorError(u'Unable to extract the media url')
# Split the two '&'-separated pairs and keep only the values after '='.
2904 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Result dict fragment — opening and id/url/ext/title keys are elided.
2911 'thumbnail': thumbnail_url,
# Factory returning one instance of every supported extractor, in matching
# priority order (first match wins).
# NOTE(review): the body of the returned list is almost entirely elided here
# (numbering gaps); only a few entries are visible. Code kept byte-identical,
# comments only.
2915 def gen_extractors():
2916 """ Return a list of an instance of every supported extractor.
2917 The order does matter; the first extractor matched is the one handling the URL.
2920 YoutubePlaylistIE(),
2945 StanfordOpenClassroomIE(),
2955 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    Extractor classes in this module follow the ``<Name>IE`` naming
    convention, so the lookup simply appends the suffix and resolves the
    name in this module's globals.

    Raises KeyError if no class named ``<ie_name>IE`` exists here.
    Note: the original excerpt carried a stale line-number column
    ("2985 " etc.) that made the block syntactically invalid; it has been
    stripped and the indentation restored.
    """
    return globals()[ie_name + 'IE']