2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.metacafe import MetacafeIE
30 from .extractor.statigram import StatigramIE
31 from .extractor.photobucket import PhotobucketIE
32 from .extractor.vimeo import VimeoIE
33 from .extractor.yahoo import YahooIE
34 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
35 from .extractor.zdf import ZDFIE
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor matches: follows URL-shortener
    redirects, then scrapes the page for common embedded-player patterns
    (JW Player flashvars, twitter:player:stream cards, Open Graph video tags).
    NOTE(review): source listing was whitespace-mangled with lines missing;
    gap lines restored from context — verify against upstream history.
    """

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that this is the fallback IE."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so no body is transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: the redirected HEAD has no body.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build a bare opener with just the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the gdata v2 JSON-C API, 50 results per request, until
    `n` results (or the API-reported total) are collected.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # FIX: idiomatic membership test (was `if not 'items' in ...`).
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # FIX: loop variable renamed from `id`, which shadowed the builtin.
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes google.com/search?tbm=vid result pages, 10 results each,
    until `n` entries are gathered or no next-page link remains.
    NOTE(review): gap lines restored from context — verify against upstream.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1),
                }
                res['entries'].append(e)

            # Stop when enough pages were fetched or no "next" button exists.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Fetches video.search.yahoo.com JSON result pages (30 per page) and
    yields url_result entries pointing at the Yahoo extractor.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # FIX: `i` was only bound inside the for loop, so an empty results
            # page made the trailing condition raise NameError. Track the count
            # explicitly so empty pages terminate cleanly.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric user id from the profile page, then pages through
    the mobile Ajax episode list (12 ids per page) and returns a playlist.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # FIX: a missing users-id previously crashed with an opaque
        # AttributeError on mobj.group; fail with a clear extractor error.
        if mobj is None:
            raise ExtractorError(u'Unable to extract blip.tv user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    POSTs the 'Free download' form and scrapes the real file URL, or
    surfaces the site's restriction message on failure.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex ('\s+' relied on a deprecated
                # non-raw escape); behavior identical, no DeprecationWarning.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the .decode('utf-8') calls assume Python-2 byte
        # strings; they fail on Python 3 str — confirm target interpreter.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials from options or .netrc), then parses the
    swf param JSON embedded in the video page for hd/sd stream URLs.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In',
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            # FIX: corrected "exceded" -> "exceeded" in the warning text.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The stream parameters live between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles /play/ and api.swf URLs by resolving them to canonical pages,
    then queries the JSON skin of the page (iTunes UA required) — unless the
    server answers with the video bytes directly, in which case it reports a
    direct download.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page carrying the file id in the
            # fragment; recurse on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        cchar = '&' if '?' in url else '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'):  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None:  # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in 'Post' or bare.
                data = json_data['Post'] if 'Post' in json_data else json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Either grabs a plain <source src=...> URL, or decrypts the RC4-encrypted
    player XML (key derived from the base64-wrapped GK constant and the video
    id) to recover RTMP/HTTP stream parameters.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt `data` (bytes) with `key`, returning a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA).
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation + XOR.
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Hex MD5 digest of `s`, returned as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded key material used to derive the RC4 key.
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source> tag on the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': video_ext,
            }]

        # Otherwise decode the encrypted player configuration.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Fetch and decrypt the player XML.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): video_filepath is not defined in the visible
            # source; restored from the decrypted data — confirm upstream.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'filepath')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Accepts shortnames (:tds, :colbert, ...), full-episode URLs, and clip
    URLs; resolves the mtvnservices media URI, downloads the MRSS index and
    per-item configuration, and rewrites the RTMP rendition into an HTTP URL.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand :tds / :colbert style abbreviations.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The bare show URL redirects to the newest episode; re-parse it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) rendition URL into a plain HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum + 1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads meta tags from the video page, unquotes the player's `config=`
    URL, and parses the (almost-JSON) configuration for the stream URL.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # FIX: this regex extracts the title, but its error label was a
        # copy-paste of u'player url'; corrected so failures report "title".
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }
        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Downloads the moogaloop metadata XML, follows the F4M manifest it points
    at, and assembles the Seg1-Frag1 fragment URL.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The manifest is in the Adobe F4M namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes the flv_url flash variable, the page title, and the thumbnail
    from the watch page.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title (strip the trailing " - XVIDEOS..." suffix)
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves the track via the public resolve.json API, then fetches the
    stream definitions and returns the 128 kbit/s MP3 stream.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    # Same as SoundcloudIE but with a mandatory /sets/ path segment.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink to its JSON metadata (tracks list).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        # The resolve endpoint reports failures as an 'errors' list rather
        # than an HTTP error; surface each message to the downloader.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # One stream-definition request per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # Per-track result dict (literal partially elided in this view).
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # No short id in the URL; the full URL doubles as the video id.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in a JS variable on the page.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the
        # RTMP URL, e.g. ".../12345.mp4" -> ('12345', 'mp4').
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Result dict (surrounding literal partially elided in this view).
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        # Fall back to the highest available bitrate when the requested one
        # is missing or 'best' was asked for.
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probe each candidate with an actual request; first one that
        # opens without error wins.
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # Human-readable listing used by --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a matched str fails under
        # Python 3 (str has no decode); extractor is already disabled
        # via _WORKING = False above.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): try each format until one yields a
        # live URL; otherwise honour the explicitly requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
        if req_format not in formats:
            raise ExtractorError(u'Format is not available')
        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # Result dict (surrounding literal partially elided in this view).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: site root, a course page (course= only), or a
    # specific video (course= and video=).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # Partial info dict; filled in from the per-video XML below.
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            # Each video has a sibling .xml metadata file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                   note='Downloading course info page',
                                   errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect unique video-page links and recurse into each as a
            # 'reference' entry.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
        # Root page: enumerate all course pages the same way.
        'id': 'Stanford OpenClassroom',
        'upload_date': None,

        self.report_download_webpage(info['id'])
        rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
        rootpage = compat_urllib_request.urlopen(rootURL).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

        info['title'] = info['id']

        links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
        'type': 'reference',
        'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

        for entry in info['list']:
            assert entry['type'] == 'reference'
            results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; normalise to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format string encodes container, resolution and bitrate, e.g.
        # "mp4-640x360_800".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        raise ExtractorError('Invalid rendition field.')

        # Result dict (surrounding literal partially elided in this view).
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # Session id: millisecond timestamp followed by two random components,
    # mimicking the site player's own sid generation.
    nowTime = int(time.time() * 1000)
    random1 = random.randint(1000,1998)
    random2 = random.randint(1000,9999)

    return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic pseudo-random shuffle of the alphabet, seeded by the
        # server-provided seed; used to decode obfuscated file ids.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            # Linear-congruential step; floor-index into the shrinking pool.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Map each '*'-separated digit through the mixed alphabet to
        # recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Quality selection: prefer hd2 for 'best'; 'worst' branch elided here.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Patch the segment number (hex, two digits) into the file id
            # and build one download URL per segment.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # Per-segment info dict (literal partially elided in this view).
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    # Page-scraping patterns, kept as class attributes for reuse/testing.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded in the page source.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        # Result dict (surrounding literal partially elided in this view).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)

        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # Result dict (surrounding literal partially elided in this view).
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # Media URL is predictable from the path; no page scraping needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Last path component serves as the short id.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        # Result dict (surrounding literal partially elided in this view).
        'id': shortened_video_id,
        # 'uploader_date': uploader_date,
        'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel, a broadcast (/b/<id>), or a chapter
    # (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))

    # API pagination size for channel archives.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # The API signals errors with a dict instead of the expected list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # 'start_time' begins with YYYY-MM-DD; strip dashes -> YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # Per-clip info dict (literal partially elided in this view).
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the parent broadcast's archive id.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                             note=u'Downloading chapter information',
                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching our archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
            raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer chapter metadata comes from the newer Kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            # Chapter result dict (literal partially elided in this view).
            'id': u'c' + chapter_id,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        # Broadcast (/b/<id>) branch.
        video_id = mobj.group('videoid')
        api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the archive API until a short page signals the end.
        limit = self._JUSTIN_PAGE_LIMIT
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url, video_id)
        info.extend(page_info)
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Second <source> inside the <video> tag carries the media URL.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        # Result dict (surrounding literal partially elided in this view).
        'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video/app pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Pre-filled birth date query string to bypass the age gate.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode comments, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-check URL when the gate page is served.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Three parallel scans of the page: movie entries, their display
        # titles and their thumbnails, zipped positionally.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
            # Per-video dict (literal partially elided in this view).
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media URL is derived directly from the id; page is only scraped
        # for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # Result dict (surrounding literal partially elided in this view).
        'title': video_title,
        'uploader': uploader,
        'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player is configured via so.addVariable("file", ...).
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        # Candy pages carry the title in a dedicated span instead.
        _title = r"""candytitles.*>(.*)</span>"""
        mobj = re.search(_title, webpage_src)
        if mobj is not None:
            video_title = mobj.group(1)

        # Result dict (surrounding literal partially elided in this view).
        'title' : video_title,
        'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment `gon.show = {...};`.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append a fixed 256 kbps constant-bitrate parameter to the stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        # Result dict (surrounding literal partially elided in this view);
        # optional fields use .get() so missing keys degrade to None.
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Linear search for the entry whose 'format' matches the request.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Age gate is bypassed with a cookie rather than a form post.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        params = json.loads(json_params)
        raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        video_title = params['title']
        upload_date = unified_strdate(params['release_date_f'])
        video_description = params['description']
        video_uploader = params['submitted_by']
        thumbnail = params['thumbnails'][0]['image']
        raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Resolution and bitrate are encoded in the 5th path segment,
        # e.g. "480p_370k_8004515" -> ['480p', '370k'].
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        # title = u'%s-%s-%s' % (video_title, size, bitrate)

        # Per-format dict (literal partially elided in this view).
        'uploader': video_uploader,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': thumbnail,
        'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are ordered best-first, so 'worst' is the last entry.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        format = self._specific( req_format, formats )
        raise ExtractorError(u'Requested format not available')
# Extracts the .flv stream URL and upload date from a pornotube.com page.
# NOTE(review): several original lines are elided in this view (e.g. the
# "if mobj is None:" guard before the Invalid URL raise, and part of the
# returned info dict); visible code is kept verbatim.
2181 class PornotubeIE(InfoExtractor):
2182     """Information extractor for pornotube.com."""
2183     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
2185     def _real_extract(self, url):
2186         mobj = re.match(self._VALID_URL, url)
2188             raise ExtractorError(u'Invalid URL: %s' % url)
2190         video_id = mobj.group('videoid')
2191         video_title = mobj.group('title')
2193         # Get webpage content
2194         webpage = self._download_webpage(url, video_id)
2197         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
2198         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
2199         video_url = compat_urllib_parse.unquote(video_url)
2201         #Get the uploaded date
2202         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
2203         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
2204         if upload_date: upload_date = unified_strdate(upload_date)
2206         info = {'id': video_id,
2209                 'upload_date': upload_date,
2210                 'title': video_title,
# Resolves a youjizz.com video page to its embed page, then extracts the
# real media URL from the embed player's addVariable("file", ...) call.
# NOTE(review): lines are elided in this view (e.g. the "if result is None:"
# guard before the embed-page raise); visible code is kept verbatim.
2216 class YouJizzIE(InfoExtractor):
2217     """Information extractor for youjizz.com."""
2218     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
2220     def _real_extract(self, url):
2221         mobj = re.match(self._VALID_URL, url)
2223             raise ExtractorError(u'Invalid URL: %s' % url)
2225         video_id = mobj.group('videoid')
2227         # Get webpage content
2228         webpage = self._download_webpage(url, video_id)
2230         # Get the video title
2231         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
2232             webpage, u'title').strip()
2234         # Get the embed page
2235         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
2237             raise ExtractorError(u'ERROR: unable to extract embed page')
2239         embed_page_url = result.group(0).strip()
2240         video_id = result.group('videoid')
2242         webpage = self._download_webpage(embed_page_url, video_id)
2245         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
2246             webpage, u'video URL')
2248         info = {'id': video_id,
2250                 'title': video_title,
2253                 'player_url': embed_page_url}
# Extracts every track of an 8tracks.com mix by paging through the
# sets/play + sets/next JSON API with a random session id until the API
# reports the last track.
# NOTE(review): lines are elided in this view — in particular the
# assignment of `mix_id` (presumably from `data`, e.g. data['id']) and
# the construction/accumulation of the per-track dicts are not visible;
# verify against the full file. Visible code is kept verbatim.
2257 class EightTracksIE(InfoExtractor):
2259     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2261     def _real_extract(self, url):
2262         mobj = re.match(self._VALID_URL, url)
2264             raise ExtractorError(u'Invalid URL: %s' % url)
2265         playlist_id = mobj.group('id')
2267         webpage = self._download_webpage(url, playlist_id)
2269         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2270         data = json.loads(json_like)
2272         session = str(random.randint(0, 1000000000))
2274         track_count = data['tracks_count']
2275         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2276         next_url = first_url
2278         for i in itertools.count():
2279             api_json = self._download_webpage(next_url, playlist_id,
2280                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2281                 errnote=u'Failed to download song information')
2282             api_data = json.loads(api_json)
2283             track_data = api_data[u'set']['track']
2285                 'id': track_data['id'],
2286                 'url': track_data['track_file_stream_url'],
2287                 'title': track_data['performer'] + u' - ' + track_data['name'],
2288                 'raw_title': track_data['name'],
2289                 'uploader_id': data['user']['login'],
2293             if api_data['set']['at_last_track']:
2295             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# keek.com: the media and thumbnail URLs are derived directly from the
# video id (CDN URL templates); only title/uploader come from the page.
# NOTE(review): lines are elided in this view (e.g. the opening of the
# returned info dict); visible code is kept verbatim.
2298 class KeekIE(InfoExtractor):
2299     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2302     def _real_extract(self, url):
2303         m = re.match(self._VALID_URL, url)
2304         video_id = m.group('videoID')
2306         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2307         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2308         webpage = self._download_webpage(url, video_id)
2310         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2313         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2314             webpage, u'uploader', fatal=False)
2320             'title': video_title,
2321             'thumbnail': thumbnail,
2322             'uploader': uploader
# ted.com extractor: handles both single talks and playlists. The verbose
# _VALID_URL distinguishes them via the type_talk/type_playlist groups, so
# suitable() is overridden to compile with re.VERBOSE.
# NOTE(review): lines are elided in this view (parts of the URL pattern,
# the video_RE prefix, and parts of the returned talk-info dict);
# visible code is kept verbatim.
2326 class TEDIE(InfoExtractor):
2327     _VALID_URL=r'''http://www\.ted\.com/
2329         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2331         ((?P<type_talk>talks)) # We have a simple talk
2333         (/lang/(.*?))? # The url may contain the language
2334         /(?P<name>\w+) # Here goes the name and then ".html"
2338     def suitable(cls, url):
2339         """Receives a URL and returns True if suitable for this IE."""
2340         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2342     def _real_extract(self, url):
2343         m=re.match(self._VALID_URL, url, re.VERBOSE)
2344         if m.group('type_talk'):
2345             return [self._talk_info(url)]
2347             playlist_id=m.group('playlist_id')
2348             name=m.group('name')
2349             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2350             return [self._playlist_videos_info(url,name,playlist_id)]
2352     def _playlist_videos_info(self,url,name,playlist_id=0):
2353         '''Returns the videos of the playlist'''
2355             <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2356             ([.\s]*?)data-playlist_item_id="(\d+)"
2357             ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2359         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2360         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2361         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2362         m_names=re.finditer(video_name_RE,webpage)
2364         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2365                                                  webpage, 'playlist title')
2367         playlist_entries = []
2368         for m_video, m_name in zip(m_videos,m_names):
2369             video_id=m_video.group('video_id')
2370             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2371             playlist_entries.append(self.url_result(talk_url, 'TED'))
2372         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2374     def _talk_info(self, url, video_id=0):
2375         """Return the video for the talk in the url"""
2376         m = re.match(self._VALID_URL, url,re.VERBOSE)
2377         video_name = m.group('name')
2378         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2379         self.report_extraction(video_name)
2380         # If the url includes the language we get the title translated
2381         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
2383         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2384             webpage, 'json data')
2385         info = json.loads(json_data)
2386         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2387             webpage, 'description', flags = re.DOTALL)
2389         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2390             webpage, 'thumbnail')
2393             'url': info['htmlStreams'][-1]['file'],
2396             'thumbnail': thumbnail,
2397             'description': desc,
# myspass.de: the video id is the last (or second-to-last, when the URL
# has a trailing slash) path element; metadata comes from an XML endpoint
# parsed with xml.etree.ElementTree.
# NOTE(review): lines are elided in this view (the trailing-slash check,
# defaults for format/description/thumbnail, and the opening of the
# returned info dict); visible code is kept verbatim.
2401 class MySpassIE(InfoExtractor):
2402     _VALID_URL = r'http://www.myspass.de/.*'
2404     def _real_extract(self, url):
2405         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2407         # video id is the last path element of the URL
2408         # usually there is a trailing slash, so also try the second but last
2409         url_path = compat_urllib_parse_urlparse(url).path
2410         url_parent_path, video_id = os.path.split(url_path)
2412             _, video_id = os.path.split(url_parent_path)
2415         metadata_url = META_DATA_URL_TEMPLATE % video_id
2416         metadata_text = self._download_webpage(metadata_url, video_id)
2417         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2419         # extract values from metadata
2420         url_flv_el = metadata.find('url_flv')
2421         if url_flv_el is None:
2422             raise ExtractorError(u'Unable to extract download url')
2423         video_url = url_flv_el.text
2424         extension = os.path.splitext(video_url)[1][1:]
2425         title_el = metadata.find('title')
2426         if title_el is None:
2427             raise ExtractorError(u'Unable to extract title')
2428         title = title_el.text
2429         format_id_el = metadata.find('format_id')
2430         if format_id_el is None:
2433             format = format_id_el.text
2434         description_el = metadata.find('description')
2435         if description_el is not None:
2436             description = description_el.text
2439         imagePreview_el = metadata.find('imagePreview')
2440         if imagePreview_el is not None:
2441             thumbnail = imagePreview_el.text
2450             'thumbnail': thumbnail,
2451             'description': description
# spiegel.de: title comes from the page; media URL and duration come from
# a per-video XML document; the last <type> node in the XML is used
# (presumably the best quality — TODO confirm).
# NOTE(review): lines are elided in this view (e.g. the opening of the
# returned info dict); visible code is kept verbatim.
2455 class SpiegelIE(InfoExtractor):
2456     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2458     def _real_extract(self, url):
2459         m = re.match(self._VALID_URL, url)
2460         video_id = m.group('videoID')
2462         webpage = self._download_webpage(url, video_id)
2464         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2467         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2468         xml_code = self._download_webpage(xml_url, video_id,
2469                     note=u'Downloading XML', errnote=u'Failed to download XML')
2471         idoc = xml.etree.ElementTree.fromstring(xml_code)
2472         last_type = idoc[-1]
2473         filename = last_type.findall('./filename')[0].text
2474         duration = float(last_type.findall('./duration')[0].text)
2476         video_url = 'http://video2.spiegel.de/flash/' + filename
2477         video_ext = filename.rpartition('.')[2]
2482             'title': video_title,
2483             'duration': duration,
# liveleak.com: media URL comes from the player config ('file: "..."');
# title/description/uploader are scraped from og: meta tags and markup.
# NOTE(review): lines are elided in this view (e.g. the "if mobj is None:"
# guard and the opening of the returned info dict); visible code verbatim.
2487 class LiveLeakIE(InfoExtractor):
2489     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2490     IE_NAME = u'liveleak'
2492     def _real_extract(self, url):
2493         mobj = re.match(self._VALID_URL, url)
2495             raise ExtractorError(u'Invalid URL: %s' % url)
2497         video_id = mobj.group('video_id')
2499         webpage = self._download_webpage(url, video_id)
2501         video_url = self._search_regex(r'file: "(.*?)",',
2502             webpage, u'video URL')
2504         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2505             webpage, u'title').replace('LiveLeak.com -', '').strip()
2507         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2508             webpage, u'description', fatal=False)
2510         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2511             webpage, u'uploader', fatal=False)
2517             'title': video_title,
2518             'description': video_description,
2519             'uploader': video_uploader
# tumblr.com video posts: the canonical post URL is rebuilt from the blog
# name and post id, then the media URL/extension are pulled out of the
# escaped (\x22-quoted) embedded player markup.
# NOTE(review): lines are elided in this view (e.g. the "if video is None:"
# guard and the tail of the returned dict); visible code is kept verbatim.
2526 class TumblrIE(InfoExtractor):
2527     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2529     def _real_extract(self, url):
2530         m_url = re.match(self._VALID_URL, url)
2531         video_id = m_url.group('id')
2532         blog = m_url.group('blog_name')
2534         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2535         webpage = self._download_webpage(url, video_id)
2537         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2538         video = re.search(re_video, webpage)
2540             raise ExtractorError(u'Unable to extract video')
2541         video_url = video.group('video_url')
2542         ext = video.group('ext')
2544         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2545             webpage, u'thumbnail', fatal=False)  # We pick the first poster
2546         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2548         # The only place where you can get a title, it's not complete,
2549         # but searching in other places doesn't work for all videos
2550         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2551             webpage, u'title', flags=re.DOTALL)
2553         return [{'id': video_id,
2555                  'title': video_title,
2556                  'thumbnail': video_thumbnail,
# bandcamp.com: only free tracks are supported. Flow: track page ->
# freeDownloadPage link -> items JSON on the download page -> rebuild the
# statdownload URL (the direct mp3-320 url expires) -> read retry_url.
# NOTE(review): the local variable `id` shadows the builtin id() — works,
# but worth renaming when this block is next touched. Some lines are
# elided in this view (e.g. the tail of track_info); code kept verbatim.
2560 class BandcampIE(InfoExtractor):
2561     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2563     def _real_extract(self, url):
2564         mobj = re.match(self._VALID_URL, url)
2565         title = mobj.group('title')
2566         webpage = self._download_webpage(url, title)
2567         # We get the link to the free download page
2568         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2569         if m_download is None:
2570             raise ExtractorError(u'No free songs found')
2572         download_link = m_download.group(1)
2573         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2574                        webpage, re.MULTILINE|re.DOTALL).group('id')
2576         download_webpage = self._download_webpage(download_link, id,
2577                                                   'Downloading free downloads page')
2578         # We get the dictionary of the track from some javascrip code
2579         info = re.search(r'items: (.*?),$',
2580                          download_webpage, re.MULTILINE).group(1)
2581         info = json.loads(info)[0]
2582         # We pick mp3-320 for now, until format selection can be easily implemented.
2583         mp3_info = info[u'downloads'][u'mp3-320']
2584         # If we try to use this url it says the link has expired
2585         initial_url = mp3_info[u'url']
2586         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2587         m_url = re.match(re_url, initial_url)
2588         #We build the url we will use to get the final track url
2589         # This url is build in Bandcamp in the script download_bunde_*.js
2590         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2591         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2592         # If we could correctly generate the .rand field the url would be
2593         #in the "download_url" key
2594         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
2596         track_info = {'id':id,
2597                       'title' : info[u'title'],
2600                       'thumbnail' : info[u'thumb_url'],
2601                       'uploader' : info[u'artist']
# redtube.com: straightforward scrape of the <source> mp4 URL and the
# page title; extension is hard-coded to mp4.
# NOTE(review): lines are elided in this view (e.g. the "if mobj is None:"
# guard and the opening of the returned dict); visible code verbatim.
2606 class RedTubeIE(InfoExtractor):
2607     """Information Extractor for redtube"""
2608     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2610     def _real_extract(self,url):
2611         mobj = re.match(self._VALID_URL, url)
2613             raise ExtractorError(u'Invalid URL: %s' % url)
2615         video_id = mobj.group('id')
2616         video_extension = 'mp4'
2617         webpage = self._download_webpage(url, video_id)
2619         self.report_extraction(video_id)
2621         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2622             webpage, u'video URL')
2624         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
2630             'ext':      video_extension,
2631             'title':    video_title,
# ina.fr: downloads the video's MRSS feed from player.ina.fr and reads the
# mp4 URL from <media:player> and the title from the CDATA <title>.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2634 class InaIE(InfoExtractor):
2635     """Information Extractor for Ina.fr"""
2636     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2638     def _real_extract(self,url):
2639         mobj = re.match(self._VALID_URL, url)
2641         video_id = mobj.group('id')
2642         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2643         video_extension = 'mp4'
2644         webpage = self._download_webpage(mrss_url, video_id)
2646         self.report_extraction(video_id)
2648         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2649             webpage, u'video URL')
2651         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
2657             'ext':      video_extension,
2658             'title':    video_title,
# howcast.com: canonical page URL is rebuilt from the id; media URL comes
# from the player config ('file': "http://mobile-media..."), title /
# description / thumbnail from meta tags.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2661 class HowcastIE(InfoExtractor):
2662     """Information Extractor for Howcast.com"""
2663     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2665     def _real_extract(self, url):
2666         mobj = re.match(self._VALID_URL, url)
2668         video_id = mobj.group('id')
2669         webpage_url = 'http://www.howcast.com/videos/' + video_id
2670         webpage = self._download_webpage(webpage_url, video_id)
2672         self.report_extraction(video_id)
2674         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2675             webpage, u'video URL')
2677         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
2680         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2681             webpage, u'description', fatal=False)
2683         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2684             webpage, u'thumbnail', fatal=False)
2690             'title': video_title,
2691             'description': video_description,
2692             'thumbnail': thumbnail,
# vine.co: media URL from the twitter:player:stream meta tag; title,
# thumbnail and uploader from og: tags / page markup.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2695 class VineIE(InfoExtractor):
2696     """Information Extractor for Vine.co"""
2697     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2699     def _real_extract(self, url):
2700         mobj = re.match(self._VALID_URL, url)
2702         video_id = mobj.group('id')
2703         webpage_url = 'https://vine.co/v/' + video_id
2704         webpage = self._download_webpage(webpage_url, video_id)
2706         self.report_extraction(video_id)
2708         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2709             webpage, u'video URL')
2711         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2714         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2715             webpage, u'thumbnail', fatal=False)
2717         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2718             webpage, u'uploader', fatal=False, flags=re.DOTALL)
2724             'title': video_title,
2725             'thumbnail': thumbnail,
2726             'uploader': uploader,
# flickr.com videos: two-step XML dance — fetch the photo secret from the
# page, use it to get a node_id from video_mtl_xml.gne, then resolve the
# playlist XML for the STREAM APP/FULLPATH pair that forms the media URL.
# NOTE(review): lines are elided in this view (e.g. the "if mobj is None:"
# guard before the 'Unable to extract video url' raise); code verbatim.
2729 class FlickrIE(InfoExtractor):
2730     """Information Extractor for Flickr videos"""
2731     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2733     def _real_extract(self, url):
2734         mobj = re.match(self._VALID_URL, url)
2736         video_id = mobj.group('id')
2737         video_uploader_id = mobj.group('uploader_id')
2738         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2739         webpage = self._download_webpage(webpage_url, video_id)
2741         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2743         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2744         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2746         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2747             first_xml, u'node_id')
2749         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2750         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2752         self.report_extraction(video_id)
2754         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
2756             raise ExtractorError(u'Unable to extract video url')
2757         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2759         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2760             webpage, u'video title')
2762         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2763             webpage, u'description', fatal=False)
2765         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2766             webpage, u'thumbnail', fatal=False)
2772             'title': video_title,
2773             'description': video_description,
2774             'thumbnail': thumbnail,
2775             'uploader_id': video_uploader_id,
# teamcoco.com: the numeric video id is scraped from the article markup,
# then a per-video XML document (cvp/2.0/<id>.xml) supplies the
# high-quality file URL.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2778 class TeamcocoIE(InfoExtractor):
2779     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2781     def _real_extract(self, url):
2782         mobj = re.match(self._VALID_URL, url)
2784             raise ExtractorError(u'Invalid URL: %s' % url)
2785         url_title = mobj.group('url_title')
2786         webpage = self._download_webpage(url, url_title)
2788         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2789             webpage, u'video id')
2791         self.report_extraction(video_id)
2793         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2796         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2797             webpage, u'thumbnail', fatal=False)
2799         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2800             webpage, u'description', fatal=False)
2802         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2803         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
2805         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
2812             'title': video_title,
2813             'thumbnail': thumbnail,
2814             'description': video_description,
# xhamster.com: media URL comes from the player config ('srv'/'file'
# pair); when 'srv' is empty the 'file' value is already a full
# percent-encoded URL, otherwise server + '/key=' + file is assembled.
# Upload date is reassembled as YYYYMMDD from the 'hint' attribute.
# NOTE(review): lines are elided in this view (guards, else branches,
# the opening of the returned dict); visible code is kept verbatim.
2817 class XHamsterIE(InfoExtractor):
2818     """Information Extractor for xHamster"""
2819     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2821     def _real_extract(self,url):
2822         mobj = re.match(self._VALID_URL, url)
2824         video_id = mobj.group('id')
2825         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2826         webpage = self._download_webpage(mrss_url, video_id)
2828         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
2830             raise ExtractorError(u'Unable to extract media URL')
2831         if len(mobj.group('server')) == 0:
2832             video_url = compat_urllib_parse.unquote(mobj.group('file'))
2834             video_url = mobj.group('server')+'/key='+mobj.group('file')
2835         video_extension = video_url.split('.')[-1]
2837         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2840         # Can't see the description anywhere in the UI
2841         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2842         #     webpage, u'description', fatal=False)
2843         # if video_description: video_description = unescapeHTML(video_description)
2845         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2847             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2849             video_upload_date = None
2850             self._downloader.report_warning(u'Unable to extract upload date')
2852         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2853             webpage, u'uploader id', default=u'anonymous')
2855         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2856             webpage, u'thumbnail', fatal=False)
2861             'ext':      video_extension,
2862             'title':    video_title,
2863             # 'description': video_description,
2864             'upload_date': video_upload_date,
2865             'uploader_id': video_uploader_id,
2866             'thumbnail': video_thumbnail
# hypem.com: requests the track page with ax/ts query params, keeps the
# Set-Cookie header, parses the displayList-data JSON for the first
# track, then calls the /serve/source endpoint (with the cookie) whose
# JSON response carries the final stream URL.
# NOTE(review): lines are elided in this view — notably the assignment of
# `key` (presumably from the track dict) and the try/except bodies around
# both json.loads calls; visible code is kept verbatim.
2869 class HypemIE(InfoExtractor):
2870     """Information Extractor for hypem"""
2871     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2873     def _real_extract(self, url):
2874         mobj = re.match(self._VALID_URL, url)
2876             raise ExtractorError(u'Invalid URL: %s' % url)
2877         track_id = mobj.group(1)
2879         data = { 'ax': 1, 'ts': time.time() }
2880         data_encoded = compat_urllib_parse.urlencode(data)
2881         complete_url = url + "?" + data_encoded
2882         request = compat_urllib_request.Request(complete_url)
2883         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
2884         cookie = urlh.headers.get('Set-Cookie', '')
2886         self.report_extraction(track_id)
2888         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2889             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
2891             track_list = json.loads(html_tracks)
2892             track = track_list[u'tracks'][0]
2894             raise ExtractorError(u'Hypemachine contained invalid JSON.')
2897         track_id = track[u"id"]
2898         artist = track[u"artist"]
2899         title = track[u"song"]
2901         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
2902         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2903         request.add_header('cookie', cookie)
2904         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
2906             song_data = json.loads(song_data_json)
2908             raise ExtractorError(u'Hypemachine contained invalid JSON.')
2909         final_url = song_data[u"url"]
# vbox7.com: follows the JS window.location redirect on the play page,
# scrapes the title, then POSTs to /play/magare.do; the response is a
# two-field querystring whose values are the media and thumbnail URLs.
# NOTE(review): lines are elided in this view (e.g. the opening of the
# returned info dict); visible code is kept verbatim. The split('=')[1]
# parse assumes exactly two '&'-separated key=value pairs — fragile.
2919 class Vbox7IE(InfoExtractor):
2920     """Information Extractor for Vbox7"""
2921     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2923     def _real_extract(self,url):
2924         mobj = re.match(self._VALID_URL, url)
2926             raise ExtractorError(u'Invalid URL: %s' % url)
2927         video_id = mobj.group(1)
2929         redirect_page, urlh = self._download_webpage_handle(url, video_id)
2930         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2931         redirect_url = urlh.geturl() + new_location
2932         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
2934         title = self._html_search_regex(r'<title>(.*)</title>',
2935             webpage, u'title').split('/')[0].strip()
2938         info_url = "http://vbox7.com/play/magare.do"
2939         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2940         info_request = compat_urllib_request.Request(info_url, data)
2941         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2942         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2943         if info_response is None:
2944             raise ExtractorError(u'Unable to extract the media url')
2945         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
2952             'thumbnail': thumbnail_url,
2955 class GametrailersIE(InfoExtractor):
2956 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
2958 def _real_extract(self, url):
2959 mobj = re.match(self._VALID_URL, url)
2961 raise ExtractorError(u'Invalid URL: %s' % url)
2962 video_id = mobj.group('id')
2963 video_type = mobj.group('type')
2964 webpage = self._download_webpage(url, video_id)
2965 if video_type == 'full-episodes':
2966 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
2968 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
2969 mgid = self._search_regex(mgid_re, webpage, u'mgid')
2970 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
2972 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
2973 video_id, u'Downloading video info')
2974 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
2975 video_id, u'Downloading video urls info')
2977 self.report_extraction(video_id)
2978 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
2979 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
2981 <url>(?P<thumb>.*?)</url>.*
2984 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
2986 raise ExtractorError(u'Unable to extract video info')
2987 video_title = m_info.group('title')
2988 video_description = m_info.group('description')
2989 video_thumb = m_info.group('thumb')
2991 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
2992 if m_urls is None or len(m_urls) == 0:
2993 raise ExtractError(u'Unable to extrat video url')
2994 # They are sorted from worst to best quality
2995 video_url = m_urls[-1].group('url')
2997 return {'url': video_url,
2999 'title': video_title,
3000 # Videos are actually flv not mp4
3002 'thumbnail': video_thumb,
3003 'description': video_description,
# Builds the ordered list of extractor instances; ordering matters because
# the first extractor whose suitable() matches handles the URL.
# NOTE(review): the vast majority of the returned list is elided in this
# view — only a few entries are visible; visible code is kept verbatim.
3006 def gen_extractors():
3007     """ Return a list of an instance of every supported extractor.
3008     The order does matter; the first extractor matched is the one handling the URL.
3011         YoutubePlaylistIE(),
3036         StanfordOpenClassroomIE(),
3046         WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live
    # at module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]