_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.dailymotion import DailymotionIE
  26 from .extractor.depositfiles import DepositFilesIE
  27 from .extractor.facebook import FacebookIE
  28 from .extractor.gametrailers import GametrailersIE
  29 from .extractor.generic import GenericIE
  30 from .extractor.googleplus import GooglePlusIE
  31 from .extractor.googlesearch import GoogleSearchIE
  32 from .extractor.metacafe import MetacafeIE
  33 from .extractor.myvideo import MyVideoIE
  34 from .extractor.statigram import StatigramIE
  35 from .extractor.photobucket import PhotobucketIE
  36 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  37 from .extractor.vimeo import VimeoIE
  38 from .extractor.yahoo import YahooIE, YahooSearchIE
  39 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  40 from .extractor.zdf import ZDFIE
  41
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69 class EscapistIE(InfoExtractor):
  70     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
  71
  72     def _real_extract(self, url):
  73         mobj = re.match(self._VALID_URL, url)
  74         if mobj is None:
  75             raise ExtractorError(u'Invalid URL: %s' % url)
  76         showName = mobj.group('showname')
  77         videoId = mobj.group('episode')
  78
  79         self.report_extraction(videoId)
  80         webpage = self._download_webpage(url, videoId)
  81
  82         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
  83             webpage, u'description', fatal=False)
  84
  85         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
  86             webpage, u'thumbnail', fatal=False)
  87
  88         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
  89             webpage, u'player url')
  90
  91         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
  92             webpage, u'player url').split(' : ')[-1]
  93
  94         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
  95         configUrl = compat_urllib_parse.unquote(configUrl)
  96
  97         configJSON = self._download_webpage(configUrl, videoId,
  98                                             u'Downloading configuration',
  99                                             u'unable to download configuration')
 100
 101         # Technically, it's JavaScript, not JSON
 102         configJSON = configJSON.replace("'", '"')
 103
 104         try:
 105             config = json.loads(configJSON)
 106         except (ValueError,) as err:
 107             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
 108
 109         playlist = config['playlist']
 110         videoUrl = playlist[1]['url']
 111
 112         info = {
 113             'id': videoId,
 114             'url': videoUrl,
 115             'uploader': showName,
 116             'upload_date': None,
 117             'title': title,
 118             'ext': 'mp4',
 119             'thumbnail': imgUrl,
 120             'description': videoDesc,
 121             'player_url': playerUrl,
 122         }
 123
 124         return [info]
 125
 126 class CollegeHumorIE(InfoExtractor):
 127     """Information extractor for collegehumor.com"""
 128
 129     _WORKING = False
 130     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
 131     IE_NAME = u'collegehumor'
 132
 133     def report_manifest(self, video_id):
 134         """Report information extraction."""
 135         self.to_screen(u'%s: Downloading XML manifest' % video_id)
 136
 137     def _real_extract(self, url):
 138         mobj = re.match(self._VALID_URL, url)
 139         if mobj is None:
 140             raise ExtractorError(u'Invalid URL: %s' % url)
 141         video_id = mobj.group('videoid')
 142
 143         info = {
 144             'id': video_id,
 145             'uploader': None,
 146             'upload_date': None,
 147         }
 148
 149         self.report_extraction(video_id)
 150         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
 151         try:
 152             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
 153         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 154             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 155
 156         mdoc = xml.etree.ElementTree.fromstring(metaXml)
 157         try:
 158             videoNode = mdoc.findall('./video')[0]
 159             info['description'] = videoNode.findall('./description')[0].text
 160             info['title'] = videoNode.findall('./caption')[0].text
 161             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
 162             manifest_url = videoNode.findall('./file')[0].text
 163         except IndexError:
 164             raise ExtractorError(u'Invalid metadata XML file')
 165
 166         manifest_url += '?hdcore=2.10.3'
 167         self.report_manifest(video_id)
 168         try:
 169             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
 170         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 171             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 172
 173         adoc = xml.etree.ElementTree.fromstring(manifestXml)
 174         try:
 175             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
 176             node_id = media_node.attrib['url']
 177             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
 178         except IndexError as err:
 179             raise ExtractorError(u'Invalid manifest file')
 180
 181         url_pr = compat_urllib_parse_urlparse(manifest_url)
 182         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
 183
 184         info['url'] = url
 185         info['ext'] = 'f4f'
 186         return [info]
 187
 188
 189 class XVideosIE(InfoExtractor):
 190     """Information extractor for xvideos.com"""
 191
 192     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
 193     IE_NAME = u'xvideos'
 194
 195     def _real_extract(self, url):
 196         mobj = re.match(self._VALID_URL, url)
 197         if mobj is None:
 198             raise ExtractorError(u'Invalid URL: %s' % url)
 199         video_id = mobj.group(1)
 200
 201         webpage = self._download_webpage(url, video_id)
 202
 203         self.report_extraction(video_id)
 204
 205         # Extract video URL
 206         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
 207             webpage, u'video URL'))
 208
 209         # Extract title
 210         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
 211             webpage, u'title')
 212
 213         # Extract video thumbnail
 214         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
 215             webpage, u'thumbnail', fatal=False)
 216
 217         info = {
 218             'id': video_id,
 219             'url': video_url,
 220             'uploader': None,
 221             'upload_date': None,
 222             'title': video_title,
 223             'ext': 'flv',
 224             'thumbnail': video_thumbnail,
 225             'description': None,
 226         }
 227
 228         return [info]
 229
 230
 231
 232
 233 class InfoQIE(InfoExtractor):
 234     """Information extractor for infoq.com"""
 235     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
 236
 237     def _real_extract(self, url):
 238         mobj = re.match(self._VALID_URL, url)
 239         if mobj is None:
 240             raise ExtractorError(u'Invalid URL: %s' % url)
 241
 242         webpage = self._download_webpage(url, video_id=url)
 243         self.report_extraction(url)
 244
 245         # Extract video URL
 246         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
 247         if mobj is None:
 248             raise ExtractorError(u'Unable to extract video url')
 249         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
 250         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
 251
 252         # Extract title
 253         video_title = self._search_regex(r'contentTitle = "(.*?)";',
 254             webpage, u'title')
 255
 256         # Extract description
 257         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
 258             webpage, u'description', fatal=False)
 259
 260         video_filename = video_url.split('/')[-1]
 261         video_id, extension = video_filename.split('.')
 262
 263         info = {
 264             'id': video_id,
 265             'url': video_url,
 266             'uploader': None,
 267             'upload_date': None,
 268             'title': video_title,
 269             'ext': extension, # Extension is always(?) mp4, but seems to be flv
 270             'thumbnail': None,
 271             'description': video_description,
 272         }
 273
 274         return [info]
 275
 276 class MixcloudIE(InfoExtractor):
 277     """Information extractor for www.mixcloud.com"""
 278
 279     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
 280     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
 281     IE_NAME = u'mixcloud'
 282
 283     def report_download_json(self, file_id):
 284         """Report JSON download."""
 285         self.to_screen(u'Downloading json')
 286
 287     def get_urls(self, jsonData, fmt, bitrate='best'):
 288         """Get urls from 'audio_formats' section in json"""
 289         file_url = None
 290         try:
 291             bitrate_list = jsonData[fmt]
 292             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
 293                 bitrate = max(bitrate_list) # select highest
 294
 295             url_list = jsonData[fmt][bitrate]
 296         except TypeError: # we have no bitrate info.
 297             url_list = jsonData[fmt]
 298         return url_list
 299
 300     def check_urls(self, url_list):
 301         """Returns 1st active url from list"""
 302         for url in url_list:
 303             try:
 304                 compat_urllib_request.urlopen(url)
 305                 return url
 306             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 307                 url = None
 308
 309         return None
 310
 311     def _print_formats(self, formats):
 312         print('Available formats:')
 313         for fmt in formats.keys():
 314             for b in formats[fmt]:
 315                 try:
 316                     ext = formats[fmt][b][0]
 317                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
 318                 except TypeError: # we have no bitrate info
 319                     ext = formats[fmt][0]
 320                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
 321                     break
 322
 323     def _real_extract(self, url):
 324         mobj = re.match(self._VALID_URL, url)
 325         if mobj is None:
 326             raise ExtractorError(u'Invalid URL: %s' % url)
 327         # extract uploader & filename from url
 328         uploader = mobj.group(1).decode('utf-8')
 329         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
 330
 331         # construct API request
 332         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
 333         # retrieve .json file with links to files
 334         request = compat_urllib_request.Request(file_url)
 335         try:
 336             self.report_download_json(file_url)
 337             jsonData = compat_urllib_request.urlopen(request).read()
 338         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 339             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
 340
 341         # parse JSON
 342         json_data = json.loads(jsonData)
 343         player_url = json_data['player_swf_url']
 344         formats = dict(json_data['audio_formats'])
 345
 346         req_format = self._downloader.params.get('format', None)
 347         bitrate = None
 348
 349         if self._downloader.params.get('listformats', None):
 350             self._print_formats(formats)
 351             return
 352
 353         if req_format is None or req_format == 'best':
 354             for format_param in formats.keys():
 355                 url_list = self.get_urls(formats, format_param)
 356                 # check urls
 357                 file_url = self.check_urls(url_list)
 358                 if file_url is not None:
 359                     break # got it!
 360         else:
 361             if req_format not in formats:
 362                 raise ExtractorError(u'Format is not available')
 363
 364             url_list = self.get_urls(formats, req_format)
 365             file_url = self.check_urls(url_list)
 366             format_param = req_format
 367
 368         return [{
 369             'id': file_id.decode('utf-8'),
 370             'url': file_url.decode('utf-8'),
 371             'uploader': uploader.decode('utf-8'),
 372             'upload_date': None,
 373             'title': json_data['name'],
 374             'ext': file_url.split('.')[-1].decode('utf-8'),
 375             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
 376             'thumbnail': json_data['thumbnail_url'],
 377             'description': json_data['description'],
 378             'player_url': player_url.decode('utf-8'),
 379         }]
 380
 381 class StanfordOpenClassroomIE(InfoExtractor):
 382     """Information extractor for Stanford's Open ClassRoom"""
 383
 384     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
 385     IE_NAME = u'stanfordoc'
 386
 387     def _real_extract(self, url):
 388         mobj = re.match(self._VALID_URL, url)
 389         if mobj is None:
 390             raise ExtractorError(u'Invalid URL: %s' % url)
 391
 392         if mobj.group('course') and mobj.group('video'): # A specific video
 393             course = mobj.group('course')
 394             video = mobj.group('video')
 395             info = {
 396                 'id': course + '_' + video,
 397                 'uploader': None,
 398                 'upload_date': None,
 399             }
 400
 401             self.report_extraction(info['id'])
 402             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
 403             xmlUrl = baseUrl + video + '.xml'
 404             try:
 405                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
 406             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 407                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 408             mdoc = xml.etree.ElementTree.fromstring(metaXml)
 409             try:
 410                 info['title'] = mdoc.findall('./title')[0].text
 411                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
 412             except IndexError:
 413                 raise ExtractorError(u'Invalid metadata XML file')
 414             info['ext'] = info['url'].rpartition('.')[2]
 415             return [info]
 416         elif mobj.group('course'): # A course page
 417             course = mobj.group('course')
 418             info = {
 419                 'id': course,
 420                 'type': 'playlist',
 421                 'uploader': None,
 422                 'upload_date': None,
 423             }
 424
 425             coursepage = self._download_webpage(url, info['id'],
 426                                         note='Downloading course info page',
 427                                         errnote='Unable to download course info page')
 428
 429             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
 430
 431             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
 432                 coursepage, u'description', fatal=False)
 433
 434             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
 435             info['list'] = [
 436                 {
 437                     'type': 'reference',
 438                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
 439                 }
 440                     for vpage in links]
 441             results = []
 442             for entry in info['list']:
 443                 assert entry['type'] == 'reference'
 444                 results += self.extract(entry['url'])
 445             return results
 446         else: # Root page
 447             info = {
 448                 'id': 'Stanford OpenClassroom',
 449                 'type': 'playlist',
 450                 'uploader': None,
 451                 'upload_date': None,
 452             }
 453
 454             self.report_download_webpage(info['id'])
 455             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
 456             try:
 457                 rootpage = compat_urllib_request.urlopen(rootURL).read()
 458             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 459                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
 460
 461             info['title'] = info['id']
 462
 463             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
 464             info['list'] = [
 465                 {
 466                     'type': 'reference',
 467                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
 468                 }
 469                     for cpage in links]
 470
 471             results = []
 472             for entry in info['list']:
 473                 assert entry['type'] == 'reference'
 474                 results += self.extract(entry['url'])
 475             return results
 476
 477 class MTVIE(InfoExtractor):
 478     """Information extractor for MTV.com"""
 479
 480     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
 481     IE_NAME = u'mtv'
 482
 483     def _real_extract(self, url):
 484         mobj = re.match(self._VALID_URL, url)
 485         if mobj is None:
 486             raise ExtractorError(u'Invalid URL: %s' % url)
 487         if not mobj.group('proto'):
 488             url = 'http://' + url
 489         video_id = mobj.group('videoid')
 490
 491         webpage = self._download_webpage(url, video_id)
 492
 493         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
 494             webpage, u'song name', fatal=False)
 495
 496         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
 497             webpage, u'title')
 498
 499         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
 500             webpage, u'mtvn_uri', fatal=False)
 501
 502         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
 503             webpage, u'content id', fatal=False)
 504
 505         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
 506         self.report_extraction(video_id)
 507         request = compat_urllib_request.Request(videogen_url)
 508         try:
 509             metadataXml = compat_urllib_request.urlopen(request).read()
 510         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 511             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
 512
 513         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
 514         renditions = mdoc.findall('.//rendition')
 515
 516         # For now, always pick the highest quality.
 517         rendition = renditions[-1]
 518
 519         try:
 520             _,_,ext = rendition.attrib['type'].partition('/')
 521             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
 522             video_url = rendition.find('./src').text
 523         except KeyError:
 524             raise ExtractorError('Invalid rendition field.')
 525
 526         info = {
 527             'id': video_id,
 528             'url': video_url,
 529             'uploader': performer,
 530             'upload_date': None,
 531             'title': video_title,
 532             'ext': ext,
 533             'format': format,
 534         }
 535
 536         return [info]
 537
 538
 539 class YoukuIE(InfoExtractor):
 540     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
 541
 542     def _gen_sid(self):
 543         nowTime = int(time.time() * 1000)
 544         random1 = random.randint(1000,1998)
 545         random2 = random.randint(1000,9999)
 546
 547         return "%d%d%d" %(nowTime,random1,random2)
 548
 549     def _get_file_ID_mix_string(self, seed):
 550         mixed = []
 551         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
 552         seed = float(seed)
 553         for i in range(len(source)):
 554             seed  =  (seed * 211 + 30031 ) % 65536
 555             index  =  math.floor(seed / 65536 * len(source) )
 556             mixed.append(source[int(index)])
 557             source.remove(source[int(index)])
 558         #return ''.join(mixed)
 559         return mixed
 560
 561     def _get_file_id(self, fileId, seed):
 562         mixed = self._get_file_ID_mix_string(seed)
 563         ids = fileId.split('*')
 564         realId = []
 565         for ch in ids:
 566             if ch:
 567                 realId.append(mixed[int(ch)])
 568         return ''.join(realId)
 569
 570     def _real_extract(self, url):
 571         mobj = re.match(self._VALID_URL, url)
 572         if mobj is None:
 573             raise ExtractorError(u'Invalid URL: %s' % url)
 574         video_id = mobj.group('ID')
 575
 576         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
 577
 578         jsondata = self._download_webpage(info_url, video_id)
 579
 580         self.report_extraction(video_id)
 581         try:
 582             config = json.loads(jsondata)
 583
 584             video_title =  config['data'][0]['title']
 585             seed = config['data'][0]['seed']
 586
 587             format = self._downloader.params.get('format', None)
 588             supported_format = list(config['data'][0]['streamfileids'].keys())
 589
 590             if format is None or format == 'best':
 591                 if 'hd2' in supported_format:
 592                     format = 'hd2'
 593                 else:
 594                     format = 'flv'
 595                 ext = u'flv'
 596             elif format == 'worst':
 597                 format = 'mp4'
 598                 ext = u'mp4'
 599             else:
 600                 format = 'flv'
 601                 ext = u'flv'
 602
 603
 604             fileid = config['data'][0]['streamfileids'][format]
 605             keys = [s['k'] for s in config['data'][0]['segs'][format]]
 606         except (UnicodeDecodeError, ValueError, KeyError):
 607             raise ExtractorError(u'Unable to extract info section')
 608
 609         files_info=[]
 610         sid = self._gen_sid()
 611         fileid = self._get_file_id(fileid, seed)
 612
 613         #column 8,9 of fileid represent the segment number
 614         #fileid[7:9] should be changed
 615         for index, key in enumerate(keys):
 616
 617             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
 618             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
 619
 620             info = {
 621                 'id': '%s_part%02d' % (video_id, index),
 622                 'url': download_url,
 623                 'uploader': None,
 624                 'upload_date': None,
 625                 'title': video_title,
 626                 'ext': ext,
 627             }
 628             files_info.append(info)
 629
 630         return files_info
 631
 632
 633 class XNXXIE(InfoExtractor):
 634     """Information extractor for xnxx.com"""
 635
 636     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
 637     IE_NAME = u'xnxx'
 638     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
 639     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
 640     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
 641
 642     def _real_extract(self, url):
 643         mobj = re.match(self._VALID_URL, url)
 644         if mobj is None:
 645             raise ExtractorError(u'Invalid URL: %s' % url)
 646         video_id = mobj.group(1)
 647
 648         # Get webpage content
 649         webpage = self._download_webpage(url, video_id)
 650
 651         video_url = self._search_regex(self.VIDEO_URL_RE,
 652             webpage, u'video URL')
 653         video_url = compat_urllib_parse.unquote(video_url)
 654
 655         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
 656             webpage, u'title')
 657
 658         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
 659             webpage, u'thumbnail', fatal=False)
 660
 661         return [{
 662             'id': video_id,
 663             'url': video_url,
 664             'uploader': None,
 665             'upload_date': None,
 666             'title': video_title,
 667             'ext': 'flv',
 668             'thumbnail': video_thumbnail,
 669             'description': None,
 670         }]
 671
 672
 673
 674 class NBAIE(InfoExtractor):
 675     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
 676     IE_NAME = u'nba'
 677
 678     def _real_extract(self, url):
 679         mobj = re.match(self._VALID_URL, url)
 680         if mobj is None:
 681             raise ExtractorError(u'Invalid URL: %s' % url)
 682
 683         video_id = mobj.group(1)
 684
 685         webpage = self._download_webpage(url, video_id)
 686
 687         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 688
 689         shortened_video_id = video_id.rpartition('/')[2]
 690         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
 691             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 692
 693         # It isn't there in the HTML it returns to us
 694         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 695
 696         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 697
 698         info = {
 699             'id': shortened_video_id,
 700             'url': video_url,
 701             'ext': 'mp4',
 702             'title': title,
 703             # 'uploader_date': uploader_date,
 704             'description': description,
 705         }
 706         return [info]
 707
 708 class JustinTVIE(InfoExtractor):
 709     """Information extractor for justin.tv and twitch.tv"""
 710     # TODO: One broadcast may be split into multiple videos. The key
 711     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
 712     # starts at 1 and increases. Can we treat all parts as one video?
 713
 714     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
 715         (?:
 716             (?P<channelid>[^/]+)|
 717             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
 718             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
 719         )
 720         /?(?:\#.*)?$
 721         """
 722     _JUSTIN_PAGE_LIMIT = 100
 723     IE_NAME = u'justin.tv'
 724
 725     def report_download_page(self, channel, offset):
 726         """Report attempt to download a single page of videos."""
 727         self.to_screen(u'%s: Downloading video information from %d to %d' %
 728                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
 729
 730     # Return count of items, list of *valid* items
 731     def _parse_page(self, url, video_id):
 732         webpage = self._download_webpage(url, video_id,
 733                                          u'Downloading video info JSON',
 734                                          u'unable to download video info JSON')
 735
 736         response = json.loads(webpage)
 737         if type(response) != list:
 738             error_text = response.get('error', 'unknown error')
 739             raise ExtractorError(u'Justin.tv API: %s' % error_text)
 740         info = []
 741         for clip in response:
 742             video_url = clip['video_file_url']
 743             if video_url:
 744                 video_extension = os.path.splitext(video_url)[1][1:]
 745                 video_date = re.sub('-', '', clip['start_time'][:10])
 746                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
 747                 video_id = clip['id']
 748                 video_title = clip.get('title', video_id)
 749                 info.append({
 750                     'id': video_id,
 751                     'url': video_url,
 752                     'title': video_title,
 753                     'uploader': clip.get('channel_name', video_uploader_id),
 754                     'uploader_id': video_uploader_id,
 755                     'upload_date': video_date,
 756                     'ext': video_extension,
 757                 })
 758         return (len(response), info)
 759
 760     def _real_extract(self, url):
 761         mobj = re.match(self._VALID_URL, url)
 762         if mobj is None:
 763             raise ExtractorError(u'invalid URL: %s' % url)
 764
 765         api_base = 'http://api.justin.tv'
 766         paged = False
 767         if mobj.group('channelid'):
 768             paged = True
 769             video_id = mobj.group('channelid')
 770             api = api_base + '/channel/archives/%s.json' % video_id
 771         elif mobj.group('chapterid'):
 772             chapter_id = mobj.group('chapterid')
 773
 774             webpage = self._download_webpage(url, chapter_id)
 775             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
 776             if not m:
 777                 raise ExtractorError(u'Cannot find archive of a chapter')
 778             archive_id = m.group(1)
 779
 780             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
 781             chapter_info_xml = self._download_webpage(api, chapter_id,
 782                                              note=u'Downloading chapter information',
 783                                              errnote=u'Chapter information download failed')
 784             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
 785             for a in doc.findall('.//archive'):
 786                 if archive_id == a.find('./id').text:
 787                     break
 788             else:
 789                 raise ExtractorError(u'Could not find chapter in chapter information')
 790
 791             video_url = a.find('./video_file_url').text
 792             video_ext = video_url.rpartition('.')[2] or u'flv'
 793
 794             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
 795             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
 796                                    note='Downloading chapter metadata',
 797                                    errnote='Download of chapter metadata failed')
 798             chapter_info = json.loads(chapter_info_json)
 799
 800             bracket_start = int(doc.find('.//bracket_start').text)
 801             bracket_end = int(doc.find('.//bracket_end').text)
 802
 803             # TODO determine start (and probably fix up file)
 804             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 805             #video_url += u'?start=' + TODO:start_timestamp
 806             # bracket_start is 13290, but we want 51670615
 807             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
 808                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 809
 810             info = {
 811                 'id': u'c' + chapter_id,
 812                 'url': video_url,
 813                 'ext': video_ext,
 814                 'title': chapter_info['title'],
 815                 'thumbnail': chapter_info['preview'],
 816                 'description': chapter_info['description'],
 817                 'uploader': chapter_info['channel']['display_name'],
 818                 'uploader_id': chapter_info['channel']['name'],
 819             }
 820             return [info]
 821         else:
 822             video_id = mobj.group('videoid')
 823             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 824
 825         self.report_extraction(video_id)
 826
 827         info = []
 828         offset = 0
 829         limit = self._JUSTIN_PAGE_LIMIT
 830         while True:
 831             if paged:
 832                 self.report_download_page(video_id, offset)
 833             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
 834             page_count, page_info = self._parse_page(page_url, video_id)
 835             info.extend(page_info)
 836             if not paged or page_count != limit:
 837                 break
 838             offset += limit
 839         return info
 840
 841 class FunnyOrDieIE(InfoExtractor):
 842     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
 843
 844     def _real_extract(self, url):
 845         mobj = re.match(self._VALID_URL, url)
 846         if mobj is None:
 847             raise ExtractorError(u'invalid URL: %s' % url)
 848
 849         video_id = mobj.group('id')
 850         webpage = self._download_webpage(url, video_id)
 851
 852         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
 853             webpage, u'video URL', flags=re.DOTALL)
 854
 855         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
 856             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
 857
 858         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 859             webpage, u'description', fatal=False, flags=re.DOTALL)
 860
 861         info = {
 862             'id': video_id,
 863             'url': video_url,
 864             'ext': 'mp4',
 865             'title': title,
 866             'description': video_description,
 867         }
 868         return [info]
 869
 870 class SteamIE(InfoExtractor):
 871     _VALID_URL = r"""http://store\.steampowered\.com/
 872                 (agecheck/)?
 873                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
 874                 (?P<gameID>\d+)/?
 875                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
 876                 """
 877     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
 878     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
 879
 880     @classmethod
 881     def suitable(cls, url):
 882         """Receives a URL and returns True if suitable for this IE."""
 883         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 884
 885     def _real_extract(self, url):
 886         m = re.match(self._VALID_URL, url, re.VERBOSE)
 887         gameID = m.group('gameID')
 888
 889         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
 890         webpage = self._download_webpage(videourl, gameID)
 891
 892         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
 893             videourl = self._AGECHECK_TEMPLATE % gameID
 894             self.report_age_confirmation()
 895             webpage = self._download_webpage(videourl, gameID)
 896
 897         self.report_extraction(gameID)
 898         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
 899                                              webpage, 'game title')
 900
 901         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
 902         mweb = re.finditer(urlRE, webpage)
 903         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
 904         titles = re.finditer(namesRE, webpage)
 905         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
 906         thumbs = re.finditer(thumbsRE, webpage)
 907         videos = []
 908         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
 909             video_id = vid.group('videoID')
 910             title = vtitle.group('videoName')
 911             video_url = vid.group('videoURL')
 912             video_thumb = thumb.group('thumbnail')
 913             if not video_url:
 914                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
 915             info = {
 916                 'id':video_id,
 917                 'url':video_url,
 918                 'ext': 'flv',
 919                 'title': unescapeHTML(title),
 920                 'thumbnail': video_thumb
 921                   }
 922             videos.append(info)
 923         return [self.playlist_result(videos, gameID, game_title)]
 924
 925 class UstreamIE(InfoExtractor):
 926     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
 927     IE_NAME = u'ustream'
 928
 929     def _real_extract(self, url):
 930         m = re.match(self._VALID_URL, url)
 931         video_id = m.group('videoID')
 932
 933         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
 934         webpage = self._download_webpage(url, video_id)
 935
 936         self.report_extraction(video_id)
 937
 938         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
 939             webpage, u'title')
 940
 941         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
 942             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 943
 944         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
 945             webpage, u'thumbnail', fatal=False)
 946
 947         info = {
 948                 'id': video_id,
 949                 'url': video_url,
 950                 'ext': 'flv',
 951                 'title': video_title,
 952                 'uploader': uploader,
 953                 'thumbnail': thumbnail,
 954                }
 955         return info
 956
 957 class WorldStarHipHopIE(InfoExtractor):
 958     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
 959     IE_NAME = u'WorldStarHipHop'
 960
 961     def _real_extract(self, url):
 962         m = re.match(self._VALID_URL, url)
 963         video_id = m.group('id')
 964
 965         webpage_src = self._download_webpage(url, video_id)
 966
 967         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
 968             webpage_src, u'video URL')
 969
 970         if 'mp4' in video_url:
 971             ext = 'mp4'
 972         else:
 973             ext = 'flv'
 974
 975         video_title = self._html_search_regex(r"<title>(.*)</title>",
 976             webpage_src, u'title')
 977
 978         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
 979         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
 980             webpage_src, u'thumbnail', fatal=False)
 981
 982         if not thumbnail:
 983             _title = r"""candytitles.*>(.*)</span>"""
 984             mobj = re.search(_title, webpage_src)
 985             if mobj is not None:
 986                 video_title = mobj.group(1)
 987
 988         results = [{
 989                     'id': video_id,
 990                     'url' : video_url,
 991                     'title' : video_title,
 992                     'thumbnail' : thumbnail,
 993                     'ext' : ext,
 994                     }]
 995         return results
 996
 997 class RBMARadioIE(InfoExtractor):
 998     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 999
1000     def _real_extract(self, url):
1001         m = re.match(self._VALID_URL, url)
1002         video_id = m.group('videoID')
1003
1004         webpage = self._download_webpage(url, video_id)
1005
1006         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
1007             webpage, u'json data', flags=re.MULTILINE)
1008
1009         try:
1010             data = json.loads(json_data)
1011         except ValueError as e:
1012             raise ExtractorError(u'Invalid JSON: ' + str(e))
1013
1014         video_url = data['akamai_url'] + '&cbr=256'
1015         url_parts = compat_urllib_parse_urlparse(video_url)
1016         video_ext = url_parts.path.rpartition('.')[2]
1017         info = {
1018                 'id': video_id,
1019                 'url': video_url,
1020                 'ext': video_ext,
1021                 'title': data['title'],
1022                 'description': data.get('teaser_text'),
1023                 'location': data.get('country_of_origin'),
1024                 'uploader': data.get('host', {}).get('name'),
1025                 'uploader_id': data.get('host', {}).get('slug'),
1026                 'thumbnail': data.get('image', {}).get('large_url_2x'),
1027                 'duration': data.get('duration'),
1028         }
1029         return [info]
1030
1031
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of `formats` whose 'format' equals
        `req_format`, or None when there is no such entry."""
        for candidate in formats:
            if candidate["format"] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Return the info dictionaries selected by the 'format' parameter.

        Every link of the page's download list becomes one candidate whose
        'format' is "<size>-<bitrate>" taken from the link path.  The
        downloader parameter 'format' then selects the result:

        * missing or 'best': the first candidate,
        * 'worst': the last candidate,
        * '-1' or 'all': every candidate,
        * anything else: the candidate with exactly that 'format'.

        When the 'listformats' parameter is set, the candidates are printed
        and None is returned.

        Raises ExtractorError for a non-matching URL, invalid or incomplete
        video JSON, an empty download list, or an unavailable format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The page is requested with the age verification cookie set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # Format the exception: concatenating it to a string raises
            # TypeError and would hide the missing key.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>"; keep size and bitrate.
            format_id = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_id,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
1136
1137
1138
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Id and title are taken from the URL itself; the media URL and the
        upload date are taken from the downloaded page.  Raises
        ExtractorError if `url` does not match `_VALID_URL`.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('videoid')
        title = url_match.group('title')

        page = self._download_webpage(url, video_id)

        # The media URL is unquoted after it is pulled out of the page.
        media_url_re = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        media_url = compat_urllib_parse.unquote(
            self._search_regex(media_url_re, page, u'video url'))

        # The upload date is looked up with fatal=False and normalised only
        # when a value was found.
        added_re = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(
            added_re, page, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
1173
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        The title comes from the video page; the media URL comes from the
        embed page linked there.  The returned id is the numeric id of the
        embed URL, not the slug matched in `url`.

        Raises ExtractorError if `url` does not match `_VALID_URL` or the
        page has no embed link.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('videoid')

        page = self._download_webpage(url, slug)
        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', page, u'title').strip()

        # Locate the embed page; its URL carries the numeric video id.
        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)
        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
1214
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at `url`.

        The mix description is read from the `PAGE.mix` assignment in the
        page.  Tracks are then requested one at a time: first through the
        "play" endpoint, afterwards through "next" with the previous track
        id, until the response flags `at_last_track`.

        Raises ExtractorError if `url` does not match `_VALID_URL`.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random number is used as the play session identifier.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total_tracks = mix['tracks_count']

        request_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        track_number = 0
        while True:
            track_number += 1
            response = json.loads(self._download_webpage(
                request_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), total_tracks),
                errnote=u'Failed to download song information'))
            track = response[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if response['set']['at_last_track']:
                return tracks
            request_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
1255
class KeekIE(InfoExtractor):
    """Information extractor for keek.com videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Media and thumbnail URLs are built from the video id alone; title and
        uploader are read from the downloaded page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        media_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail_url = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')

        # The uploader is looked up with fatal=False (optional).
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail_url,
            'uploader': uploader,
        }]
1283
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence the explicit flag.
        return bool(re.match(cls._VALID_URL, url, re.VERBOSE))

    def _real_extract(self, url):
        """Return a one-element list: the talk info or the playlist result."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_re = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_re = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(item_re, page, re.VERBOSE)
        link_matches = re.finditer(link_re, page)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            page, 'playlist title')

        # Items and links are paired by position, so the number of entries is
        # bounded by the shorter sequence; only the link is used for the entry.
        entries = []
        for _item, link in zip(item_matches, link_matches):
            talk_url = 'http://www.ted.com%s' % link.group('talk_url')
            entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        self.report_extraction(talk_name)

        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>', page, 'title')

        # Id and stream list come from the "talkDetails" JSON in the page.
        details_json = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>', page, 'json data')
        details = json.loads(details_json)

        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            page, 'description', flags = re.DOTALL)
        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', page, 'thumbnail')

        return {
            'id': details['id'],
            # The last entry of 'htmlStreams' is the one that gets used.
            'url': details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
1358
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        The video id is the last non-empty path element of `url`; everything
        else is read from the metadata XML requested for that id.  When the
        XML has no <format_id> element, the file extension is used as the
        format.

        Raises ExtractorError when the metadata has no <url_flv> or no
        <title> element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Fall back to the extension; the fallback used to reference an
        # undefined name and raised NameError.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        # Description and thumbnail are optional.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        image_preview_el = metadata.find('imagePreview')
        thumbnail = image_preview_el.text if image_preview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
1412
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        The title is read from the video page; file name and duration are
        read from the last child of the root of the per-video XML document.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_text = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        # Only the last variant listed in the document is used.
        variant = xml.etree.ElementTree.fromstring(xml_text)[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
1444
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    # The scheme is optional and may be http or https.  (The pattern used to
    # read "http?://", which makes the "p" optional and rejects https URLs.)
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        Media URL, title, description and uploader are all read from the
        downloaded page; the "LiveLeak.com -" text is removed from the title.

        Raises ExtractorError if `url` does not match `_VALID_URL`.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are looked up with fatal=False (optional).
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
1481
1482
1483
class TumblrIE(InfoExtractor):
    """Information extractor for videos in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for `url`.

        The post page is always requested through its canonical
        "/post/<id>/" address, rebuilt from the blog name and the id.

        Raises ExtractorError when no video source is found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        page = self._download_webpage(post_url, video_id)

        # In the page the quotes around attribute values appear as "\x22".
        source_re = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        source_match = re.search(source_re, page)
        if source_match is None:
            raise ExtractorError(u'Unable to extract video')
        media_url = source_match.group('video_url')
        extension = source_match.group('ext')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            page, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>', page, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': extension,
        }]
1517
class BandcampIE(InfoExtractor):
    """Information Extractor for Bandcamp tracks offered as free downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the track at url.

        Raises ExtractorError if url does not match _VALID_URL, if the track
        page has no free download link, or if any of the intermediate pages
        does not contain the expected data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unable to parse the download url')
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract the final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
1563
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The mp4 stream is taken from the page's <source> element.
        stream_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')

        heading = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        result = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': heading,
        }
        return [result]
1591
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # All metadata comes from the player's MRSS notice, not the HTML page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so they only match a literal '.'.
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        # The title is wrapped in a CDATA section.
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
1618
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone, dropping anything that
        # followed the id in the URL that was given.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The content attribute may be quoted with either " or ', hence
        # the two alternative groups.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1652
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The first group stops before an optional query string, which is
        # matched by the second group.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1686
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL or if the
        playlist data contains no stream entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret from the page is needed by both data requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 1: the first XML document yields the node id.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Step 2: the playlist for that node holds the stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        m_stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if m_stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is APP followed by the HTML-unescaped FULLPATH.
        video_url = m_stream.group(1) + unescapeHTML(m_stream.group(2))

        # The content attribute may be quoted with either " or ', hence
        # the two alternative groups in the patterns below.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1735
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug identifies the download.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        # Title, thumbnail and description come from the page's og: meta tags.
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location is listed in a separate XML document keyed
        # by the video id.
        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data_xml, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [result]
1774
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so it only matches a literal '.'.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL or if the
        media location cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild the page URL from the id alone.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # No server given: 'file' is itself the (URL-quoted) media URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is reassembled as YYYYMMDD from a
        # 'YYYY-MM-DD HH:MM:SS TZ' hint attribute.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1826
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the track at url.

        Raises ExtractorError if url does not match _VALID_URL, if either
        JSON document cannot be parsed, or if the track list is empty or
        missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this response is sent back with the request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        try:
            track = track_list[u'tracks'][0]
        except (KeyError, IndexError):
            raise ExtractorError(u'Hypemachine contained no tracks.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body is passed so the request carries data.  It is a
        # bytes literal because Python 3's urllib rejects str request
        # bodies; on Python 2, b"" is the same object type as "".
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1876
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL or if the
        info response does not consist of exactly two key=value fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only sets window.location from a script; follow it
        # by hand, appending the new location to the URL that was fetched.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first '/' is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # urlencode() output is plain ASCII; it is encoded to bytes because
        # Python 3's urllib rejects str request bodies.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The response is parsed as two '&'-separated key=value fields: the
        # media URL first, then the thumbnail URL.  Each field is split on
        # the first '=' only, so values that contain '=' are kept whole.
        fields = [field.split('=', 1) for field in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url = fields[0][1]
        thumbnail_url = fields[1][1]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1912
1913
def gen_extractors():
    """Build one fresh instance of each supported extractor.

    The sequence is ordered by priority: the first extractor that matches
    a URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
1983
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    ie_name is the class name without its 'IE' suffix.  A KeyError is
    raised if this module has no global of that name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]