Move Collegehumor IE into its own file
youtube_dl/InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
import json
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.metacafe import MetacafeIE
35 from .extractor.myvideo import MyVideoIE
36 from .extractor.statigram import StatigramIE
37 from .extractor.photobucket import PhotobucketIE
38 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
39 from .extractor.vimeo import VimeoIE
40 from .extractor.yahoo import YahooIE, YahooSearchIE
41 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
42 from .extractor.zdf import ZDFIE
43
44
45 class XVideosIE(InfoExtractor):
46     """Information extractor for xvideos.com"""
47
48     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
49     IE_NAME = u'xvideos'
50
51     def _real_extract(self, url):
52         mobj = re.match(self._VALID_URL, url)
53         if mobj is None:
54             raise ExtractorError(u'Invalid URL: %s' % url)
55         video_id = mobj.group(1)
56
57         webpage = self._download_webpage(url, video_id)
58
59         self.report_extraction(video_id)
60
61         # Extract video URL
62         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
63             webpage, u'video URL'))
64
65         # Extract title
66         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
67             webpage, u'title')
68
69         # Extract video thumbnail
70         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
71             webpage, u'thumbnail', fatal=False)
72
73         info = {
74             'id': video_id,
75             'url': video_url,
76             'uploader': None,
77             'upload_date': None,
78             'title': video_title,
79             'ext': 'flv',
80             'thumbnail': video_thumbnail,
81             'description': None,
82         }
83
84         return [info]
85
86
87
88
89 class InfoQIE(InfoExtractor):
90     """Information extractor for infoq.com"""
91     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
92
93     def _real_extract(self, url):
94         mobj = re.match(self._VALID_URL, url)
95         if mobj is None:
96             raise ExtractorError(u'Invalid URL: %s' % url)
97
98         webpage = self._download_webpage(url, video_id=url)
99         self.report_extraction(url)
100
101         # Extract video URL
102         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
103         if mobj is None:
104             raise ExtractorError(u'Unable to extract video url')
105         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
106         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
107
108         # Extract title
109         video_title = self._search_regex(r'contentTitle = "(.*?)";',
110             webpage, u'title')
111
112         # Extract description
113         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
114             webpage, u'description', fatal=False)
115
116         video_filename = video_url.split('/')[-1]
117         video_id, extension = video_filename.split('.')
118
119         info = {
120             'id': video_id,
121             'url': video_url,
122             'uploader': None,
123             'upload_date': None,
124             'title': video_title,
125             'ext': extension, # Extension is always(?) mp4, but seems to be flv
126             'thumbnail': None,
127             'description': video_description,
128         }
129
130         return [info]
131
132 class MixcloudIE(InfoExtractor):
133     """Information extractor for www.mixcloud.com"""
134
135     _WORKING = False # Disabled until rewritten against the new API, which looks usable: http://www.mixcloud.com/developers/documentation/
136     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
137     IE_NAME = u'mixcloud'
138
139     def report_download_json(self, file_id):
140         """Report JSON download."""
141         self.to_screen(u'Downloading json')
142
143     def get_urls(self, jsonData, fmt, bitrate='best'):
144         """Get urls from 'audio_formats' section in json"""
145         file_url = None
146         try:
147             bitrate_list = jsonData[fmt]
148             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
149                 bitrate = max(bitrate_list) # select highest
150
151             url_list = jsonData[fmt][bitrate]
152         except TypeError: # we have no bitrate info.
153             url_list = jsonData[fmt]
154         return url_list
155
156     def check_urls(self, url_list):
157         """Return the first URL from the list that responds, or None."""
158         for url in url_list:
159             try:
160                 compat_urllib_request.urlopen(url)
161                 return url
162             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
163                 continue
164
165         return None
166
167     def _print_formats(self, formats):
168         print('Available formats:')
169         for fmt in formats.keys():
170             for b in formats[fmt]:
171                 try:
172                     ext = formats[fmt][b][0]
173                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
174                 except TypeError: # we have no bitrate info
175                     ext = formats[fmt][0]
176                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
177                     break
178
179     def _real_extract(self, url):
180         mobj = re.match(self._VALID_URL, url)
181         if mobj is None:
182             raise ExtractorError(u'Invalid URL: %s' % url)
183         # extract uploader & filename from url
184         uploader = mobj.group(1)
185         file_id = uploader + "-" + mobj.group(2)
186
187         # construct API request
188         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
189         # retrieve .json file with links to files
190         request = compat_urllib_request.Request(file_url)
191         try:
192             self.report_download_json(file_url)
193             jsonData = compat_urllib_request.urlopen(request).read()
194         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
195             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
196
197         # parse JSON
198         json_data = json.loads(jsonData)
199         player_url = json_data['player_swf_url']
200         formats = dict(json_data['audio_formats'])
201
202         req_format = self._downloader.params.get('format', None)
203         bitrate = None
204
205         if self._downloader.params.get('listformats', None):
206             self._print_formats(formats)
207             return
208
209         if req_format is None or req_format == 'best':
210             for format_param in formats.keys():
211                 url_list = self.get_urls(formats, format_param)
212                 # check urls
213                 file_url = self.check_urls(url_list)
214                 if file_url is not None:
215                     break # got it!
216         else:
217             if req_format not in formats:
218                 raise ExtractorError(u'Format is not available')
219
220             url_list = self.get_urls(formats, req_format)
221             file_url = self.check_urls(url_list)
222             format_param = req_format
223
224         return [{
225             'id': file_id,
226             'url': file_url,
227             'uploader': uploader,
228             'upload_date': None,
229             'title': json_data['name'],
230             'ext': file_url.split('.')[-1],
231             'format': format_param if format_param is not None else u'NA',
232             'thumbnail': json_data['thumbnail_url'],
233             'description': json_data['description'],
234             'player_url': player_url,
235         }]
236
237 class StanfordOpenClassroomIE(InfoExtractor):
238     """Information extractor for Stanford's Open ClassRoom"""
239
240     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
241     IE_NAME = u'stanfordoc'
242
243     def _real_extract(self, url):
244         mobj = re.match(self._VALID_URL, url)
245         if mobj is None:
246             raise ExtractorError(u'Invalid URL: %s' % url)
247
248         if mobj.group('course') and mobj.group('video'): # A specific video
249             course = mobj.group('course')
250             video = mobj.group('video')
251             info = {
252                 'id': course + '_' + video,
253                 'uploader': None,
254                 'upload_date': None,
255             }
256
257             self.report_extraction(info['id'])
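            # Per-video metadata lives in an XML file alongside the course videos;
            # its <title> and <videoFile> children are read below.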
258             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
259             xmlUrl = baseUrl + video + '.xml'
260             try:
261                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
262             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
263                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
264             mdoc = xml.etree.ElementTree.fromstring(metaXml)
265             try:
266                 info['title'] = mdoc.findall('./title')[0].text
267                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
268             except IndexError:
269                 raise ExtractorError(u'Invalid metadata XML file')
270             info['ext'] = info['url'].rpartition('.')[2]
271             return [info]
272         elif mobj.group('course'): # A course page
273             course = mobj.group('course')
274             info = {
275                 'id': course,
276                 'type': 'playlist',
277                 'uploader': None,
278                 'upload_date': None,
279             }
280
281             coursepage = self._download_webpage(url, info['id'],
282                                         note='Downloading course info page',
283                                         errnote='Unable to download course info page')
284
285             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
286
287             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
288                 coursepage, u'description', fatal=False)
289
290             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
291             info['list'] = [
292                 {
293                     'type': 'reference',
294                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
295                 }
296                     for vpage in links]
297             results = []
298             for entry in info['list']:
299                 assert entry['type'] == 'reference'
300                 results += self.extract(entry['url'])
301             return results
302         else: # Root page
303             info = {
304                 'id': 'Stanford OpenClassroom',
305                 'type': 'playlist',
306                 'uploader': None,
307                 'upload_date': None,
308             }
309
310             self.report_download_webpage(info['id'])
311             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
312             try:
313                 rootpage = compat_urllib_request.urlopen(rootURL).read()
314             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
315                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
316
317             info['title'] = info['id']
318
319             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
320             info['list'] = [
321                 {
322                     'type': 'reference',
323                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
324                 }
325                     for cpage in links]
326
327             results = []
328             for entry in info['list']:
329                 assert entry['type'] == 'reference'
330                 results += self.extract(entry['url'])
331             return results
332
333 class MTVIE(InfoExtractor):
334     """Information extractor for MTV.com"""
335
336     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
337     IE_NAME = u'mtv'
338
339     def _real_extract(self, url):
340         mobj = re.match(self._VALID_URL, url)
341         if mobj is None:
342             raise ExtractorError(u'Invalid URL: %s' % url)
343         if not mobj.group('proto'):
344             url = 'http://' + url
345         video_id = mobj.group('videoid')
346
347         webpage = self._download_webpage(url, video_id)
348
349         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
350             webpage, u'song name', fatal=False)
351
352         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
353             webpage, u'performer')
        video_title = performer + u' - ' + song_name if song_name else performer
354
355         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
356             webpage, u'mtvn_uri', fatal=False)
357
358         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
359             webpage, u'content id', fatal=False)
360
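        # The mediaGen endpoint returns an XML document listing <rendition> elements that
        # carry width/height/bitrate attributes and a <src> child with the actual stream URL.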
361         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
362         self.report_extraction(video_id)
363         request = compat_urllib_request.Request(videogen_url)
364         try:
365             metadataXml = compat_urllib_request.urlopen(request).read()
366         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
367             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
368
369         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
370         renditions = mdoc.findall('.//rendition')
371
372         # For now, always pick the highest quality.
373         rendition = renditions[-1]
374
375         try:
376             _,_,ext = rendition.attrib['type'].partition('/')
377             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
378             video_url = rendition.find('./src').text
379         except KeyError:
380             raise ExtractorError('Invalid rendition field.')
381
382         info = {
383             'id': video_id,
384             'url': video_url,
385             'uploader': performer,
386             'upload_date': None,
387             'title': video_title,
388             'ext': ext,
389             'format': format,
390         }
391
392         return [info]
393
394
395 class YoukuIE(InfoExtractor):
396     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
397
398     def _gen_sid(self):
399         nowTime = int(time.time() * 1000)
400         random1 = random.randint(1000,1998)
401         random2 = random.randint(1000,9999)
402
403         return "%d%d%d" %(nowTime,random1,random2)
404
405     def _get_file_ID_mix_string(self, seed):
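        # Derive a deterministic permutation of the charset from the seed via a simple
        # linear-congruential step, presumably mirroring the scrambling done by the Youku player.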
406         mixed = []
407         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
408         seed = float(seed)
409         for i in range(len(source)):
410             seed  =  (seed * 211 + 30031 ) % 65536
411             index  =  math.floor(seed / 65536 * len(source) )
412             mixed.append(source[int(index)])
413             source.remove(source[int(index)])
414         #return ''.join(mixed)
415         return mixed
416
417     def _get_file_id(self, fileId, seed):
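        # fileId is a '*'-separated list of indices into the permuted charset; mapping
        # each index back through the permutation yields the real file id.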
418         mixed = self._get_file_ID_mix_string(seed)
419         ids = fileId.split('*')
420         realId = []
421         for ch in ids:
422             if ch:
423                 realId.append(mixed[int(ch)])
424         return ''.join(realId)
425
426     def _real_extract(self, url):
427         mobj = re.match(self._VALID_URL, url)
428         if mobj is None:
429             raise ExtractorError(u'Invalid URL: %s' % url)
430         video_id = mobj.group('ID')
431
432         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
433
434         jsondata = self._download_webpage(info_url, video_id)
435
436         self.report_extraction(video_id)
437         try:
438             config = json.loads(jsondata)
439
440             video_title =  config['data'][0]['title']
441             seed = config['data'][0]['seed']
442
443             format = self._downloader.params.get('format', None)
444             supported_format = list(config['data'][0]['streamfileids'].keys())
445
446             if format is None or format == 'best':
447                 if 'hd2' in supported_format:
448                     format = 'hd2'
449                 else:
450                     format = 'flv'
451                 ext = u'flv'
452             elif format == 'worst':
453                 format = 'mp4'
454                 ext = u'mp4'
455             else:
456                 format = 'flv'
457                 ext = u'flv'
458
459
460             fileid = config['data'][0]['streamfileids'][format]
461             keys = [s['k'] for s in config['data'][0]['segs'][format]]
462         except (UnicodeDecodeError, ValueError, KeyError):
463             raise ExtractorError(u'Unable to extract info section')
464
465         files_info=[]
466         sid = self._gen_sid()
467         fileid = self._get_file_id(fileid, seed)
468
469         # characters 8 and 9 of fileid encode the segment number,
470         # so fileid[8:10] is replaced with the segment index for each part
471         for index, key in enumerate(keys):
472
473             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
474             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
475
476             info = {
477                 'id': '%s_part%02d' % (video_id, index),
478                 'url': download_url,
479                 'uploader': None,
480                 'upload_date': None,
481                 'title': video_title,
482                 'ext': ext,
483             }
484             files_info.append(info)
485
486         return files_info
487
488
489 class XNXXIE(InfoExtractor):
490     """Information extractor for xnxx.com"""
491
492     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
493     IE_NAME = u'xnxx'
494     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
495     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
496     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
497
498     def _real_extract(self, url):
499         mobj = re.match(self._VALID_URL, url)
500         if mobj is None:
501             raise ExtractorError(u'Invalid URL: %s' % url)
502         video_id = mobj.group(1)
503
504         # Get webpage content
505         webpage = self._download_webpage(url, video_id)
506
507         video_url = self._search_regex(self.VIDEO_URL_RE,
508             webpage, u'video URL')
509         video_url = compat_urllib_parse.unquote(video_url)
510
511         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
512             webpage, u'title')
513
514         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
515             webpage, u'thumbnail', fatal=False)
516
517         return [{
518             'id': video_id,
519             'url': video_url,
520             'uploader': None,
521             'upload_date': None,
522             'title': video_title,
523             'ext': 'flv',
524             'thumbnail': video_thumbnail,
525             'description': None,
526         }]
527
528
529
530 class NBAIE(InfoExtractor):
531     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
532     IE_NAME = u'nba'
533
534     def _real_extract(self, url):
535         mobj = re.match(self._VALID_URL, url)
536         if mobj is None:
537             raise ExtractorError(u'Invalid URL: %s' % url)
538
539         video_id = mobj.group(1)
540
541         webpage = self._download_webpage(url, video_id)
542
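        # The page path maps straight onto Turner's CDN layout; the 1280x720 mp4
        # rendition is assumed to exist for every video id.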
543         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
544
545         shortened_video_id = video_id.rpartition('/')[2]
546         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
547             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
548
549         # The upload date is not present in the HTML that nba.com returns to us
550         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
551
552         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
553
554         info = {
555             'id': shortened_video_id,
556             'url': video_url,
557             'ext': 'mp4',
558             'title': title,
559             # 'uploader_date': uploader_date,
560             'description': description,
561         }
562         return [info]
563
564 class JustinTVIE(InfoExtractor):
565     """Information extractor for justin.tv and twitch.tv"""
566     # TODO: One broadcast may be split into multiple videos. The key
567     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
568     # starts at 1 and increases. Can we treat all parts as one video?
569
570     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
571         (?:
572             (?P<channelid>[^/]+)|
573             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
574             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
575         )
576         /?(?:\#.*)?$
577         """
578     _JUSTIN_PAGE_LIMIT = 100
579     IE_NAME = u'justin.tv'
580
581     def report_download_page(self, channel, offset):
582         """Report attempt to download a single page of videos."""
583         self.to_screen(u'%s: Downloading video information from %d to %d' %
584                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
585
586     # Return count of items, list of *valid* items
587     def _parse_page(self, url, video_id):
588         webpage = self._download_webpage(url, video_id,
589                                          u'Downloading video info JSON',
590                                          u'unable to download video info JSON')
591
592         response = json.loads(webpage)
593         if not isinstance(response, list):
594             error_text = response.get('error', 'unknown error')
595             raise ExtractorError(u'Justin.tv API: %s' % error_text)
596         info = []
597         for clip in response:
598             video_url = clip['video_file_url']
599             if video_url:
600                 video_extension = os.path.splitext(video_url)[1][1:]
601                 video_date = re.sub('-', '', clip['start_time'][:10])
602                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
603                 video_id = clip['id']
604                 video_title = clip.get('title', video_id)
605                 info.append({
606                     'id': video_id,
607                     'url': video_url,
608                     'title': video_title,
609                     'uploader': clip.get('channel_name', video_uploader_id),
610                     'uploader_id': video_uploader_id,
611                     'upload_date': video_date,
612                     'ext': video_extension,
613                 })
614         return (len(response), info)
615
616     def _real_extract(self, url):
617         mobj = re.match(self._VALID_URL, url)
618         if mobj is None:
619             raise ExtractorError(u'invalid URL: %s' % url)
620
621         api_base = 'http://api.justin.tv'
622         paged = False
623         if mobj.group('channelid'):
624             paged = True
625             video_id = mobj.group('channelid')
626             api = api_base + '/channel/archives/%s.json' % video_id
627         elif mobj.group('chapterid'):
628             chapter_id = mobj.group('chapterid')
629
630             webpage = self._download_webpage(url, chapter_id)
631             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
632             if not m:
633                 raise ExtractorError(u'Cannot find archive of a chapter')
634             archive_id = m.group(1)
635
636             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
637             chapter_info_xml = self._download_webpage(api, chapter_id,
638                                              note=u'Downloading chapter information',
639                                              errnote=u'Chapter information download failed')
640             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
641             for a in doc.findall('.//archive'):
642                 if archive_id == a.find('./id').text:
643                     break
644             else:
645                 raise ExtractorError(u'Could not find chapter in chapter information')
646
647             video_url = a.find('./video_file_url').text
648             video_ext = video_url.rpartition('.')[2] or u'flv'
649
650             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
651             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
652                                    note='Downloading chapter metadata',
653                                    errnote='Download of chapter metadata failed')
654             chapter_info = json.loads(chapter_info_json)
655
656             bracket_start = int(doc.find('.//bracket_start').text)
657             bracket_end = int(doc.find('.//bracket_end').text)
658
659             # TODO determine start (and probably fix up file)
660             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
661             #video_url += u'?start=' + TODO:start_timestamp
662             # bracket_start is 13290, but we want 51670615
663             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
664                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
665
666             info = {
667                 'id': u'c' + chapter_id,
668                 'url': video_url,
669                 'ext': video_ext,
670                 'title': chapter_info['title'],
671                 'thumbnail': chapter_info['preview'],
672                 'description': chapter_info['description'],
673                 'uploader': chapter_info['channel']['display_name'],
674                 'uploader_id': chapter_info['channel']['name'],
675             }
676             return [info]
677         else:
678             video_id = mobj.group('videoid')
679             api = api_base + '/broadcast/by_archive/%s.json' % video_id
680
681         self.report_extraction(video_id)
682
683         info = []
684         offset = 0
685         limit = self._JUSTIN_PAGE_LIMIT
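        # Channel archives are paged: request offset/limit windows until a page comes
        # back with fewer than `limit` entries (for single videos only one request is made).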
686         while True:
687             if paged:
688                 self.report_download_page(video_id, offset)
689             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
690             page_count, page_info = self._parse_page(page_url, video_id)
691             info.extend(page_info)
692             if not paged or page_count != limit:
693                 break
694             offset += limit
695         return info
696
697 class FunnyOrDieIE(InfoExtractor):
698     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
699
700     def _real_extract(self, url):
701         mobj = re.match(self._VALID_URL, url)
702         if mobj is None:
703             raise ExtractorError(u'invalid URL: %s' % url)
704
705         video_id = mobj.group('id')
706         webpage = self._download_webpage(url, video_id)
707
708         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
709             webpage, u'video URL', flags=re.DOTALL)
710
711         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
712             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
713
714         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
715             webpage, u'description', fatal=False, flags=re.DOTALL)
716
717         info = {
718             'id': video_id,
719             'url': video_url,
720             'ext': 'mp4',
721             'title': title,
722             'description': video_description,
723         }
724         return [info]
725
726 class SteamIE(InfoExtractor):
727     _VALID_URL = r"""http://store\.steampowered\.com/
728                 (agecheck/)?
729                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
730                 (?P<gameID>\d+)/?
731                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
732                 """
733     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
734     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
735
736     @classmethod
737     def suitable(cls, url):
738         """Receives a URL and returns True if suitable for this IE."""
739         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
740
741     def _real_extract(self, url):
742         m = re.match(self._VALID_URL, url, re.VERBOSE)
743         gameID = m.group('gameID')
744
745         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
746         webpage = self._download_webpage(videourl, gameID)
747
748         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
749             videourl = self._AGECHECK_TEMPLATE % gameID
750             self.report_age_confirmation()
751             webpage = self._download_webpage(videourl, gameID)
752
753         self.report_extraction(gameID)
754         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
755                                              webpage, 'game title')
756
757         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
758         mweb = re.finditer(urlRE, webpage)
759         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
760         titles = re.finditer(namesRE, webpage)
761         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
762         thumbs = re.finditer(thumbsRE, webpage)
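        # The movie entries, titles and thumbnails are assumed to appear in the same order
        # in the page markup, so the three iterators are consumed in lockstep below.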
763         videos = []
764         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
765             video_id = vid.group('videoID')
766             title = vtitle.group('videoName')
767             video_url = vid.group('videoURL')
768             video_thumb = thumb.group('thumbnail')
769             if not video_url:
770                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
771             info = {
772                 'id':video_id,
773                 'url':video_url,
774                 'ext': 'flv',
775                 'title': unescapeHTML(title),
776                 'thumbnail': video_thumb
777                   }
778             videos.append(info)
779         return [self.playlist_result(videos, gameID, game_title)]
780
781 class UstreamIE(InfoExtractor):
782     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
783     IE_NAME = u'ustream'
784
785     def _real_extract(self, url):
786         m = re.match(self._VALID_URL, url)
787         video_id = m.group('videoID')
788
789         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
790         webpage = self._download_webpage(url, video_id)
791
792         self.report_extraction(video_id)
793
794         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
795             webpage, u'title')
796
797         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
798             webpage, u'uploader', fatal=False, flags=re.DOTALL)
799
800         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
801             webpage, u'thumbnail', fatal=False)
802
803         info = {
804                 'id': video_id,
805                 'url': video_url,
806                 'ext': 'flv',
807                 'title': video_title,
808                 'uploader': uploader,
809                 'thumbnail': thumbnail,
810                }
811         return info
812
813 class WorldStarHipHopIE(InfoExtractor):
814     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
815     IE_NAME = u'WorldStarHipHop'
816
817     def _real_extract(self, url):
818         m = re.match(self._VALID_URL, url)
819         video_id = m.group('id')
820
821         webpage_src = self._download_webpage(url, video_id)
822
823         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
824             webpage_src, u'video URL')
825
826         if 'mp4' in video_url:
827             ext = 'mp4'
828         else:
829             ext = 'flv'
830
831         video_title = self._html_search_regex(r"<title>(.*)</title>",
832             webpage_src, u'title')
833
834         # Get the thumbnail; if none is found, this is a WSHH candy video and the real title is extracted below.
835         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
836             webpage_src, u'thumbnail', fatal=False)
837
838         if not thumbnail:
839             _title = r"""candytitles.*>(.*)</span>"""
840             mobj = re.search(_title, webpage_src)
841             if mobj is not None:
842                 video_title = mobj.group(1)
843
844         results = [{
845                     'id': video_id,
846                     'url' : video_url,
847                     'title' : video_title,
848                     'thumbnail' : thumbnail,
849                     'ext' : ext,
850                     }]
851         return results
852
853 class RBMARadioIE(InfoExtractor):
854     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
855
856     def _real_extract(self, url):
857         m = re.match(self._VALID_URL, url)
858         video_id = m.group('videoID')
859
860         webpage = self._download_webpage(url, video_id)
861
862         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
863             webpage, u'json data', flags=re.MULTILINE)
864
865         try:
866             data = json.loads(json_data)
867         except ValueError as e:
868             raise ExtractorError(u'Invalid JSON: ' + str(e))
869
870         video_url = data['akamai_url'] + '&cbr=256'
871         url_parts = compat_urllib_parse_urlparse(video_url)
872         video_ext = url_parts.path.rpartition('.')[2]
873         info = {
874                 'id': video_id,
875                 'url': video_url,
876                 'ext': video_ext,
877                 'title': data['title'],
878                 'description': data.get('teaser_text'),
879                 'location': data.get('country_of_origin'),
880                 'uploader': data.get('host', {}).get('name'),
881                 'uploader_id': data.get('host', {}).get('slug'),
882                 'thumbnail': data.get('image', {}).get('large_url_2x'),
883                 'duration': data.get('duration'),
884         }
885         return [info]
886
887
888 class YouPornIE(InfoExtractor):
889     """Information extractor for youporn.com."""
890     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
891
892     def _print_formats(self, formats):
893         """Print all available formats"""
894         print(u'Available formats:')
895         print(u'ext\t\tformat')
896         print(u'---------------------------------')
897         for format in formats:
898             print(u'%s\t\t%s'  % (format['ext'], format['format']))
899
900     def _specific(self, req_format, formats):
901         for x in formats:
902             if x['format'] == req_format:
903                 return x
904         return None
905
906     def _real_extract(self, url):
907         mobj = re.match(self._VALID_URL, url)
908         if mobj is None:
909             raise ExtractorError(u'Invalid URL: %s' % url)
910         video_id = mobj.group('videoid')
911
912         req = compat_urllib_request.Request(url)
913         req.add_header('Cookie', 'age_verified=1')
914         webpage = self._download_webpage(req, video_id)
915
916         # Get JSON parameters
917         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
918         try:
919             params = json.loads(json_params)
920         except ValueError:
921             raise ExtractorError(u'Invalid JSON')
922
923         self.report_extraction(video_id)
924         try:
925             video_title = params['title']
926             upload_date = unified_strdate(params['release_date_f'])
927             video_description = params['description']
928             video_uploader = params['submitted_by']
929             thumbnail = params['thumbnails'][0]['image']
930         except KeyError as e:
931             raise ExtractorError(u'Missing JSON parameter: %s' % e)
932
933         # Get all of the formats available
934         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
935         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
936             webpage, u'download list').strip()
937
938         # Get all of the links from the page
939         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
940         links = re.findall(LINK_RE, download_list_html)
941         if len(links) == 0:
942             raise ExtractorError(u'No known formats available for video')
943
944         self.to_screen(u'Links found: %d' % len(links))
945
946         formats = []
947         for link in links:
948
949             # A link looks like this:
950             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
951             # A path looks like this:
952             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
953             video_url = unescapeHTML( link )
954             path = compat_urllib_parse_urlparse( video_url ).path
955             extension = os.path.splitext( path )[1][1:]
956             format = path.split('/')[4].split('_')[:2]
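            # e.g. the path segment '480p_370k_8004515' yields size '480p' and bitrate '370k'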
957             size = format[0]
958             bitrate = format[1]
959             format = "-".join( format )
960             # title = u'%s-%s-%s' % (video_title, size, bitrate)
961
962             formats.append({
963                 'id': video_id,
964                 'url': video_url,
965                 'uploader': video_uploader,
966                 'upload_date': upload_date,
967                 'title': video_title,
968                 'ext': extension,
969                 'format': format,
970                 'thumbnail': thumbnail,
971                 'description': video_description
972             })
973
974         if self._downloader.params.get('listformats', None):
975             self._print_formats(formats)
976             return
977
978         req_format = self._downloader.params.get('format', None)
979         self.to_screen(u'Format: %s' % req_format)
980
981         if req_format is None or req_format == 'best':
982             return [formats[0]]
983         elif req_format == 'worst':
984             return [formats[-1]]
985         elif req_format in ('-1', 'all'):
986             return formats
987         else:
988             format = self._specific( req_format, formats )
989             if format is None:
990                 raise ExtractorError(u'Requested format not available')
991             return [format]
992
993
994
995 class PornotubeIE(InfoExtractor):
996     """Information extractor for pornotube.com."""
997     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
998
999     def _real_extract(self, url):
1000         mobj = re.match(self._VALID_URL, url)
1001         if mobj is None:
1002             raise ExtractorError(u'Invalid URL: %s' % url)
1003
1004         video_id = mobj.group('videoid')
1005         video_title = mobj.group('title')
1006
1007         # Get webpage content
1008         webpage = self._download_webpage(url, video_id)
1009
1010         # Get the video URL
1011         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
1012         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
1013         video_url = compat_urllib_parse.unquote(video_url)
1014
1015         #Get the uploaded date
1016         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
1017         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
1018         if upload_date: upload_date = unified_strdate(upload_date)
1019
1020         info = {'id': video_id,
1021                 'url': video_url,
1022                 'uploader': None,
1023                 'upload_date': upload_date,
1024                 'title': video_title,
1025                 'ext': 'flv',
1026                 'format': 'flv'}
1027
1028         return [info]
1029
1030 class YouJizzIE(InfoExtractor):
1031     """Information extractor for youjizz.com."""
1032     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
1033
1034     def _real_extract(self, url):
1035         mobj = re.match(self._VALID_URL, url)
1036         if mobj is None:
1037             raise ExtractorError(u'Invalid URL: %s' % url)
1038
1039         video_id = mobj.group('videoid')
1040
1041         # Get webpage content
1042         webpage = self._download_webpage(url, video_id)
1043
1044         # Get the video title
1045         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
1046             webpage, u'title').strip()
1047
1048         # Get the embed page
1049         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
1050         if result is None:
1051             raise ExtractorError(u'Unable to extract embed page')
1052
1053         embed_page_url = result.group(0).strip()
1054         video_id = result.group('videoid')
1055
1056         webpage = self._download_webpage(embed_page_url, video_id)
1057
1058         # Get the video URL
1059         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
1060             webpage, u'video URL')
1061
1062         info = {'id': video_id,
1063                 'url': video_url,
1064                 'title': video_title,
1065                 'ext': 'flv',
1066                 'format': 'flv',
1067                 'player_url': embed_page_url}
1068
1069         return [info]
1070
1071 class EightTracksIE(InfoExtractor):
1072     IE_NAME = '8tracks'
1073     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
1074
1075     def _real_extract(self, url):
1076         mobj = re.match(self._VALID_URL, url)
1077         if mobj is None:
1078             raise ExtractorError(u'Invalid URL: %s' % url)
1079         playlist_id = mobj.group('id')
1080
1081         webpage = self._download_webpage(url, playlist_id)
1082
1083         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
1084         data = json.loads(json_like)
1085
1086         session = str(random.randint(0, 1000000000))
1087         mix_id = data['id']
1088         track_count = data['tracks_count']
1089         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
1090         next_url = first_url
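        # The 8tracks API hands out one track per request; keep asking for the next
        # track until the response sets 'at_last_track'.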
1091         res = []
1092         for i in itertools.count():
1093             api_json = self._download_webpage(next_url, playlist_id,
1094                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
1095                 errnote=u'Failed to download song information')
1096             api_data = json.loads(api_json)
1097             track_data = api_data[u'set']['track']
1098             info = {
1099                 'id': track_data['id'],
1100                 'url': track_data['track_file_stream_url'],
1101                 'title': track_data['performer'] + u' - ' + track_data['name'],
1102                 'raw_title': track_data['name'],
1103                 'uploader_id': data['user']['login'],
1104                 'ext': 'm4a',
1105             }
1106             res.append(info)
1107             if api_data['set']['at_last_track']:
1108                 break
1109             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
1110         return res
1111
1112 class KeekIE(InfoExtractor):
1113     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
1114     IE_NAME = u'keek'
1115
1116     def _real_extract(self, url):
1117         m = re.match(self._VALID_URL, url)
1118         video_id = m.group('videoID')
1119
1120         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
1121         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
1122         webpage = self._download_webpage(url, video_id)
1123
1124         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1125             webpage, u'title')
1126
1127         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
1128             webpage, u'uploader', fatal=False)
1129
1130         info = {
1131                 'id': video_id,
1132                 'url': video_url,
1133                 'ext': 'mp4',
1134                 'title': video_title,
1135                 'thumbnail': thumbnail,
1136                 'uploader': uploader
1137         }
1138         return [info]
1139
1140 class TEDIE(InfoExtractor):
1141     _VALID_URL=r'''http://www\.ted\.com/
1142                    (
1143                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
1144                         |
1145                         ((?P<type_talk>talks)) # We have a simple talk
1146                    )
1147                    (/lang/(.*?))? # The url may contain the language
1148                    /(?P<name>\w+) # Here goes the name and then ".html"
1149                    '''
1150
1151     @classmethod
1152     def suitable(cls, url):
1153         """Receives a URL and returns True if suitable for this IE."""
1154         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1155
1156     def _real_extract(self, url):
1157         m = re.match(self._VALID_URL, url, re.VERBOSE)
1158         if m.group('type_talk'):
1159             return [self._talk_info(url)]
1160         else:
1161             playlist_id = m.group('playlist_id')
1162             name = m.group('name')
1163             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
1164             return [self._playlist_videos_info(url, name, playlist_id)]
1165
1166     def _playlist_videos_info(self,url,name,playlist_id=0):
1167         '''Returns the videos of the playlist'''
1168         video_RE=r'''
1169                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
1170                      ([.\s]*?)data-playlist_item_id="(\d+)"
1171                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
1172                      '''
1173         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
1174         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
1175         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
1176         m_names=re.finditer(video_name_RE,webpage)
1177
1178         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1179                                                  webpage, 'playlist title')
1180
1181         playlist_entries = []
1182         for m_video, m_name in zip(m_videos,m_names):
1183             video_id=m_video.group('video_id')
1184             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1185             playlist_entries.append(self.url_result(talk_url, 'TED'))
1186         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1187
1188     def _talk_info(self, url, video_id=0):
1189         """Return the video for the talk in the url"""
1190         m = re.match(self._VALID_URL, url,re.VERBOSE)
1191         video_name = m.group('name')
1192         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1193         self.report_extraction(video_name)
1194         # If the url includes the language we get the title translated
1195         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1196                                         webpage, 'title')
1197         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1198                                     webpage, 'json data')
1199         info = json.loads(json_data)
1200         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1201                                        webpage, 'description', flags = re.DOTALL)
1202         
1203         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1204                                        webpage, 'thumbnail')
1205         info = {
1206                 'id': info['id'],
1207                 'url': info['htmlStreams'][-1]['file'],
1208                 'ext': 'mp4',
1209                 'title': title,
1210                 'thumbnail': thumbnail,
1211                 'description': desc,
1212                 }
1213         return info
1214
1215 class MySpassIE(InfoExtractor):
1216     _VALID_URL = r'http://www.myspass.de/.*'
1217
1218     def _real_extract(self, url):
1219         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1220
1221         # video id is the last path element of the URL
1222         # usually there is a trailing slash, so also try the second but last
1223         url_path = compat_urllib_parse_urlparse(url).path
1224         url_parent_path, video_id = os.path.split(url_path)
1225         if not video_id:
1226             _, video_id = os.path.split(url_parent_path)
1227
1228         # get metadata
1229         metadata_url = META_DATA_URL_TEMPLATE % video_id
1230         metadata_text = self._download_webpage(metadata_url, video_id)
1231         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1232
1233         # extract values from metadata
1234         url_flv_el = metadata.find('url_flv')
1235         if url_flv_el is None:
1236             raise ExtractorError(u'Unable to extract download url')
1237         video_url = url_flv_el.text
1238         extension = os.path.splitext(video_url)[1][1:]
1239         title_el = metadata.find('title')
1240         if title_el is None:
1241             raise ExtractorError(u'Unable to extract title')
1242         title = title_el.text
1243         format_id_el = metadata.find('format_id')
1244         if format_id_el is None:
1245             format = extension
1246         else:
1247             format = format_id_el.text
1248         description_el = metadata.find('description')
1249         if description_el is not None:
1250             description = description_el.text
1251         else:
1252             description = None
1253         imagePreview_el = metadata.find('imagePreview')
1254         if imagePreview_el is not None:
1255             thumbnail = imagePreview_el.text
1256         else:
1257             thumbnail = None
1258         info = {
1259             'id': video_id,
1260             'url': video_url,
1261             'title': title,
1262             'ext': extension,
1263             'format': format,
1264             'thumbnail': thumbnail,
1265             'description': description
1266         }
1267         return [info]
1268
1269 class SpiegelIE(InfoExtractor):
1270     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1271
1272     def _real_extract(self, url):
1273         m = re.match(self._VALID_URL, url)
1274         video_id = m.group('videoID')
1275
1276         webpage = self._download_webpage(url, video_id)
1277
1278         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1279             webpage, u'title')
1280
1281         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1282         xml_code = self._download_webpage(xml_url, video_id,
1283                     note=u'Downloading XML', errnote=u'Failed to download XML')
1284
1285         idoc = xml.etree.ElementTree.fromstring(xml_code)
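        # Each child of the XML root describes one encoding; the last one listed is
        # assumed to be the best quality, so its filename and duration are used.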
1286         last_type = idoc[-1]
1287         filename = last_type.findall('./filename')[0].text
1288         duration = float(last_type.findall('./duration')[0].text)
1289
1290         video_url = 'http://video2.spiegel.de/flash/' + filename
1291         video_ext = filename.rpartition('.')[2]
1292         info = {
1293             'id': video_id,
1294             'url': video_url,
1295             'ext': video_ext,
1296             'title': video_title,
1297             'duration': duration,
1298         }
1299         return [info]
1300
1301 class LiveLeakIE(InfoExtractor):
1302
1303     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1304     IE_NAME = u'liveleak'
1305
1306     def _real_extract(self, url):
1307         mobj = re.match(self._VALID_URL, url)
1308         if mobj is None:
1309             raise ExtractorError(u'Invalid URL: %s' % url)
1310
1311         video_id = mobj.group('video_id')
1312
1313         webpage = self._download_webpage(url, video_id)
1314
1315         video_url = self._search_regex(r'file: "(.*?)",',
1316             webpage, u'video URL')
1317
1318         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1319             webpage, u'title').replace('LiveLeak.com -', '').strip()
1320
1321         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1322             webpage, u'description', fatal=False)
1323
1324         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1325             webpage, u'uploader', fatal=False)
1326
1327         info = {
1328             'id':  video_id,
1329             'url': video_url,
1330             'ext': 'mp4',
1331             'title': video_title,
1332             'description': video_description,
1333             'uploader': video_uploader
1334         }
1335
1336         return [info]
1337
1338
1339
1340 class TumblrIE(InfoExtractor):
1341     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1342
1343     def _real_extract(self, url):
1344         m_url = re.match(self._VALID_URL, url)
1345         video_id = m_url.group('id')
1346         blog = m_url.group('blog_name')
1347
1348         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1349         webpage = self._download_webpage(url, video_id)
1350
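             # Quotes in the embedded player markup are escaped as \x22, hence the \\x22 in the pattern below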
1351         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1352         video = re.search(re_video, webpage)
1353         if video is None:
1354             raise ExtractorError(u'Unable to extract video')
1355         video_url = video.group('video_url')
1356         ext = video.group('ext')
1357
1358         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1359             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1360         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1361
1362         # The <title> tag is the only place to get a title; it may be incomplete,
1363         # but searching other places doesn't work for all videos
1364         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1365             webpage, u'title', flags=re.DOTALL)
1366
1367         return [{'id': video_id,
1368                  'url': video_url,
1369                  'title': video_title,
1370                  'thumbnail': video_thumbnail,
1371                  'ext': ext
1372                  }]
1373
1374 class BandcampIE(InfoExtractor):
1375     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1376
1377     def _real_extract(self, url):
1378         mobj = re.match(self._VALID_URL, url)
1379         title = mobj.group('title')
1380         webpage = self._download_webpage(url, title)
1381         # We get the link to the free download page
1382         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1383         if m_download is None:
1384             raise ExtractorError(u'No free songs found')
1385
1386         download_link = m_download.group(1)
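             # The numeric track id is read from the TralbumData javascript object embedded in the page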
1387         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1388                        webpage, re.MULTILINE|re.DOTALL).group('id')
1389
1390         download_webpage = self._download_webpage(download_link, id,
1391                                                   'Downloading free downloads page')
1392         # We get the dictionary of the track from some javascript code
1393         info = re.search(r'items: (.*?),$',
1394                          download_webpage, re.MULTILINE).group(1)
1395         info = json.loads(info)[0]
1396         # We pick mp3-320 for now, until format selection can be easily implemented.
1397         mp3_info = info[u'downloads'][u'mp3-320']
1398         # If we try to use this url it says the link has expired
1399         initial_url = mp3_info[u'url']
1400         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1401         m_url = re.match(re_url, initial_url)
1402         # We build the url we will use to get the final track url
1403         # This url is built by Bandcamp in the script download_bunde_*.js
1404         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1405         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1406         # If we could correctly generate the .rand field the url would be
1407         # in the "download_url" key
1408         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1409
1410         track_info = {'id': id,
1411                       'title': info[u'title'],
1412                       'ext': 'mp3',
1413                       'url': final_url,
1414                       'thumbnail': info[u'thumb_url'],
1415                       'uploader': info[u'artist']
1416                       }
1417
1418         return [track_info]
1419
1420 class RedTubeIE(InfoExtractor):
1421     """Information Extractor for redtube"""
1422     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1423
1424     def _real_extract(self,url):
1425         mobj = re.match(self._VALID_URL, url)
1426         if mobj is None:
1427             raise ExtractorError(u'Invalid URL: %s' % url)
1428
1429         video_id = mobj.group('id')
1430         video_extension = 'mp4'
1431         webpage = self._download_webpage(url, video_id)
1432
1433         self.report_extraction(video_id)
1434
1435         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1436             webpage, u'video URL')
1437
1438         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1439             webpage, u'title')
1440
1441         return [{
1442             'id':       video_id,
1443             'url':      video_url,
1444             'ext':      video_extension,
1445             'title':    video_title,
1446         }]
1447         
1448 class InaIE(InfoExtractor):
1449     """Information Extractor for Ina.fr"""
1450     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1451
1452     def _real_extract(self,url):
1453         mobj = re.match(self._VALID_URL, url)
1454
1455         video_id = mobj.group('id')
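             # The stream URL and title are exposed through the player's MRSS feed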
1456         mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
1457         video_extension = 'mp4'
1458         webpage = self._download_webpage(mrss_url, video_id)
1459
1460         self.report_extraction(video_id)
1461
1462         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1463             webpage, u'video URL')
1464
1465         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1466             webpage, u'title')
1467
1468         return [{
1469             'id':       video_id,
1470             'url':      video_url,
1471             'ext':      video_extension,
1472             'title':    video_title,
1473         }]
1474
1475 class HowcastIE(InfoExtractor):
1476     """Information Extractor for Howcast.com"""
1477     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1478
1479     def _real_extract(self, url):
1480         mobj = re.match(self._VALID_URL, url)
1481
1482         video_id = mobj.group('id')
1483         webpage_url = 'http://www.howcast.com/videos/' + video_id
1484         webpage = self._download_webpage(webpage_url, video_id)
1485
1486         self.report_extraction(video_id)
1487
1488         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1489             webpage, u'video URL')
1490
1491         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1492             webpage, u'title')
1493
1494         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1495             webpage, u'description', fatal=False)
1496
1497         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1498             webpage, u'thumbnail', fatal=False)
1499
1500         return [{
1501             'id':       video_id,
1502             'url':      video_url,
1503             'ext':      'mp4',
1504             'title':    video_title,
1505             'description': video_description,
1506             'thumbnail': thumbnail,
1507         }]
1508
1509 class VineIE(InfoExtractor):
1510     """Information Extractor for Vine.co"""
1511     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1512
1513     def _real_extract(self, url):
1514         mobj = re.match(self._VALID_URL, url)
1515
1516         video_id = mobj.group('id')
1517         webpage_url = 'https://vine.co/v/' + video_id
1518         webpage = self._download_webpage(webpage_url, video_id)
1519
1520         self.report_extraction(video_id)
1521
1522         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1523             webpage, u'video URL')
1524
1525         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1526             webpage, u'title')
1527
1528         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1529             webpage, u'thumbnail', fatal=False)
1530
1531         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1532             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1533
1534         return [{
1535             'id':        video_id,
1536             'url':       video_url,
1537             'ext':       'mp4',
1538             'title':     video_title,
1539             'thumbnail': thumbnail,
1540             'uploader':  uploader,
1541         }]
1542
1543 class FlickrIE(InfoExtractor):
1544     """Information Extractor for Flickr videos"""
1545     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1546
1547     def _real_extract(self, url):
1548         mobj = re.match(self._VALID_URL, url)
1549
1550         video_id = mobj.group('id')
1551         video_uploader_id = mobj.group('uploader_id')
1552         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1553         webpage = self._download_webpage(webpage_url, video_id)
1554
1555         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1556
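             # Two requests are needed: the first XML document yields a node_id, which is then used to fetch the playlist XML containing the stream URL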
1557         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1558         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1559
1560         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1561             first_xml, u'node_id')
1562
1563         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1564         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1565
1566         self.report_extraction(video_id)
1567
1568         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1569         if mobj is None:
1570             raise ExtractorError(u'Unable to extract video url')
1571         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1572
1573         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1574             webpage, u'video title')
1575
1576         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1577             webpage, u'description', fatal=False)
1578
1579         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1580             webpage, u'thumbnail', fatal=False)
1581
1582         return [{
1583             'id':          video_id,
1584             'url':         video_url,
1585             'ext':         'mp4',
1586             'title':       video_title,
1587             'description': video_description,
1588             'thumbnail':   thumbnail,
1589             'uploader_id': video_uploader_id,
1590         }]
1591
1592 class TeamcocoIE(InfoExtractor):
1593     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1594
1595     def _real_extract(self, url):
1596         mobj = re.match(self._VALID_URL, url)
1597         if mobj is None:
1598             raise ExtractorError(u'Invalid URL: %s' % url)
1599         url_title = mobj.group('url_title')
1600         webpage = self._download_webpage(url, url_title)
1601
1602         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1603             webpage, u'video id')
1604
1605         self.report_extraction(video_id)
1606
1607         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1608             webpage, u'title')
1609
1610         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1611             webpage, u'thumbnail', fatal=False)
1612
1613         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1614             webpage, u'description', fatal=False)
1615
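             # The stream URLs are listed in a separate CVP XML document; the 'high' quality file is used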
1616         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1617         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1618
1619         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1620             data, u'video URL')
1621
1622         return [{
1623             'id':          video_id,
1624             'url':         video_url,
1625             'ext':         'mp4',
1626             'title':       video_title,
1627             'thumbnail':   thumbnail,
1628             'description': video_description,
1629         }]
1630
1631 class XHamsterIE(InfoExtractor):
1632     """Information Extractor for xHamster"""
1633     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1634
1635     def _real_extract(self,url):
1636         mobj = re.match(self._VALID_URL, url)
1637
1638         video_id = mobj.group('id')
1639         webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
1640         webpage = self._download_webpage(webpage_url, video_id)
1641
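             # The stream location comes from the 'srv' and 'file' page parameters; an empty 'srv' means 'file' already holds the (urlencoded) media URL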
1642         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1643         if mobj is None:
1644             raise ExtractorError(u'Unable to extract media URL')
1645         if len(mobj.group('server')) == 0:
1646             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1647         else:
1648             video_url = mobj.group('server')+'/key='+mobj.group('file')
1649         video_extension = video_url.split('.')[-1]
1650
1651         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1652             webpage, u'title')
1653
1654         # Can't see the description anywhere in the UI
1655         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1656         #     webpage, u'description', fatal=False)
1657         # if video_description: video_description = unescapeHTML(video_description)
1658
1659         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1660         if mobj:
1661             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1662         else:
1663             video_upload_date = None
1664             self._downloader.report_warning(u'Unable to extract upload date')
1665
1666         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1667             webpage, u'uploader id', default=u'anonymous')
1668
1669         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1670             webpage, u'thumbnail', fatal=False)
1671
1672         return [{
1673             'id':       video_id,
1674             'url':      video_url,
1675             'ext':      video_extension,
1676             'title':    video_title,
1677             # 'description': video_description,
1678             'upload_date': video_upload_date,
1679             'uploader_id': video_uploader_id,
1680             'thumbnail': video_thumbnail
1681         }]
1682
1683 class HypemIE(InfoExtractor):
1684     """Information Extractor for hypem"""
1685     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1686
1687     def _real_extract(self, url):
1688         mobj = re.match(self._VALID_URL, url)
1689         if mobj is None:
1690             raise ExtractorError(u'Invalid URL: %s' % url)
1691         track_id = mobj.group(1)
1692
1693         data = { 'ax': 1, 'ts': time.time() }
1694         data_encoded = compat_urllib_parse.urlencode(data)
1695         complete_url = url + "?" + data_encoded
1696         request = compat_urllib_request.Request(complete_url)
1697         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
1698         cookie = urlh.headers.get('Set-Cookie', '')
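             # Keep the Set-Cookie value; it is sent along with the later /serve/source request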
1699
1700         self.report_extraction(track_id)
1701
1702         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1703             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1704         try:
1705             track_list = json.loads(html_tracks)
1706             track = track_list[u'tracks'][0]
1707         except ValueError:
1708             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1709
1710         key = track[u"key"]
1711         track_id = track[u"id"]
1712         artist = track[u"artist"]
1713         title = track[u"song"]
1714
1715         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1716         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1717         request.add_header('cookie', cookie)
1718         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1719         try:
1720             song_data = json.loads(song_data_json)
1721         except ValueError:
1722             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1723         final_url = song_data[u"url"]
1724
1725         return [{
1726             'id':       track_id,
1727             'url':      final_url,
1728             'ext':      "mp3",
1729             'title':    title,
1730             'artist':   artist,
1731         }]
1732
1733 class Vbox7IE(InfoExtractor):
1734     """Information Extractor for Vbox7"""
1735     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1736
1737     def _real_extract(self,url):
1738         mobj = re.match(self._VALID_URL, url)
1739         if mobj is None:
1740             raise ExtractorError(u'Invalid URL: %s' % url)
1741         video_id = mobj.group(1)
1742
1743         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1744         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1745         redirect_url = urlh.geturl() + new_location
1746         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1747
1748         title = self._html_search_regex(r'<title>(.*)</title>',
1749             webpage, u'title').split('/')[0].strip()
1750
1751         ext = "flv"
1752         info_url = "http://vbox7.com/play/magare.do"
1753         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1754         info_request = compat_urllib_request.Request(info_url, data)
1755         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1756         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1757         if info_response is None:
1758             raise ExtractorError(u'Unable to extract the media url')
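             # The response is two key=value pairs joined by '&': the media URL followed by the thumbnail URL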
1759         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1760
1761         return [{
1762             'id':        video_id,
1763             'url':       final_url,
1764             'ext':       ext,
1765             'title':     title,
1766             'thumbnail': thumbnail_url,
1767         }]
1768
1769
1770 def gen_extractors():
1771     """ Return a list with an instance of every supported extractor.
1772     The order does matter; the first extractor matched is the one handling the URL.
1773     """
1774     return [
1775         YoutubePlaylistIE(),
1776         YoutubeChannelIE(),
1777         YoutubeUserIE(),
1778         YoutubeSearchIE(),
1779         YoutubeIE(),
1780         MetacafeIE(),
1781         DailymotionIE(),
1782         GoogleSearchIE(),
1783         PhotobucketIE(),
1784         YahooIE(),
1785         YahooSearchIE(),
1786         DepositFilesIE(),
1787         FacebookIE(),
1788         BlipTVIE(),
1789         BlipTVUserIE(),
1790         VimeoIE(),
1791         MyVideoIE(),
1792         ComedyCentralIE(),
1793         EscapistIE(),
1794         CollegeHumorIE(),
1795         XVideosIE(),
1796         SoundcloudSetIE(),
1797         SoundcloudIE(),
1798         InfoQIE(),
1799         MixcloudIE(),
1800         StanfordOpenClassroomIE(),
1801         MTVIE(),
1802         YoukuIE(),
1803         XNXXIE(),
1804         YouJizzIE(),
1805         PornotubeIE(),
1806         YouPornIE(),
1807         GooglePlusIE(),
1808         ArteTvIE(),
1809         NBAIE(),
1810         WorldStarHipHopIE(),
1811         JustinTVIE(),
1812         FunnyOrDieIE(),
1813         SteamIE(),
1814         UstreamIE(),
1815         RBMARadioIE(),
1816         EightTracksIE(),
1817         KeekIE(),
1818         TEDIE(),
1819         MySpassIE(),
1820         SpiegelIE(),
1821         LiveLeakIE(),
1822         ARDIE(),
1823         ZDFIE(),
1824         TumblrIE(),
1825         BandcampIE(),
1826         RedTubeIE(),
1827         InaIE(),
1828         HowcastIE(),
1829         VineIE(),
1830         FlickrIE(),
1831         TeamcocoIE(),
1832         XHamsterIE(),
1833         HypemIE(),
1834         Vbox7IE(),
1835         GametrailersIE(),
1836         StatigramIE(),
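             # GenericIE matches (almost) any URL, so it must stay last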
1837         GenericIE()
1838     ]
1839
1840 def get_info_extractor(ie_name):
1841     """Returns the info extractor class with the given ie_name"""
1842     return globals()[ie_name+'IE']