8fcd19ca2146ec648df075d8eb7565728dc4963b
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.metacafe import MetacafeIE
35 from .extractor.myvideo import MyVideoIE
36 from .extractor.statigram import StatigramIE
37 from .extractor.photobucket import PhotobucketIE
38 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
39 from .extractor.vimeo import VimeoIE
40 from .extractor.xvideos import XVideosIE
41 from .extractor.yahoo import YahooIE, YahooSearchIE
42 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
43 from .extractor.zdf import ZDFIE
44
45
46
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The page embeds a URL-quoted, base64-encoded RTMP path in the
        # 'jsclassref' javascript variable.
        encoded = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if encoded is None:
            raise ExtractorError(u'Unable to extract video url')
        decoded = base64.b64decode(encoded.group(1).encode('ascii')).decode('utf-8')
        real_id = compat_urllib_parse.unquote(decoded)
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # The last path component has the form "<id>.<ext>".
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
89
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        'best' (or an unknown/missing bitrate) selects the highest
        available bitrate; formats without bitrate info are returned
        directly.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                continue

        return None

    def _print_formats(self, formats):
        """Print every available (format, bitrate, extension) combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUG FIX: regex groups are already text (str); the original called
        # .decode('utf-8') on them, which raises AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON; urlopen() returns bytes, so decode explicitly
        # (json.loads only accepts bytes since Python 3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # BUG FIX: format_param was unbound when formats was empty.
        format_param = None
        if req_format is None or req_format == 'best':
            # Try every advertised format until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # BUG FIX: the original called .decode('utf-8') on these str values
        # (AttributeError on Python 3) and used a broken `and/or` ternary
        # that would also have crashed for a non-None format_param.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
194
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes, dispatched on the query parameters:
      * course + video present -> a single video
      * course only            -> a course page, recursed as a playlist
      * neither                -> the site root, recursed as a playlist
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # Title and the video file's relative path come from the
                # per-video metadata XML.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect each unique VideoPage link and recurse into it via
            # self.extract, flattening the results into one list.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Same pattern as the course branch, one level up: recurse into
            # every unique CoursePage link found on the home page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
290
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Download the video page, build the mediaGen metadata URL from the
        page's meta tags, and pick the highest-quality rendition."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # BUG FIX: the original built the info dict with an undefined name
        # `performer`, so extraction always died with NameError. The mtv_an
        # meta tag holds the performer/artist name; use it as the uploader
        # and combine it with the song name for the title.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')

        video_title = performer + ' - ' + song_name if song_name else performer

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # Both values are required to build the mediaGen URL below; fail
        # explicitly instead of concatenating None (TypeError).
        if mtvn_uri is None or content_id is None:
            raise ExtractorError(u'Unable to extract mtvn_uri or content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Avoid an opaque IndexError when the metadata has no renditions.
            raise ExtractorError(u'Unable to find any video renditions')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # rendition 'type' is a MIME type like video/mp4; its subtype is
            # used as the file extension.
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
351
352
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp followed
        by two random integers."""
        millis = int(time.time() * 1000)
        return "%d%d%d" % (millis,
                           random.randint(1000, 1998),
                           random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Deterministically shuffle the id alphabet using *seed* as the
        state of a linear-congruential generator."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        mixed = []
        while source:
            seed = (seed * 211 + 30031) % 65536
            pos = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(pos))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id: '*'-separated indices into the
        seed-shuffled alphabet."""
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            if requested in (None, 'best'):
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters at index 8-9 of the decoded file id encode the segment
        # number, so they are replaced per segment below.
        files_info = []
        for index, key in enumerate(keys):
            segment_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, segment_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
445
446
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player flashvars.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
485
486
487
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)
        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
521
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one page of the archive JSON API and return
        (number of clips in the response, info dicts for the clips that
        actually have a video file URL)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO 'YYYY-MM-DD...' -> upload_date 'YYYYMMDD'
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a whole channel archive, a single broadcast ('/b/'), or
        a chapter of a broadcast ('/c/'), depending on the URL shape."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: the API paginates results.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: after break, `a` holds the matching <archive>
            # element; the else branch runs only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
654
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player heading first, then fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
683
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = match.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age gate when the store asks for a birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Walk the per-movie javascript blobs, titles and thumbnails in
        # lockstep; they appear in the same order on the page.
        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        for movie, title_match, thumb_match in zip(re.finditer(movie_re, webpage),
                                                   re.finditer(title_re, webpage),
                                                   re.finditer(thumb_re, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_match.group('videoName')),
                'thumbnail': thumb_match.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
738
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv is served straight from the CDN, keyed by the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
770
class WorldStarHipHopIE(InfoExtractor):
    """Extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage, u'video URL')
        # The container is not advertised anywhere; infer it from the URL.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage)
            if candy_mobj is not None:
                video_title = candy_mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
810
class RBMARadioIE(InfoExtractor):
    """Extractor for rbmaradio.com shows (metadata is embedded JSON)."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)
        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the highest bitrate from the akamai stream.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
844
845
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of *formats* whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Pre-set the age verification cookie so the full page is served.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # BUG FIX: str + exception raised TypeError and masked the real
            # error; format the exception instead
            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # e.g. ['480p', '370k'] -> '480p-370k'
            format = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: previously tested the undefined name `result`, which
            # raised NameError instead of reporting the missing format
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
950
951
952
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the player setup.
        video_url = self._search_regex(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        upload_date = self._html_search_regex(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
987
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual stream URL is only present on the embed page.
        embed_mobj = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_mobj is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_mobj.group(0).strip()
        video_id = embed_mobj.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
1028
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes (walks the play API track by track)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (random) session token.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # Keep asking for the next track until the API says we are done.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
1069
class KeekIE(InfoExtractor):
    """Extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail live at predictable CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1097
class TEDIE(InfoExtractor):
    """Extractor for ted.com: handles both single talks and playlists."""
    # Verbose regex: dispatches on /playlists/<id>/... vs /talks/...;
    # an optional /lang/<code> segment may precede the final name.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on whether the URL matched the talk or playlist branch.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches each <li id="talk_..."> entry; paired with video_name_RE
        # below via zip, so both are expected to appear in the same order.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each entry is delegated back to this extractor as a talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last htmlStreams entry is the highest-quality stream available.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
1172
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: previously assigned the undefined name `ext`, raising
            # NameError whenever <format_id> was absent; fall back to the
            # file extension instead
            format = extension
        else:
            format = format_id_el.text
        # description and imagePreview are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
1226
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # The flash XML descriptor lists the available encodings; the last
        # entry is picked, as the original implementation did.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1258
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site name that LiveLeak prepends to every og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
1295
1296
1297
class TumblrIE(InfoExtractor):
    """Extractor for video posts on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with \x22-escaped quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
1331
class BandcampIE(InfoExtractor):
    """Extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # Only tracks exposing a free download page are supported.
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # The track dictionary is embedded in the page's javascript.
        items_json = re.search(r'items: (.*?),$',
                               download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # Rebuild the URL that bandcamp's download_bunde_*.js script would
        # request in order to obtain a working download link.
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key; the retry_url works just as well.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }]
1377
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
1405         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # All metadata is served as an MRSS feed by the player backend.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
1432
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild a canonical page URL from the id before downloading.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1466
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Rebuild a canonical https page URL from the id.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1500
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Guard against a non-matching URL instead of crashing on
            # mobj.group() below, consistent with the other extractors here.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo page embeds a per-video secret needed by the video APIs.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Two-step XML lookup: first fetch the playlist node id, then the
        # playlist itself, which carries the stream app/path pair.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # FULLPATH is HTML-escaped inside the XML attribute.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1549
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = m.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric video id sits on the <article> wrapper of the page.
        vid = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(vid)

        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        thumb = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)

        description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # A separate XML manifest lists the media files; pick the "high" one.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % vid,
            vid, 'Downloading data webpage')

        media_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          vid,
            'url':         media_url,
            'ext':         'mp4',
            'title':       title,
            'thumbnail':   thumb,
            'description': description,
        }]
1588
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # NOTE: the 'www.' dot was previously unescaped ('www.'), which made it
    # match any character; escape it so only real hostnames match.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Guard against a non-matching URL instead of crashing on
            # mobj.group() below, consistent with the other extractors here.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries either a server+file pair or a
        # URL-encoded direct file URL (empty 'srv').
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date is only exposed via a tooltip 'hint' attribute.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1640
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = m.group(1)

        # Fetch the track page with ax/ts query parameters and keep the
        # session cookie; the serve endpoint below requires it.
        params = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        page_request = compat_urllib_request.Request(url + "?" + params)
        response, urlh = self._download_webpage_handle(page_request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds the track list as JSON inside a <script> element.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # The serve endpoint returns JSON holding the final media URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        serve_request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        serve_request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(serve_request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1690
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        vid = m.group(1)

        # The play page bounces through a JavaScript location assignment;
        # follow the redirect manually to reach the real video page.
        landing, handle = self._download_webpage_handle(url, vid)
        location = self._search_regex(r'window\.location = \'(.*)\';', landing, u'redirect location')
        page = self._download_webpage(handle.geturl() + location, vid, u'Downloading redirect page')

        # The <title> is "<name>/<site>"; keep only the video name part.
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            page, u'title').split('/')[0].strip()

        # POST to the flash info endpoint, which answers with
        # URL-encoded fields carrying the media and thumbnail URLs.
        form_data = compat_urllib_parse.urlencode({'as3': '1', 'vid': vid})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", form_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, vid, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        vid,
            'url':       final_url,
            'ext':       "flv",
            'title':     video_title,
            'thumbnail': thumbnail_url,
        }]
1726
1727
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordered list of classes; instantiation happens below.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
1797
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Look up e.g. 'Youtube' -> YoutubeIE in this module's namespace;
    # raises KeyError for an unknown extractor name.
    return globals()['%sIE' % ie_name]