_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.collegehumor import CollegeHumorIE
  26 from .extractor.dailymotion import DailymotionIE
  27 from .extractor.depositfiles import DepositFilesIE
  28 from .extractor.escapist import EscapistIE
  29 from .extractor.facebook import FacebookIE
  30 from .extractor.gametrailers import GametrailersIE
  31 from .extractor.generic import GenericIE
  32 from .extractor.googleplus import GooglePlusIE
  33 from .extractor.googlesearch import GoogleSearchIE
  34 from .extractor.infoq import InfoQIE
  35 from .extractor.metacafe import MetacafeIE
  36 from .extractor.mixcloud import MixcloudIE
  37 from .extractor.mtv import MTVIE
  38 from .extractor.myvideo import MyVideoIE
  39 from .extractor.nba import NBAIE
  40 from .extractor.statigram import StatigramIE
  41 from .extractor.photobucket import PhotobucketIE
  42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  43 from .extractor.stanfordoc import StanfordOpenClassroomIE
  44 from .extractor.ted import TEDIE
  45 from .extractor.vimeo import VimeoIE
  46 from .extractor.xvideos import XVideosIE
  47 from .extractor.yahoo import YahooIE, YahooSearchIE
  48 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  49 from .extractor.zdf import ZDFIE
  50
  51
  52
  53
  54
  55
  56
  57 class YoukuIE(InfoExtractor):
  58     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
  59
  60     def _gen_sid(self):
  61         nowTime = int(time.time() * 1000)
  62         random1 = random.randint(1000,1998)
  63         random2 = random.randint(1000,9999)
  64
  65         return "%d%d%d" %(nowTime,random1,random2)
  66
  67     def _get_file_ID_mix_string(self, seed):
  68         mixed = []
  69         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
  70         seed = float(seed)
  71         for i in range(len(source)):
  72             seed  =  (seed * 211 + 30031 ) % 65536
  73             index  =  math.floor(seed / 65536 * len(source) )
  74             mixed.append(source[int(index)])
  75             source.remove(source[int(index)])
  76         #return ''.join(mixed)
  77         return mixed
  78
  79     def _get_file_id(self, fileId, seed):
  80         mixed = self._get_file_ID_mix_string(seed)
  81         ids = fileId.split('*')
  82         realId = []
  83         for ch in ids:
  84             if ch:
  85                 realId.append(mixed[int(ch)])
  86         return ''.join(realId)
  87
  88     def _real_extract(self, url):
  89         mobj = re.match(self._VALID_URL, url)
  90         if mobj is None:
  91             raise ExtractorError(u'Invalid URL: %s' % url)
  92         video_id = mobj.group('ID')
  93
  94         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
  95
  96         jsondata = self._download_webpage(info_url, video_id)
  97
  98         self.report_extraction(video_id)
  99         try:
 100             config = json.loads(jsondata)
 101
 102             video_title =  config['data'][0]['title']
 103             seed = config['data'][0]['seed']
 104
 105             format = self._downloader.params.get('format', None)
 106             supported_format = list(config['data'][0]['streamfileids'].keys())
 107
 108             if format is None or format == 'best':
 109                 if 'hd2' in supported_format:
 110                     format = 'hd2'
 111                 else:
 112                     format = 'flv'
 113                 ext = u'flv'
 114             elif format == 'worst':
 115                 format = 'mp4'
 116                 ext = u'mp4'
 117             else:
 118                 format = 'flv'
 119                 ext = u'flv'
 120
 121
 122             fileid = config['data'][0]['streamfileids'][format]
 123             keys = [s['k'] for s in config['data'][0]['segs'][format]]
 124         except (UnicodeDecodeError, ValueError, KeyError):
 125             raise ExtractorError(u'Unable to extract info section')
 126
 127         files_info=[]
 128         sid = self._gen_sid()
 129         fileid = self._get_file_id(fileid, seed)
 130
 131         #column 8,9 of fileid represent the segment number
 132         #fileid[7:9] should be changed
 133         for index, key in enumerate(keys):
 134
 135             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
 136             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
 137
 138             info = {
 139                 'id': '%s_part%02d' % (video_id, index),
 140                 'url': download_url,
 141                 'uploader': None,
 142                 'upload_date': None,
 143                 'title': video_title,
 144                 'ext': ext,
 145             }
 146             files_info.append(info)
 147
 148         return files_info
 149
 150
 151 class XNXXIE(InfoExtractor):
 152     """Information extractor for xnxx.com"""
 153
 154     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
 155     IE_NAME = u'xnxx'
 156     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
 157     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
 158     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
 159
 160     def _real_extract(self, url):
 161         mobj = re.match(self._VALID_URL, url)
 162         if mobj is None:
 163             raise ExtractorError(u'Invalid URL: %s' % url)
 164         video_id = mobj.group(1)
 165
 166         # Get webpage content
 167         webpage = self._download_webpage(url, video_id)
 168
 169         video_url = self._search_regex(self.VIDEO_URL_RE,
 170             webpage, u'video URL')
 171         video_url = compat_urllib_parse.unquote(video_url)
 172
 173         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
 174             webpage, u'title')
 175
 176         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
 177             webpage, u'thumbnail', fatal=False)
 178
 179         return [{
 180             'id': video_id,
 181             'url': video_url,
 182             'uploader': None,
 183             'upload_date': None,
 184             'title': video_title,
 185             'ext': 'flv',
 186             'thumbnail': video_thumbnail,
 187             'description': None,
 188         }]
 189
 190
 191
 192
 193 class JustinTVIE(InfoExtractor):
 194     """Information extractor for justin.tv and twitch.tv"""
 195     # TODO: One broadcast may be split into multiple videos. The key
 196     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
 197     # starts at 1 and increases. Can we treat all parts as one video?
 198
 199     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
 200         (?:
 201             (?P<channelid>[^/]+)|
 202             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
 203             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
 204         )
 205         /?(?:\#.*)?$
 206         """
 207     _JUSTIN_PAGE_LIMIT = 100
 208     IE_NAME = u'justin.tv'
 209
 210     def report_download_page(self, channel, offset):
 211         """Report attempt to download a single page of videos."""
 212         self.to_screen(u'%s: Downloading video information from %d to %d' %
 213                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
 214
 215     # Return count of items, list of *valid* items
 216     def _parse_page(self, url, video_id):
 217         webpage = self._download_webpage(url, video_id,
 218                                          u'Downloading video info JSON',
 219                                          u'unable to download video info JSON')
 220
 221         response = json.loads(webpage)
 222         if type(response) != list:
 223             error_text = response.get('error', 'unknown error')
 224             raise ExtractorError(u'Justin.tv API: %s' % error_text)
 225         info = []
 226         for clip in response:
 227             video_url = clip['video_file_url']
 228             if video_url:
 229                 video_extension = os.path.splitext(video_url)[1][1:]
 230                 video_date = re.sub('-', '', clip['start_time'][:10])
 231                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
 232                 video_id = clip['id']
 233                 video_title = clip.get('title', video_id)
 234                 info.append({
 235                     'id': video_id,
 236                     'url': video_url,
 237                     'title': video_title,
 238                     'uploader': clip.get('channel_name', video_uploader_id),
 239                     'uploader_id': video_uploader_id,
 240                     'upload_date': video_date,
 241                     'ext': video_extension,
 242                 })
 243         return (len(response), info)
 244
 245     def _real_extract(self, url):
 246         mobj = re.match(self._VALID_URL, url)
 247         if mobj is None:
 248             raise ExtractorError(u'invalid URL: %s' % url)
 249
 250         api_base = 'http://api.justin.tv'
 251         paged = False
 252         if mobj.group('channelid'):
 253             paged = True
 254             video_id = mobj.group('channelid')
 255             api = api_base + '/channel/archives/%s.json' % video_id
 256         elif mobj.group('chapterid'):
 257             chapter_id = mobj.group('chapterid')
 258
 259             webpage = self._download_webpage(url, chapter_id)
 260             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
 261             if not m:
 262                 raise ExtractorError(u'Cannot find archive of a chapter')
 263             archive_id = m.group(1)
 264
 265             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
 266             chapter_info_xml = self._download_webpage(api, chapter_id,
 267                                              note=u'Downloading chapter information',
 268                                              errnote=u'Chapter information download failed')
 269             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
 270             for a in doc.findall('.//archive'):
 271                 if archive_id == a.find('./id').text:
 272                     break
 273             else:
 274                 raise ExtractorError(u'Could not find chapter in chapter information')
 275
 276             video_url = a.find('./video_file_url').text
 277             video_ext = video_url.rpartition('.')[2] or u'flv'
 278
 279             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
 280             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
 281                                    note='Downloading chapter metadata',
 282                                    errnote='Download of chapter metadata failed')
 283             chapter_info = json.loads(chapter_info_json)
 284
 285             bracket_start = int(doc.find('.//bracket_start').text)
 286             bracket_end = int(doc.find('.//bracket_end').text)
 287
 288             # TODO determine start (and probably fix up file)
 289             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 290             #video_url += u'?start=' + TODO:start_timestamp
 291             # bracket_start is 13290, but we want 51670615
 292             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
 293                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 294
 295             info = {
 296                 'id': u'c' + chapter_id,
 297                 'url': video_url,
 298                 'ext': video_ext,
 299                 'title': chapter_info['title'],
 300                 'thumbnail': chapter_info['preview'],
 301                 'description': chapter_info['description'],
 302                 'uploader': chapter_info['channel']['display_name'],
 303                 'uploader_id': chapter_info['channel']['name'],
 304             }
 305             return [info]
 306         else:
 307             video_id = mobj.group('videoid')
 308             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 309
 310         self.report_extraction(video_id)
 311
 312         info = []
 313         offset = 0
 314         limit = self._JUSTIN_PAGE_LIMIT
 315         while True:
 316             if paged:
 317                 self.report_download_page(video_id, offset)
 318             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
 319             page_count, page_info = self._parse_page(page_url, video_id)
 320             info.extend(page_info)
 321             if not paged or page_count != limit:
 322                 break
 323             offset += limit
 324         return info
 325
 326 class FunnyOrDieIE(InfoExtractor):
 327     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
 328
 329     def _real_extract(self, url):
 330         mobj = re.match(self._VALID_URL, url)
 331         if mobj is None:
 332             raise ExtractorError(u'invalid URL: %s' % url)
 333
 334         video_id = mobj.group('id')
 335         webpage = self._download_webpage(url, video_id)
 336
 337         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
 338             webpage, u'video URL', flags=re.DOTALL)
 339
 340         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
 341             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
 342
 343         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 344             webpage, u'description', fatal=False, flags=re.DOTALL)
 345
 346         info = {
 347             'id': video_id,
 348             'url': video_url,
 349             'ext': 'mp4',
 350             'title': title,
 351             'description': video_description,
 352         }
 353         return [info]
 354
 355 class SteamIE(InfoExtractor):
 356     _VALID_URL = r"""http://store\.steampowered\.com/
 357                 (agecheck/)?
 358                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
 359                 (?P<gameID>\d+)/?
 360                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
 361                 """
 362     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
 363     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
 364
 365     @classmethod
 366     def suitable(cls, url):
 367         """Receives a URL and returns True if suitable for this IE."""
 368         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 369
 370     def _real_extract(self, url):
 371         m = re.match(self._VALID_URL, url, re.VERBOSE)
 372         gameID = m.group('gameID')
 373
 374         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
 375         webpage = self._download_webpage(videourl, gameID)
 376
 377         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
 378             videourl = self._AGECHECK_TEMPLATE % gameID
 379             self.report_age_confirmation()
 380             webpage = self._download_webpage(videourl, gameID)
 381
 382         self.report_extraction(gameID)
 383         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
 384                                              webpage, 'game title')
 385
 386         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
 387         mweb = re.finditer(urlRE, webpage)
 388         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
 389         titles = re.finditer(namesRE, webpage)
 390         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
 391         thumbs = re.finditer(thumbsRE, webpage)
 392         videos = []
 393         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
 394             video_id = vid.group('videoID')
 395             title = vtitle.group('videoName')
 396             video_url = vid.group('videoURL')
 397             video_thumb = thumb.group('thumbnail')
 398             if not video_url:
 399                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
 400             info = {
 401                 'id':video_id,
 402                 'url':video_url,
 403                 'ext': 'flv',
 404                 'title': unescapeHTML(title),
 405                 'thumbnail': video_thumb
 406                   }
 407             videos.append(info)
 408         return [self.playlist_result(videos, gameID, game_title)]
 409
 410 class UstreamIE(InfoExtractor):
 411     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
 412     IE_NAME = u'ustream'
 413
 414     def _real_extract(self, url):
 415         m = re.match(self._VALID_URL, url)
 416         video_id = m.group('videoID')
 417
 418         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
 419         webpage = self._download_webpage(url, video_id)
 420
 421         self.report_extraction(video_id)
 422
 423         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
 424             webpage, u'title')
 425
 426         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
 427             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 428
 429         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
 430             webpage, u'thumbnail', fatal=False)
 431
 432         info = {
 433                 'id': video_id,
 434                 'url': video_url,
 435                 'ext': 'flv',
 436                 'title': video_title,
 437                 'uploader': uploader,
 438                 'thumbnail': thumbnail,
 439                }
 440         return info
 441
 442 class WorldStarHipHopIE(InfoExtractor):
 443     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
 444     IE_NAME = u'WorldStarHipHop'
 445
 446     def _real_extract(self, url):
 447         m = re.match(self._VALID_URL, url)
 448         video_id = m.group('id')
 449
 450         webpage_src = self._download_webpage(url, video_id)
 451
 452         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
 453             webpage_src, u'video URL')
 454
 455         if 'mp4' in video_url:
 456             ext = 'mp4'
 457         else:
 458             ext = 'flv'
 459
 460         video_title = self._html_search_regex(r"<title>(.*)</title>",
 461             webpage_src, u'title')
 462
 463         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
 464         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
 465             webpage_src, u'thumbnail', fatal=False)
 466
 467         if not thumbnail:
 468             _title = r"""candytitles.*>(.*)</span>"""
 469             mobj = re.search(_title, webpage_src)
 470             if mobj is not None:
 471                 video_title = mobj.group(1)
 472
 473         results = [{
 474                     'id': video_id,
 475                     'url' : video_url,
 476                     'title' : video_title,
 477                     'thumbnail' : thumbnail,
 478                     'ext' : ext,
 479                     }]
 480         return results
 481
 482 class RBMARadioIE(InfoExtractor):
 483     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 484
 485     def _real_extract(self, url):
 486         m = re.match(self._VALID_URL, url)
 487         video_id = m.group('videoID')
 488
 489         webpage = self._download_webpage(url, video_id)
 490
 491         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
 492             webpage, u'json data', flags=re.MULTILINE)
 493
 494         try:
 495             data = json.loads(json_data)
 496         except ValueError as e:
 497             raise ExtractorError(u'Invalid JSON: ' + str(e))
 498
 499         video_url = data['akamai_url'] + '&cbr=256'
 500         url_parts = compat_urllib_parse_urlparse(video_url)
 501         video_ext = url_parts.path.rpartition('.')[2]
 502         info = {
 503                 'id': video_id,
 504                 'url': video_url,
 505                 'ext': video_ext,
 506                 'title': data['title'],
 507                 'description': data.get('teaser_text'),
 508                 'location': data.get('country_of_origin'),
 509                 'uploader': data.get('host', {}).get('name'),
 510                 'uploader_id': data.get('host', {}).get('slug'),
 511                 'thumbnail': data.get('image', {}).get('large_url_2x'),
 512                 'duration': data.get('duration'),
 513         }
 514         return [info]
 515
 516
 517 class YouPornIE(InfoExtractor):
 518     """Information extractor for youporn.com."""
 519     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
 520
 521     def _print_formats(self, formats):
 522         """Print all available formats"""
 523         print(u'Available formats:')
 524         print(u'ext\t\tformat')
 525         print(u'---------------------------------')
 526         for format in formats:
 527             print(u'%s\t\t%s'  % (format['ext'], format['format']))
 528
 529     def _specific(self, req_format, formats):
 530         for x in formats:
 531             if(x["format"]==req_format):
 532                 return x
 533         return None
 534
 535     def _real_extract(self, url):
 536         mobj = re.match(self._VALID_URL, url)
 537         if mobj is None:
 538             raise ExtractorError(u'Invalid URL: %s' % url)
 539         video_id = mobj.group('videoid')
 540
 541         req = compat_urllib_request.Request(url)
 542         req.add_header('Cookie', 'age_verified=1')
 543         webpage = self._download_webpage(req, video_id)
 544
 545         # Get JSON parameters
 546         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
 547         try:
 548             params = json.loads(json_params)
 549         except:
 550             raise ExtractorError(u'Invalid JSON')
 551
 552         self.report_extraction(video_id)
 553         try:
 554             video_title = params['title']
 555             upload_date = unified_strdate(params['release_date_f'])
 556             video_description = params['description']
 557             video_uploader = params['submitted_by']
 558             thumbnail = params['thumbnails'][0]['image']
 559         except KeyError:
 560             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
 561
 562         # Get all of the formats available
 563         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
 564         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
 565             webpage, u'download list').strip()
 566
 567         # Get all of the links from the page
 568         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
 569         links = re.findall(LINK_RE, download_list_html)
 570         if(len(links) == 0):
 571             raise ExtractorError(u'ERROR: no known formats available for video')
 572
 573         self.to_screen(u'Links found: %d' % len(links))
 574
 575         formats = []
 576         for link in links:
 577
 578             # A link looks like this:
 579             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
 580             # A path looks like this:
 581             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
 582             video_url = unescapeHTML( link )
 583             path = compat_urllib_parse_urlparse( video_url ).path
 584             extension = os.path.splitext( path )[1][1:]
 585             format = path.split('/')[4].split('_')[:2]
 586             size = format[0]
 587             bitrate = format[1]
 588             format = "-".join( format )
 589             # title = u'%s-%s-%s' % (video_title, size, bitrate)
 590
 591             formats.append({
 592                 'id': video_id,
 593                 'url': video_url,
 594                 'uploader': video_uploader,
 595                 'upload_date': upload_date,
 596                 'title': video_title,
 597                 'ext': extension,
 598                 'format': format,
 599                 'thumbnail': thumbnail,
 600                 'description': video_description
 601             })
 602
 603         if self._downloader.params.get('listformats', None):
 604             self._print_formats(formats)
 605             return
 606
 607         req_format = self._downloader.params.get('format', None)
 608         self.to_screen(u'Format: %s' % req_format)
 609
 610         if req_format is None or req_format == 'best':
 611             return [formats[0]]
 612         elif req_format == 'worst':
 613             return [formats[-1]]
 614         elif req_format in ('-1', 'all'):
 615             return formats
 616         else:
 617             format = self._specific( req_format, formats )
 618             if result is None:
 619                 raise ExtractorError(u'Requested format not available')
 620             return [format]
 621
 622
 623
 624 class PornotubeIE(InfoExtractor):
 625     """Information extractor for pornotube.com."""
 626     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
 627
 628     def _real_extract(self, url):
 629         mobj = re.match(self._VALID_URL, url)
 630         if mobj is None:
 631             raise ExtractorError(u'Invalid URL: %s' % url)
 632
 633         video_id = mobj.group('videoid')
 634         video_title = mobj.group('title')
 635
 636         # Get webpage content
 637         webpage = self._download_webpage(url, video_id)
 638
 639         # Get the video URL
 640         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
 641         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
 642         video_url = compat_urllib_parse.unquote(video_url)
 643
 644         #Get the uploaded date
 645         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
 646         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
 647         if upload_date: upload_date = unified_strdate(upload_date)
 648
 649         info = {'id': video_id,
 650                 'url': video_url,
 651                 'uploader': None,
 652                 'upload_date': upload_date,
 653                 'title': video_title,
 654                 'ext': 'flv',
 655                 'format': 'flv'}
 656
 657         return [info]
 658
 659 class YouJizzIE(InfoExtractor):
 660     """Information extractor for youjizz.com."""
 661     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
 662
 663     def _real_extract(self, url):
 664         mobj = re.match(self._VALID_URL, url)
 665         if mobj is None:
 666             raise ExtractorError(u'Invalid URL: %s' % url)
 667
 668         video_id = mobj.group('videoid')
 669
 670         # Get webpage content
 671         webpage = self._download_webpage(url, video_id)
 672
 673         # Get the video title
 674         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
 675             webpage, u'title').strip()
 676
 677         # Get the embed page
 678         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
 679         if result is None:
 680             raise ExtractorError(u'ERROR: unable to extract embed page')
 681
 682         embed_page_url = result.group(0).strip()
 683         video_id = result.group('videoid')
 684
 685         webpage = self._download_webpage(embed_page_url, video_id)
 686
 687         # Get the video URL
 688         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
 689             webpage, u'video URL')
 690
 691         info = {'id': video_id,
 692                 'url': video_url,
 693                 'title': video_title,
 694                 'ext': 'flv',
 695                 'format': 'flv',
 696                 'player_url': embed_page_url}
 697
 698         return [info]
 699
 700 class EightTracksIE(InfoExtractor):
 701     IE_NAME = '8tracks'
 702     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
 703
 704     def _real_extract(self, url):
 705         mobj = re.match(self._VALID_URL, url)
 706         if mobj is None:
 707             raise ExtractorError(u'Invalid URL: %s' % url)
 708         playlist_id = mobj.group('id')
 709
 710         webpage = self._download_webpage(url, playlist_id)
 711
 712         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
 713         data = json.loads(json_like)
 714
 715         session = str(random.randint(0, 1000000000))
 716         mix_id = data['id']
 717         track_count = data['tracks_count']
 718         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
 719         next_url = first_url
 720         res = []
 721         for i in itertools.count():
 722             api_json = self._download_webpage(next_url, playlist_id,
 723                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
 724                 errnote=u'Failed to download song information')
 725             api_data = json.loads(api_json)
 726             track_data = api_data[u'set']['track']
 727             info = {
 728                 'id': track_data['id'],
 729                 'url': track_data['track_file_stream_url'],
 730                 'title': track_data['performer'] + u' - ' + track_data['name'],
 731                 'raw_title': track_data['name'],
 732                 'uploader_id': data['user']['login'],
 733                 'ext': 'm4a',
 734             }
 735             res.append(info)
 736             if api_data['set']['at_last_track']:
 737                 break
 738             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
 739         return res
 740
 741 class KeekIE(InfoExtractor):
 742     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
 743     IE_NAME = u'keek'
 744
 745     def _real_extract(self, url):
 746         m = re.match(self._VALID_URL, url)
 747         video_id = m.group('videoID')
 748
 749         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
 750         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
 751         webpage = self._download_webpage(url, video_id)
 752
 753         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 754             webpage, u'title')
 755
 756         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
 757             webpage, u'uploader', fatal=False)
 758
 759         info = {
 760                 'id': video_id,
 761                 'url': video_url,
 762                 'ext': 'mp4',
 763                 'title': video_title,
 764                 'thumbnail': thumbnail,
 765                 'uploader': uploader
 766         }
 767         return [info]
 768
 769
 770 class MySpassIE(InfoExtractor):
 771     _VALID_URL = r'http://www.myspass.de/.*'
 772
 773     def _real_extract(self, url):
 774         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
 775
 776         # video id is the last path element of the URL
 777         # usually there is a trailing slash, so also try the second but last
 778         url_path = compat_urllib_parse_urlparse(url).path
 779         url_parent_path, video_id = os.path.split(url_path)
 780         if not video_id:
 781             _, video_id = os.path.split(url_parent_path)
 782
 783         # get metadata
 784         metadata_url = META_DATA_URL_TEMPLATE % video_id
 785         metadata_text = self._download_webpage(metadata_url, video_id)
 786         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
 787
 788         # extract values from metadata
 789         url_flv_el = metadata.find('url_flv')
 790         if url_flv_el is None:
 791             raise ExtractorError(u'Unable to extract download url')
 792         video_url = url_flv_el.text
 793         extension = os.path.splitext(video_url)[1][1:]
 794         title_el = metadata.find('title')
 795         if title_el is None:
 796             raise ExtractorError(u'Unable to extract title')
 797         title = title_el.text
 798         format_id_el = metadata.find('format_id')
 799         if format_id_el is None:
 800             format = ext
 801         else:
 802             format = format_id_el.text
 803         description_el = metadata.find('description')
 804         if description_el is not None:
 805             description = description_el.text
 806         else:
 807             description = None
 808         imagePreview_el = metadata.find('imagePreview')
 809         if imagePreview_el is not None:
 810             thumbnail = imagePreview_el.text
 811         else:
 812             thumbnail = None
 813         info = {
 814             'id': video_id,
 815             'url': video_url,
 816             'title': title,
 817             'ext': extension,
 818             'format': format,
 819             'thumbnail': thumbnail,
 820             'description': description
 821         }
 822         return [info]
 823
 824 class SpiegelIE(InfoExtractor):
 825     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 826
 827     def _real_extract(self, url):
 828         m = re.match(self._VALID_URL, url)
 829         video_id = m.group('videoID')
 830
 831         webpage = self._download_webpage(url, video_id)
 832
 833         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
 834             webpage, u'title')
 835
 836         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
 837         xml_code = self._download_webpage(xml_url, video_id,
 838                     note=u'Downloading XML', errnote=u'Failed to download XML')
 839
 840         idoc = xml.etree.ElementTree.fromstring(xml_code)
 841         last_type = idoc[-1]
 842         filename = last_type.findall('./filename')[0].text
 843         duration = float(last_type.findall('./duration')[0].text)
 844
 845         video_url = 'http://video2.spiegel.de/flash/' + filename
 846         video_ext = filename.rpartition('.')[2]
 847         info = {
 848             'id': video_id,
 849             'url': video_url,
 850             'ext': video_ext,
 851             'title': video_title,
 852             'duration': duration,
 853         }
 854         return [info]
 855
 856 class LiveLeakIE(InfoExtractor):
 857
 858     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
 859     IE_NAME = u'liveleak'
 860
 861     def _real_extract(self, url):
 862         mobj = re.match(self._VALID_URL, url)
 863         if mobj is None:
 864             raise ExtractorError(u'Invalid URL: %s' % url)
 865
 866         video_id = mobj.group('video_id')
 867
 868         webpage = self._download_webpage(url, video_id)
 869
 870         video_url = self._search_regex(r'file: "(.*?)",',
 871             webpage, u'video URL')
 872
 873         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
 874             webpage, u'title').replace('LiveLeak.com -', '').strip()
 875
 876         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 877             webpage, u'description', fatal=False)
 878
 879         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
 880             webpage, u'uploader', fatal=False)
 881
 882         info = {
 883             'id':  video_id,
 884             'url': video_url,
 885             'ext': 'mp4',
 886             'title': video_title,
 887             'description': video_description,
 888             'uploader': video_uploader
 889         }
 890
 891         return [info]
 892
 893
 894
 895 class TumblrIE(InfoExtractor):
 896     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 897
 898     def _real_extract(self, url):
 899         m_url = re.match(self._VALID_URL, url)
 900         video_id = m_url.group('id')
 901         blog = m_url.group('blog_name')
 902
 903         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
 904         webpage = self._download_webpage(url, video_id)
 905
 906         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
 907         video = re.search(re_video, webpage)
 908         if video is None:
 909            raise ExtractorError(u'Unable to extract video')
 910         video_url = video.group('video_url')
 911         ext = video.group('ext')
 912
 913         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
 914             webpage, u'thumbnail', fatal=False)  # We pick the first poster
 915         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
 916
 917         # The only place where you can get a title, it's not complete,
 918         # but searching in other places doesn't work for all videos
 919         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
 920             webpage, u'title', flags=re.DOTALL)
 921
 922         return [{'id': video_id,
 923                  'url': video_url,
 924                  'title': video_title,
 925                  'thumbnail': video_thumbnail,
 926                  'ext': ext
 927                  }]
 928
 929 class BandcampIE(InfoExtractor):
 930     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
 931
 932     def _real_extract(self, url):
 933         mobj = re.match(self._VALID_URL, url)
 934         title = mobj.group('title')
 935         webpage = self._download_webpage(url, title)
 936         # We get the link to the free download page
 937         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
 938         if m_download is None:
 939             raise ExtractorError(u'No free songs found')
 940
 941         download_link = m_download.group(1)
 942         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
 943                        webpage, re.MULTILINE|re.DOTALL).group('id')
 944
 945         download_webpage = self._download_webpage(download_link, id,
 946                                                   'Downloading free downloads page')
 947         # We get the dictionary of the track from some javascrip code
 948         info = re.search(r'items: (.*?),$',
 949                          download_webpage, re.MULTILINE).group(1)
 950         info = json.loads(info)[0]
 951         # We pick mp3-320 for now, until format selection can be easily implemented.
 952         mp3_info = info[u'downloads'][u'mp3-320']
 953         # If we try to use this url it says the link has expired
 954         initial_url = mp3_info[u'url']
 955         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
 956         m_url = re.match(re_url, initial_url)
 957         #We build the url we will use to get the final track url
 958         # This url is build in Bandcamp in the script download_bunde_*.js
 959         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
 960         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
 961         # If we could correctly generate the .rand field the url would be
 962         #in the "download_url" key
 963         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
 964
 965         track_info = {'id':id,
 966                       'title' : info[u'title'],
 967                       'ext' :   'mp3',
 968                       'url' :   final_url,
 969                       'thumbnail' : info[u'thumb_url'],
 970                       'uploader' :  info[u'artist']
 971                       }
 972
 973         return [track_info]
 974
 975 class RedTubeIE(InfoExtractor):
 976     """Information Extractor for redtube"""
 977     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
 978
 979     def _real_extract(self,url):
 980         mobj = re.match(self._VALID_URL, url)
 981         if mobj is None:
 982             raise ExtractorError(u'Invalid URL: %s' % url)
 983
 984         video_id = mobj.group('id')
 985         video_extension = 'mp4'
 986         webpage = self._download_webpage(url, video_id)
 987
 988         self.report_extraction(video_id)
 989
 990         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
 991             webpage, u'video URL')
 992
 993         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
 994             webpage, u'title')
 995
 996         return [{
 997             'id':       video_id,
 998             'url':      video_url,
 999             'ext':      video_extension,
1000             'title':    video_title,
1001         }]
1002
class InaIE(InfoExtractor):
    """Information Extractor for ina.fr video pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at `url`.

        Both the media address and the title are read from the MRSS
        document of the notice, not from the video page itself.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        mrss = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            mrss, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            mrss, u'title')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }]
1029
class HowcastIE(InfoExtractor):
    """Information Extractor for howcast.com video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at `url`.

        The page is always requested through the ``www.howcast.com/videos/<id>``
        form of the address.  Description and thumbnail are looked up with
        fatal=False, i.e. as optional fields.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta content may be wrapped in either kind of quote.
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
1063
class VineIE(InfoExtractor):
    """Information Extractor for vine.co video pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the vine at `url`.

        The page is always requested through the ``https://vine.co/v/<id>``
        form of the address.  Thumbnail and uploader are looked up with
        fatal=False, i.e. as optional fields.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        # The pattern leaves any query string of the image address out of
        # the first group.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
1097
class FlickrIE(InfoExtractor):
    """Information Extractor for videos on flickr.com photo pages."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at `url`.

        Three documents are fetched in turn: the photo page (for the photo
        secret and the metadata), a first XML document (for the node id)
        and a playlist XML document (for the stream location).

        Raises ExtractorError when the playlist contains no STREAM entry.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        video_uploader_id = url_match.group('uploader_id')

        webpage = self._download_webpage(
            'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id,
            video_id)
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self',
            video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1',
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The path part is HTML-escaped in the playlist; the app part is used as is.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
1146
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at `url`.

        The numeric video id is read from the page and then used to fetch
        the XML document that holds the media address.

        Raises ExtractorError when `url` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = url_match.group('url_title')

        # Until the numeric id is known, the title part of the URL stands in for it.
        webpage = self._download_webpage(url, url_title)
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        # Only the file entry typed "high" is used.
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
1185
class XHamsterIE(InfoExtractor):
    """Information Extractor for xhamster.com movie pages."""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the movie at `url`.

        Raises ExtractorError when the page carries no ``srv``/``file`` pair.
        A missing upload date only produces a warning; a missing uploader
        falls back to ``anonymous``; the thumbnail is optional.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            'http://xhamster.com/movies/%s/.html' % video_id, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = media.group('server')
        media_file = media.group('file')
        if server:
            video_url = server+'/key='+media_file
        else:
            # With an empty server entry the file entry is itself the
            # (percent-encoded) address.
            video_url = compat_urllib_parse.unquote(media_file)
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it can't be seen anywhere in the UI.

        date_match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_match is None:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        else:
            # Year, month and day are joined into one YYYYMMDD string.
            video_upload_date = (date_match.group('upload_date_Y')
                                 + date_match.group('upload_date_m')
                                 + date_match.group('upload_date_d'))

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1237
class HypemIE(InfoExtractor):
    """Information Extractor for hypem.com track pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at `url`.

        The track page is requested first; the value of its ``Set-Cookie``
        header is then sent along with the request to the ``serve/source``
        endpoint, whose JSON answer holds the media address.

        Raises ExtractorError when `url` does not match ``_VALID_URL`` or
        when either JSON document cannot be parsed.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        query = compat_urllib_parse.urlencode({ 'ax': 1, 'ts': time.time() })
        page_request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            # Only the first entry of the embedded track list is used.
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported by the page replaces the one from the URL.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_request = compat_urllib_request.Request(
            "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key)),
            "", {'Content-Type': 'application/json'})
        serve_request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(
            serve_request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id': track_id,
            'url': song_data[u"url"],
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
1287
class Vbox7IE(InfoExtractor):
    """Information Extractor for vbox7.com ``play:<id>`` pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at `url`.

        The first page only carries a script redirect, which is followed by
        hand to reach the page with the title.  Media and thumbnail
        addresses come from a form-encoded request to ``magare.do``.

        Raises ExtractorError when `url` does not match ``_VALID_URL`` or
        when the info request yields None.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The redirect target is appended to the address that was actually reached.
        webpage = self._download_webpage(
            urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        # Only the part of the page title before the first slash is kept.
        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        post_data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do", post_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(
            info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The answer is unpacked as exactly two `name=value` pairs joined by
        # `&`: the media address first, the thumbnail address second.
        final_url, thumbnail_url = [
            pair.split('=')[1] for pair in info_response.split('&')]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
1323
1324
def gen_extractors():
    """ Return a list holding one fresh instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        # Kept at the very end of the list.
        GenericIE,
    )
    return [extractor_class() for extractor_class in extractor_classes]
1394
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name

    The class is looked up in this module's globals under the name
    ie_name followed by 'IE'; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]