Move TED IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mtv import MTVIE
37 from .extractor.myvideo import MyVideoIE
38 from .extractor.nba import NBAIE
39 from .extractor.statigram import StatigramIE
40 from .extractor.photobucket import PhotobucketIE
41 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
42 from .extractor.stanfordoc import StanfordOpenClassroomIE
43 from .extractor.ted import TEDIE
44 from .extractor.vimeo import VimeoIE
45 from .extractor.xvideos import XVideosIE
46 from .extractor.yahoo import YahooIE, YahooSearchIE
47 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
48 from .extractor.zdf import ZDFIE
49
50
51
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Return the list of candidate URLs for fmt/bitrate from the
        'audio_formats' JSON section.

        If the format entry has no per-bitrate sub-dict (i.e. it is a
        plain URL list), the bitrate argument is ignored.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Return the first URL in url_list that answers, or None."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair with its extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: the previous .decode('utf-8') calls were dropped -- the
        # regex groups are already text strings, and decoding them again
        # raised AttributeError on Python 3 (and double-decoded on py2).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe the formats in dict order, keep the first live URL
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (u'NA' if format_param is None else format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
156
157
158
159
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are served in numbered segments; each segment becomes one
    entry in the returned list.
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Build a pseudo-unique session id (millisecond timestamp plus
        # two random components) for the download URLs below.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character pool below with a small
        # linear-congruential generator driven by the server-supplied
        # seed.  Statement order matters: each chosen character is
        # removed from the pool before the next pick, so the pool
        # shrinks as the loop runs.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # The obfuscated fileId is a '*'-separated list of indexes into
        # the seed-shuffled pool; map each index back to its character.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map the requested quality onto a Youku stream type;
            # anything other than best/worst falls back to flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
252
253
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group(1)

        # Fetch the page once; every field below is scraped from it.
        webpage = self._download_webpage(url, video_id)

        # The flash player receives the percent-encoded media URL as a
        # query argument, so unquote it after extraction.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
            u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumbnail,
            'description': None,
        }]
292
293
294
295
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes are handled: a whole channel, a single archived
    # broadcast (/b/<id>), and a chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of items the justin.tv API returns per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # API errors come back as a dict with an 'error' key.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Entries without a file URL are skipped (hence "*valid*").
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: the archive listing is paginated.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # A chapter is a sub-range of one archived broadcast; it is
            # resolved to its parent archive and returned immediately.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the page's archive
            # id; the for/else raises if no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast: one request through the same
            # paged API, but pagination stops after the first page.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
428
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = m.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
457
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default matcher
        # (which compiles without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        game_id = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        webpage = self._download_webpage(page_url, game_id)

        # Age-gated store pages are re-requested through the agecheck
        # URL, which carries a fake birth date in its query string.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        url_matches = re.finditer(
            r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},",
            webpage)
        title_matches = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_matches = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        # The three regexes walk the page in document order, so zipping
        # their iterators pairs each movie entry with its title/thumbnail.
        for url_m, title_m, thumb_m in zip(url_matches, title_matches, thumb_matches):
            vid_id = url_m.group('videoID')
            vid_url = url_m.group('videoURL')
            if not vid_url:
                raise ExtractorError(u'Cannot find video url for %s' % vid_id)
            videos.append({
                'id': vid_id,
                'url': vid_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
512
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV can be fetched straight from the CDN once the id is known;
        # the page itself is only scraped for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: returns a bare info dict (not a list), as the original did.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
544
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'so\.addVariable\("file","(.*?)"\)', page, u'video URL')

        # The site serves either mp4 or flv; infer from the URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_title = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_title is not None:
                video_title = candy_title.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
584
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata lives in a JSON blob assigned to window.gon.
        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN; the extension comes
        # from the URL path.
        video_url = show['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
618
619
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie to serve the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUGFIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit; only a parse error is expected.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # BUGFIX: the original concatenated the exception object to a
            # str ('...' + sys.exc_info()[1]), which itself raised TypeError.
            raise ExtractorError(u'Missing JSON parameter: %s' % e)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; the first
            # two pieces form the format label, e.g. "480p-370k".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the original tested the undefined name "result",
            # raising NameError whenever a requested format was missing.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
724
725
726
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group('videoid')
        video_title = m.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flash player config embeds the (percent-encoded) FLV location.
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # Upload date is rendered as e.g. "Added 12/31/2011 by ...".
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
761
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The public watch page only carries the title; the media URL lives
        # on a separate embed page referenced from it.
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        m_embed = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if m_embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = m_embed.group(0).strip()
        # The embed page uses a numeric id; prefer it from here on.
        video_id = m_embed.group('videoid')

        embed_page = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
802
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of tracks)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The page embeds the mix metadata as a JS assignment (PAGE.mix = {...};).
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play/next API endpoints.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the mix one track at a time: each API response yields one track
        # plus an at_last_track flag; the next request must carry the previous
        # track id, so the loop is inherently sequential.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
843
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs follow a fixed CDN scheme, so only the
        # title/uploader need to be scraped from the page.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{'id': video_id,
                 'url': video_url,
                 'ext': 'mp4',
                 'title': video_title,
                 'thumbnail': thumbnail,
                 'uploader': uploader}]
871
872
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the last (or second-to-last, when a trailing
    slash is present) URL path element and resolved through the site's XML
    metadata service, which yields the flv URL, title and optional
    format/description/thumbnail fields.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch previously read the undefined name `ext`,
            # raising NameError whenever <format_id> was missing; fall back
            # to the file extension computed above instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        # <description> and <imagePreview> are optional in the metadata XML.
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
926
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # A per-video XML manifest lists the available encodings; the last
        # entry in the document is the one selected here.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
958
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Direct media URL from the embedded player config.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix which is stripped off here.
        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader
        }]
995
996
997
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr-hosted videos."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        blog = mobj.group('blog_name')

        # Normalize to the canonical post URL before fetching.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video source appears in escaped (\x22-quoted) inline markup.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster frame as the thumbnail.
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext,
                 }]
1031
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks.

    Only tracks exposing a free-download page can be extracted; the final
    mp3 URL is obtained by replaying the request that Bandcamp's own
    download_bundle_*.js script performs.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed the local `id` to `video_id` so the builtin id() is
        # no longer shadowed (behavior unchanged).
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
1077
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both the media URL and the title come straight from the page markup.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS notice document carries both the mp4 URL and the title.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
1132
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical watch page for the id.
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
1166
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical watch page; all metadata lives in meta tags.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1200
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        # Step 1: the photo page, which embeds the per-video secret.
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 2: the first XML document resolves (id, secret) to a node_id.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Step 3: the playlist XML for that node carries the stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final URL is APP + FULLPATH; FULLPATH is HTML-escaped in the XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # Remaining metadata is scraped from the original photo page.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1249
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id needed for the CVP data feed is embedded in the page.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A second request against the CVP XML feed yields the media URL.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1288
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config names a server and a file; with no server the
        # file field is itself a percent-encoded absolute URL.
        m_media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m_media is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = m_media.group('server')
        if len(server) == 0:
            video_url = compat_urllib_parse.unquote(m_media.group('file'))
        else:
            video_url = server + '/key=' + m_media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m_date = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m_date:
            # Concatenate to YYYYMMDD.
            video_upload_date = m_date.group('upload_date_Y') + m_date.group('upload_date_m') + m_date.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1340
class HypemIE(InfoExtractor):
    """Information extractor for hypem (hypem.com track pages)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Request the page with ax/ts query params; the Set-Cookie from this
        # response must be replayed on the later /serve/source request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The page embeds a JSON track list inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request: resolve (track_id, key) to the actual media URL,
        # carrying the cookie captured above.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1390
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: URL serves a JS redirect; the new location is relative,
        # so it is appended to the URL the handle actually ended up at.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Title is the part of <title> before the first '/'.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        # POST to magare.do returns '<key>=<url>&<key>=<thumb>'-style pairs;
        # the values are split out positionally below.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1426
1427
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep this ordering: more specific extractors must precede generic ones,
    # and GenericIE must stay last as the catch-all.
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
1497
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention at module level.
    return globals()['{0}IE'.format(ie_name)]