Move NBA IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.nba import NBAIE
38 from .extractor.statigram import StatigramIE
39 from .extractor.photobucket import PhotobucketIE
40 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
41 from .extractor.stanfordoc import StanfordOpenClassroomIE
42 from .extractor.vimeo import VimeoIE
43 from .extractor.xvideos import XVideosIE
44 from .extractor.yahoo import YahooIE, YahooSearchIE
45 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
46 from .extractor.zdf import ZDFIE
47
48
49
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Fetches the cloudcast description from Mixcloud's v1 JSON API and
    picks the first download URL that actually responds.
    """

    # Disabled: the site moved to a new API.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData[fmt] is either a dict keyed by bitrate or a plain list
        of URLs; the TypeError fallback handles the list shape.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # Probe the URL; any network/HTTP failure moves on to the next.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        # No URL in the list responded.
        return None

    def _print_formats(self, formats):
        """Print every available format (and bitrate, when present) with its extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    # Dict shape: formats[fmt][bitrate] is a URL list.
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Resolve a mixcloud.com URL to a single-entry result list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request from the last two path components
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # No explicit format requested: take the first format that
            # yields at least one live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
154
155
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a single MTV.com video.

        Reads the page's mtvn meta tags, then downloads the mediaGen XML
        and picks the last listed rendition.
        Raises ExtractorError on invalid URLs or metadata failures.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # BUG FIX: 'performer' was referenced in the info dict below but never
        # assigned, so every extraction died with a NameError. The mtv_an meta
        # tag is the value the old code treated as the performer/artist, so
        # reuse it for the uploader field.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
216
217
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id from the clock plus two random ints."""
        millis = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Shuffle the key alphabet with Youku's linear-congruential scheme.

        Returns the shuffled characters as a list.
        """
        pool = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        # One LCG step per remaining character; each step picks (and
        # removes) one character from the pool. Characters are unique,
        # so pop(index) matches the original remove-by-value behaviour.
        while pool:
            seed = (seed * 211 + 30031) % 65536
            pick = int(math.floor(seed / 65536 * len(pool)))
            shuffled.append(pool.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated file id.

        Each '*'-separated token is an index into the seed-shuffled alphabet.
        """
        alphabet = self._get_file_ID_mix_string(seed)
        return ''.join(alphabet[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        """Fetch the Youku playlist JSON and return one entry per segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the user's request onto a concrete stream name + extension.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters at positions 8-9 of the decoded file id carry the
        # segment number, so they are overwritten per segment below.
        files_info = []
        for seg_index, seg_key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], seg_index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, seg_index, seg_fileid, seg_key)
            files_info.append({
                'id': '%s_part%02d' % (video_id, seg_index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
310
311
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a single-entry result list for an xnxx.com video URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # One page fetch supplies every field we need.
        webpage = self._download_webpage(url, video_id)

        # The flash URL is percent-encoded inside the page parameters.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        thumb = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        entry = {
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }
        return [entry]
350
351
352
353
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Maximum number of archive entries the Justin.tv API returns per page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Clips without a 'video_file_url' are silently skipped, which is
        why the returned count (all clips) can exceed len(info).
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On error the API returns a dict with an 'error' key instead.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with YYYY-MM-DD; strip dashes for upload_date.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged), chapter, or single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel archives are paged; fetched in the loop at the bottom.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: the else fires only when no archive matched; on
            # success 'a' deliberately leaks out of the loop and is used below.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the API until a short (final) page is returned.
        # Single broadcasts (paged == False) fetch exactly one page.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
486
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the <source> tag and og: metadata from a Funny or Die page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Two known title markups: the player header, then plain <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
515
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com trailers."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Collect every trailer on a game's store page into one playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Mature titles sit behind an age gate; answer it and refetch.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'

        # The three iterators run in lockstep: one movie entry, one title
        # span, one thumbnail per trailer, in page order.
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            movie_id = movie.group('videoID')
            movie_url = movie.group('videoURL')
            if not movie_url:
                raise ExtractorError(u'Cannot find video url for %s' % movie_id)
            videos.append({
                'id': movie_id,
                'url': movie_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
570
class UstreamIE(InfoExtractor):
    """Information extractor for recorded Ustream videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN flv URL for a recording and scrape page metadata."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv lives at a CDN path derived directly from the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE: returns a bare info dict, not a list, matching prior behaviour.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
602
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Pull the flash-player file URL plus title/thumbnail from the page."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is guessed from the file URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            # Candy pages carry the real title in a different span.
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
642
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the gon.show JSON blob embedded in an RBMA Radio show page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Force the 256 kbps stream; the extension comes from the URL path.
        video_url = show['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = show.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
676
677
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of formats whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract every listed download format for a video and honour the
        user's requested format (None/'best', 'worst', '-1'/'all', or explicit).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site hides content behind an age-verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        # BUG FIX: was a bare "except:", which also swallowed
        # KeyboardInterrupt/SystemExit; json.loads signals bad input
        # with ValueError.
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # BUG FIX: "str + exception object" raised TypeError here,
            # masking the real error; format with %s instead.
            raise ExtractorError(u'Missing JSON parameter: %s' % sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 begins "<size>_<bitrate>_..."; join the two
            # parts into the format label (e.g. "480p-370k").
            format = "-".join( path.split('/')[4].split('_')[:2] )

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: previously tested the undefined name 'result',
            # raising NameError whenever an explicit format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
782
783
784
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = m.group('videoid')
        video_title = m.group('title')

        # Fetch the page the video is hosted on
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player setup and may be URL-quoted
        video_url = compat_urllib_parse.unquote(self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url'))

        # The upload date is optional; normalize it when present
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
819
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoid')

        # Download the public page; the actual player lives on an embed page
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Locate the embed page (which also yields the numeric video id)
        embed_m = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_m is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_m.group(0).strip()
        video_id = embed_m.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The stream URL is handed to the flash player via addVariable
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
860
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of tracks)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS assignment
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires a (random) session token in every request
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API only exposes one track at a time; walk the "next" chain
        # until the API flags the last track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # The next request must reference the track we just received
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
901
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs are derived directly from the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
929
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs re.VERBOSE to match.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: single talk vs. playlist, based on which group matched.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Ids and talk URLs come from two separate markup patterns; the two
        # iterators are zipped, which assumes they appear in the same order.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Delegate each talk to this same IE via url_result
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # Stream URLs live in a JS object embedded in the page
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: `info` is rebuilt here; the old parsed JSON is read first
        # (info['id'], info['htmlStreams']) before the name is rebound.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
1004
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is the last non-empty path component of the URL; all
    metadata (stream URL, title, format, description, thumbnail) comes
    from a per-video XML document.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fix: this branch previously read the undefined name `ext`
            # (NameError); fall back to the file extension instead.
            video_format = extension
        else:
            video_format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
1058
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Format information is published in a per-video XML document
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last entry in the document (best available variant)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1090
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a site prefix which we strip off
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
1127
1128
1129
class TumblrIE(InfoExtractor):
    """Information extractor for tumblr.com video posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')
        blog = m.group('blog_name')

        # Normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped (\x22 for quotes) in the page
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
           raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail
        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
1163
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks.

    Only tracks that expose a free download page can be extracted; the
    final mp3-320 URL is obtained from the statdownload endpoint.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Fix: local was named `id`, shadowing the builtin; renamed.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
1209
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
1237         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata, including the mp4 URL, is published as an MRSS feed
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':    video_id,
            'url':   video_url,
            'ext':   'mp4',
            'title': video_title,
        }]
1264
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the id
        webpage = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
        }]
1298
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical https page for the id
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1332
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both API calls below
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # First request yields the node id needed for the playlist request
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        # Second request returns the playlist XML with the stream location
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The final URL is APP base + unescaped FULLPATH
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1381
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = m.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # The media URL lives in a separate XML document
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1420
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config lists a server and a file; an empty server
        # means `file` is already a full (quoted) URL.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date appears only in a tooltip; build YYYYMMDD from it
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1472
class HypemIE(InfoExtractor):
    """Information Extractor for hypem (hypem.com track pages)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The site expects an 'ax' flag and a timestamp; the response sets a
        # cookie that must be replayed on the serve request below.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # Track metadata is embedded as a JSON blob inside a <script> tag.
        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # ValueError: malformed JSON; KeyError/IndexError: JSON parsed
            # but lacks a non-empty 'tracks' list. All three mean the page
            # did not contain usable track data.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        # Replay the session cookie obtained above; the serve endpoint
        # rejects requests without it.
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1522
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7 (vbox7.com play: URLs)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play: page issues a JavaScript redirect; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id,
            u'Downloading redirect page')

        raw_title = self._html_search_regex(r'<title>(.*)</title>', webpage, u'title')
        title = raw_title.split('/')[0].strip()

        # POST to the flash info endpoint to obtain the media and thumbnail
        # URLs, returned as a '&'-joined list of key=value pairs.
        form_data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request("http://vbox7.com/play/magare.do", form_data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1] for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1558
1559
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Specific extractors come first; GenericIE stays last because it
    # matches almost any URL.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
1629
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the '<Name>IE' naming convention, so the
    # class can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]