Move MTV IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mtv import MTVIE
37 from .extractor.myvideo import MyVideoIE
38 from .extractor.nba import NBAIE
39 from .extractor.statigram import StatigramIE
40 from .extractor.photobucket import PhotobucketIE
41 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
42 from .extractor.stanfordoc import StanfordOpenClassroomIE
43 from .extractor.vimeo import VimeoIE
44 from .extractor.xvideos import XVideosIE
45 from .extractor.yahoo import YahooIE, YahooSearchIE
46 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
47 from .extractor.zdf import ZDFIE
48
49
50
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Fetches the cloudcast JSON description and returns the first working
    audio URL, honouring the requested format/bitrate when given.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format carries per-bitrate URL lists, pick the requested
        bitrate (or the highest one for 'best'/unknown); a flat list means
        there is no bitrate info and it is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # BUGFIX: regex groups are already text; the old .decode('utf-8')
        # calls raised AttributeError on Python 3 (str has no decode) and
        # only worked for pure-ASCII names on Python 2.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Scan all formats until a live URL is found.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # BUGFIX: values below were all text already; the redundant
        # .decode('utf-8') calls have been dropped (see above).
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': format_param if format_param is not None else u'NA',
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
155
156
157
158
class YoukuIE(InfoExtractor):
    # Extractor for v.youku.com. A video is served as numbered segments;
    # each segment becomes one entry in the returned list.
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a pseudo-random session id: millisecond timestamp
        followed by two random integers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seed-shuffled alphabet used to decode stream file ids.

        Implements Youku's linear-congruential shuffle; the constants and
        the pick-and-remove order are part of the protocol and must not
        be changed.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id by
        looking each index up in the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON, decode the segment file ids and return
        one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Map youtube-dl format requests onto Youku stream names;
            # anything other than best/worst falls back to the flv stream.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the segment number, as two hex digits, into the fileid.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
251
252
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract a single video from its watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the flash variables.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE, webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE, webpage,
                                             u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
291
292
293
294
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when walking a channel archive; also the API's
    # maximum clips-per-request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one JSON page of clips and build an info dict for each
        clip that actually has a video file URL.

        Returns (total clips in the response, list of info dicts)."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API answers with an object carrying 'error'.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins 'YYYY-MM-DD'; strip dashes -> YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: whole channel (paged archive walk),
        single chapter (XML + kraken metadata), or one archived broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Channel URL: page through the whole archive below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of its parent archive.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching the embedded archive id;
            # the for/else raises when no element matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the twitch kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
427
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract a funnyordie.com video from its watch page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        # The usable stream is the second <source> inside the <video> tag.
        media_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        video_title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': video_title,
            'description': description,
        }]
456
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so the flag must be
        # supplied explicitly here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist with every movie listed on a game's video page."""
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        webpage = self._download_webpage(page_url, game_id)

        # Some store pages hide their content behind an age gate.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # URLs, titles and thumbnails appear in parallel document order,
        # so the three iterators are consumed in lockstep.
        url_iter = re.finditer(
            r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},",
            webpage)
        title_iter = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', webpage)
        thumb_iter = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', webpage)

        videos = []
        for url_m, title_m, thumb_m in zip(url_iter, title_iter, thumb_iter):
            video_id = url_m.group('videoID')
            video_url = url_m.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_m.group('videoName')),
                'thumbnail': thumb_m.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
511
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract a recorded ustream.tv video."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The flv lives at a predictable CDN location derived from the id.
        media_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(r'data-title="(?P<title>.+)"',
                                        webpage, u'title')

        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': media_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
543
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract a WorldStarHipHop (or WSHH candy) video."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
                                       page, u'video URL')

        # The container is not announced anywhere; guess it from the URL.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
                                              page, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
                                            page, u'thumbnail', fatal=False)
        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
583
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract an RBMA Radio show from the JSON embedded in its page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        raw_json = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
                                      webpage, u'json data', flags=re.MULTILINE)

        try:
            show = json.loads(raw_json)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps variant from the Akamai stream.
        stream_url = show['akamai_url'] + '&cbr=256'
        stream_ext = compat_urllib_parse_urlparse(stream_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': stream_ext,
            'title': show['title'],
            'description': show.get('teaser_text'),
            'location': show.get('country_of_origin'),
            'uploader': show.get('host', {}).get('name'),
            'uploader_id': show.get('host', {}).get('slug'),
            'thumbnail': show.get('image', {}).get('large_url_2x'),
            'duration': show.get('duration'),
        }]
617
618
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict whose 'format' equals req_format, or None."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Bypass the age check with a pre-verified cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:  # narrowed from a bare except: that hid real errors
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # BUGFIX: the original concatenated the exception object itself
            # to a str (TypeError), masking the real missing-key error.
            raise ExtractorError(u'Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>p_<bitrate>k_<id>";
            # its first two pieces become the format label, e.g. "480p-370k".
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUGFIX: the original tested the undefined name 'result' here,
            # so every specific format request crashed with NameError.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
723
724
725
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract a pornotube.com video; the title comes from the URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits percent-encoded in the player configuration.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
                               webpage, u'video url'))

        # The upload date is optional on the page.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
760
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Extract a youjizz.com video by following its embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
                                              webpage, u'title').strip()

        # The actual stream URL only appears on the embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
801
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the play API track by track and return all songs of a mix."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JSON assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        track_number = 0
        while True:
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number+1), track_count),
                errnote=u'Failed to download song information')
            track_number += 1
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API tells us when the mix is exhausted.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
842
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract one keek; media/thumbnail URLs derive from the id alone."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }]
870
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction based on the URL type."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Talk markers and talk titles/links are matched in lockstep below.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this IE as a url_result entry.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: `info` is rebound here; the parsed talkDetails dict is only
        # used for its 'id' and the last entry of 'htmlStreams'.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
945
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Extract one video via the site's XML metadata endpoint.

        Raises ExtractorError if the metadata lacks a download URL or title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch used to read `format = ext`, but no `ext`
            # name exists in this scope, so a missing <format_id> raised
            # NameError. Fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # <description> and <imagePreview> are optional in the metadata.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
999
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract one video; format data comes from a per-video XML file."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # A companion XML document lists the formats; the last entry is used.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
1031
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    # BUG FIX: the scheme part used to be r'^(?:http?://)?', which matches
    # "htt://" and "http://" but NOT "https://" (the '?' only made the final
    # 'p' optional). r'https?://' accepts both http and https links.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the mp4 stream and basic metadata from a view page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The player config assigns the direct media URL to `file:`.
        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # og:title carries a "LiveLeak.com -" prefix that we strip off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
1068
1069
1070
class TumblrIE(InfoExtractor):
    """Information extractor for videos posted on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract the video embedded in a Tumblr post."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The embed markup is escaped (\x22 quotes) inside a script block.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
1104
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract a free track; raises ExtractorError if none is offered."""
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: this value used to be bound to `id`, shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
1150
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Extract the mp4 source URL and title for one video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 source is exposed directly in a <source> tag.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
1178         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Extract one video via the player's MRSS feed."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS notice for the id carries both title and mp4 URL.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
1205
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract one video from its canonical page built from the id."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        # The mobile mp4 URL is assigned to `file` in the player config.
        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
1239
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Extract one vine from its canonical https page."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        # The direct stream URL is published via twitter player metadata.
        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
1273
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr video through its two-step playlist API."""
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The photo secret is required by both API calls below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # Step 1: resolve the node id for this photo/secret pair.
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Step 2: fetch the playlist that carries the actual stream location.
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
1322
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract one video; the media list comes from a per-id XML file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only present inside the page markup.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A companion XML document lists the files; take the "high" variant.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
1361
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract one video from an xhamster.com movie page."""
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Re-request a canonical URL built from the id (the title slug is ignored).
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # The player config carries a 'srv' host and a 'file' path.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server part: 'file' already holds a percent-encoded absolute URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # The upload date is only available as a "hint" attribute (YYYY-MM-DD ...).
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1413
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract one track from a hypem.com track page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The 'ax'/'ts' query parameters mimic the site's own AJAX request.
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The session cookie is replayed on the serve request further down.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            # Only the first track of the embedded list is extracted.
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # NOTE: track_id is rebound here to the id from the JSON payload.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # A second, cookie-authenticated request resolves the streamable URL.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1463
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract one video, following the site's javascript redirect."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via javascript; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # POST to the info endpoint; the response is an urlencoded pair
        # carrying the final media URL and the thumbnail URL.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1499
1500
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in an ordered list and instantiate them in one pass;
    # more specific extractors must precede the generic ones.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
1570
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live at
    # module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]