Move StanfordOC IE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 import base64
2 import datetime
3 import itertools
4 import netrc
5 import os
6 import re
7 import socket
8 import time
9 import email.utils
10 import xml.etree.ElementTree
11 import random
12 import math
13 import operator
14 import hashlib
15 import binascii
16 import urllib
17
18 from .utils import *
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
20
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.statigram import StatigramIE
38 from .extractor.photobucket import PhotobucketIE
39 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
40 from .extractor.stanfordoc import StanfordOpenClassroomIE
41 from .extractor.vimeo import VimeoIE
42 from .extractor.xvideos import XVideosIE
43 from .extractor.yahoo import YahooIE, YahooSearchIE
44 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
45 from .extractor.zdf import ZDFIE
46
47
48
49 class MixcloudIE(InfoExtractor):
50     """Information extractor for www.mixcloud.com"""
51
52     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
53     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
54     IE_NAME = u'mixcloud'
55
56     def report_download_json(self, file_id):
57         """Report JSON download."""
58         self.to_screen(u'Downloading json')
59
60     def get_urls(self, jsonData, fmt, bitrate='best'):
61         """Get urls from 'audio_formats' section in json"""
62         file_url = None
63         try:
64             bitrate_list = jsonData[fmt]
65             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
66                 bitrate = max(bitrate_list) # select highest
67
68             url_list = jsonData[fmt][bitrate]
69         except TypeError: # we have no bitrate info.
70             url_list = jsonData[fmt]
71         return url_list
72
73     def check_urls(self, url_list):
74         """Returns 1st active url from list"""
75         for url in url_list:
76             try:
77                 compat_urllib_request.urlopen(url)
78                 return url
79             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
80                 url = None
81
82         return None
83
84     def _print_formats(self, formats):
85         print('Available formats:')
86         for fmt in formats.keys():
87             for b in formats[fmt]:
88                 try:
89                     ext = formats[fmt][b][0]
90                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
91                 except TypeError: # we have no bitrate info
92                     ext = formats[fmt][0]
93                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
94                     break
95
96     def _real_extract(self, url):
97         mobj = re.match(self._VALID_URL, url)
98         if mobj is None:
99             raise ExtractorError(u'Invalid URL: %s' % url)
100         # extract uploader & filename from url
101         uploader = mobj.group(1)
102         file_id = uploader + "-" + mobj.group(2)
103
104         # construct API request
105         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
106         # retrieve .json file with links to files
107         request = compat_urllib_request.Request(file_url)
108         try:
109             self.report_download_json(file_url)
110             jsonData = compat_urllib_request.urlopen(request).read()
111         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
112             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
113
114         # parse JSON
115         json_data = json.loads(jsonData.decode('utf-8'))
116         player_url = json_data['player_swf_url']
117         formats = dict(json_data['audio_formats'])
118
119         req_format = self._downloader.params.get('format', None)
120         bitrate = None
121
122         if self._downloader.params.get('listformats', None):
123             self._print_formats(formats)
124             return
125
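        # With no explicit format request, walk every advertised format and
        # take the first URL that actually responds; otherwise restrict the
        # check to the requested format only.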
126         if req_format is None or req_format == 'best':
127             for format_param in formats.keys():
128                 url_list = self.get_urls(formats, format_param)
129                 # check urls
130                 file_url = self.check_urls(url_list)
131                 if file_url is not None:
132                     break # got it!
133         else:
134             if req_format not in formats:
135                 raise ExtractorError(u'Format is not available')
136
137             url_list = self.get_urls(formats, req_format)
138             file_url = self.check_urls(url_list)
139             format_param = req_format
140
141         return [{
142             'id': file_id,
143             'url': file_url,
144             'uploader': uploader,
145             'upload_date': None,
146             'title': json_data['name'],
147             'ext': file_url.split('.')[-1],
148             'format': format_param if format_param is not None else u'NA',
149             'thumbnail': json_data['thumbnail_url'],
150             'description': json_data['description'],
151             'player_url': player_url,
152         }]
153
154
155 class MTVIE(InfoExtractor):
156     """Information extractor for MTV.com"""
157
158     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
159     IE_NAME = u'mtv'
160
161     def _real_extract(self, url):
162         mobj = re.match(self._VALID_URL, url)
163         if mobj is None:
164             raise ExtractorError(u'Invalid URL: %s' % url)
165         if not mobj.group('proto'):
166             url = 'http://' + url
167         video_id = mobj.group('videoid')
168
169         webpage = self._download_webpage(url, video_id)
170
171         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
172             webpage, u'song name', fatal=False)
173
174         performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
175             webpage, u'performer')
176
177         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
178             webpage, u'mtvn_uri', fatal=False)
179
180         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
181             webpage, u'content id', fatal=False)
182
183         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
184         self.report_extraction(video_id)
185         request = compat_urllib_request.Request(videogen_url)
186         try:
187             metadataXml = compat_urllib_request.urlopen(request).read()
188         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
189             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
190
191         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
192         renditions = mdoc.findall('.//rendition')
193
194         # For now, always pick the highest quality.
195         rendition = renditions[-1]
196
197         try:
198             _,_,ext = rendition.attrib['type'].partition('/')
199             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
200             video_url = rendition.find('./src').text
201         except KeyError:
202             raise ExtractorError('Invalid rendition field.')
203
204         info = {
205             'id': video_id,
206             'url': video_url,
207             'uploader': performer,
208             'upload_date': None,
209             'title': (u'%s - %s' % (performer, song_name)) if song_name else performer,
210             'ext': ext,
211             'format': format,
212         }
213
214         return [info]
215
216
217 class YoukuIE(InfoExtractor):
218     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
219
220     def _gen_sid(self):
221         nowTime = int(time.time() * 1000)
222         random1 = random.randint(1000,1998)
223         random2 = random.randint(1000,9999)
224
225         return "%d%d%d" %(nowTime,random1,random2)
226
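    # The two helpers below undo Youku's client-side file-id scrambling: a
    # linear-congruential generator seeded with the playlist's 'seed' value
    # shuffles a fixed alphabet, and _get_file_id() maps the '*'-separated
    # indices of the scrambled fileId through that shuffled alphabet.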
227     def _get_file_ID_mix_string(self, seed):
228         mixed = []
229         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
230         seed = float(seed)
231         for i in range(len(source)):
232             seed  =  (seed * 211 + 30031 ) % 65536
233             index  =  math.floor(seed / 65536 * len(source) )
234             mixed.append(source[int(index)])
235             source.remove(source[int(index)])
236         #return ''.join(mixed)
237         return mixed
238
239     def _get_file_id(self, fileId, seed):
240         mixed = self._get_file_ID_mix_string(seed)
241         ids = fileId.split('*')
242         realId = []
243         for ch in ids:
244             if ch:
245                 realId.append(mixed[int(ch)])
246         return ''.join(realId)
247
248     def _real_extract(self, url):
249         mobj = re.match(self._VALID_URL, url)
250         if mobj is None:
251             raise ExtractorError(u'Invalid URL: %s' % url)
252         video_id = mobj.group('ID')
253
254         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
255
256         jsondata = self._download_webpage(info_url, video_id)
257
258         self.report_extraction(video_id)
259         try:
260             config = json.loads(jsondata)
261
262             video_title =  config['data'][0]['title']
263             seed = config['data'][0]['seed']
264
265             format = self._downloader.params.get('format', None)
266             supported_format = list(config['data'][0]['streamfileids'].keys())
267
268             if format is None or format == 'best':
269                 if 'hd2' in supported_format:
270                     format = 'hd2'
271                 else:
272                     format = 'flv'
273                 ext = u'flv'
274             elif format == 'worst':
275                 format = 'mp4'
276                 ext = u'mp4'
277             else:
278                 format = 'flv'
279                 ext = u'flv'
280
281
282             fileid = config['data'][0]['streamfileids'][format]
283             keys = [s['k'] for s in config['data'][0]['segs'][format]]
284         except (UnicodeDecodeError, ValueError, KeyError):
285             raise ExtractorError(u'Unable to extract info section')
286
287         files_info=[]
288         sid = self._gen_sid()
289         fileid = self._get_file_id(fileid, seed)
290
291         # characters 8 and 9 (0-based) of fileid encode the segment number,
292         # so fileid[8:10] is replaced with the segment index for each part
293         for index, key in enumerate(keys):
294
295             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
296             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
297
298             info = {
299                 'id': '%s_part%02d' % (video_id, index),
300                 'url': download_url,
301                 'uploader': None,
302                 'upload_date': None,
303                 'title': video_title,
304                 'ext': ext,
305             }
306             files_info.append(info)
307
308         return files_info
309
310
311 class XNXXIE(InfoExtractor):
312     """Information extractor for xnxx.com"""
313
314     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
315     IE_NAME = u'xnxx'
316     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
317     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
318     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
319
320     def _real_extract(self, url):
321         mobj = re.match(self._VALID_URL, url)
322         if mobj is None:
323             raise ExtractorError(u'Invalid URL: %s' % url)
324         video_id = mobj.group(1)
325
326         # Get webpage content
327         webpage = self._download_webpage(url, video_id)
328
329         video_url = self._search_regex(self.VIDEO_URL_RE,
330             webpage, u'video URL')
331         video_url = compat_urllib_parse.unquote(video_url)
332
333         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
334             webpage, u'title')
335
336         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
337             webpage, u'thumbnail', fatal=False)
338
339         return [{
340             'id': video_id,
341             'url': video_url,
342             'uploader': None,
343             'upload_date': None,
344             'title': video_title,
345             'ext': 'flv',
346             'thumbnail': video_thumbnail,
347             'description': None,
348         }]
349
350
351
352 class NBAIE(InfoExtractor):
353     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
354     IE_NAME = u'nba'
355
356     def _real_extract(self, url):
357         mobj = re.match(self._VALID_URL, url)
358         if mobj is None:
359             raise ExtractorError(u'Invalid URL: %s' % url)
360
361         video_id = mobj.group(1)
362
363         webpage = self._download_webpage(url, video_id)
364
365         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
366
367         shortened_video_id = video_id.rpartition('/')[2]
368         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
369             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
370
371         # The upload date isn't present in the HTML the page returns to us
372         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
373
374         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
375
376         info = {
377             'id': shortened_video_id,
378             'url': video_url,
379             'ext': 'mp4',
380             'title': title,
381             # 'uploader_date': uploader_date,
382             'description': description,
383         }
384         return [info]
385
386 class JustinTVIE(InfoExtractor):
387     """Information extractor for justin.tv and twitch.tv"""
388     # TODO: One broadcast may be split into multiple videos. The key
389     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
390     # starts at 1 and increases. Can we treat all parts as one video?
391
392     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
393         (?:
394             (?P<channelid>[^/]+)|
395             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
396             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
397         )
398         /?(?:\#.*)?$
399         """
400     _JUSTIN_PAGE_LIMIT = 100
401     IE_NAME = u'justin.tv'
402
403     def report_download_page(self, channel, offset):
404         """Report attempt to download a single page of videos."""
405         self.to_screen(u'%s: Downloading video information from %d to %d' %
406                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
407
408     # Return count of items, list of *valid* items
409     def _parse_page(self, url, video_id):
410         webpage = self._download_webpage(url, video_id,
411                                          u'Downloading video info JSON',
412                                          u'unable to download video info JSON')
413
414         response = json.loads(webpage)
415         if not isinstance(response, list):
416             error_text = response.get('error', 'unknown error')
417             raise ExtractorError(u'Justin.tv API: %s' % error_text)
418         info = []
419         for clip in response:
420             video_url = clip['video_file_url']
421             if video_url:
422                 video_extension = os.path.splitext(video_url)[1][1:]
423                 video_date = re.sub('-', '', clip['start_time'][:10])
424                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
425                 video_id = clip['id']
426                 video_title = clip.get('title', video_id)
427                 info.append({
428                     'id': video_id,
429                     'url': video_url,
430                     'title': video_title,
431                     'uploader': clip.get('channel_name', video_uploader_id),
432                     'uploader_id': video_uploader_id,
433                     'upload_date': video_date,
434                     'ext': video_extension,
435                 })
436         return (len(response), info)
437
438     def _real_extract(self, url):
439         mobj = re.match(self._VALID_URL, url)
440         if mobj is None:
441             raise ExtractorError(u'invalid URL: %s' % url)
442
443         api_base = 'http://api.justin.tv'
444         paged = False
445         if mobj.group('channelid'):
446             paged = True
447             video_id = mobj.group('channelid')
448             api = api_base + '/channel/archives/%s.json' % video_id
449         elif mobj.group('chapterid'):
450             chapter_id = mobj.group('chapterid')
451
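            # A /c/ chapter URL points into an archived broadcast: read the
            # archive id from the chapter page, locate the matching <archive>
            # node in the by_chapter XML for its video_file_url, then fetch
            # title/description metadata from the Kraken chapter API.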
452             webpage = self._download_webpage(url, chapter_id)
453             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
454             if not m:
455                 raise ExtractorError(u'Cannot find archive of a chapter')
456             archive_id = m.group(1)
457
458             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
459             chapter_info_xml = self._download_webpage(api, chapter_id,
460                                              note=u'Downloading chapter information',
461                                              errnote=u'Chapter information download failed')
462             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
463             for a in doc.findall('.//archive'):
464                 if archive_id == a.find('./id').text:
465                     break
466             else:
467                 raise ExtractorError(u'Could not find chapter in chapter information')
468
469             video_url = a.find('./video_file_url').text
470             video_ext = video_url.rpartition('.')[2] or u'flv'
471
472             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
473             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
474                                    note='Downloading chapter metadata',
475                                    errnote='Download of chapter metadata failed')
476             chapter_info = json.loads(chapter_info_json)
477
478             bracket_start = int(doc.find('.//bracket_start').text)
479             bracket_end = int(doc.find('.//bracket_end').text)
480
481             # TODO determine start (and probably fix up file)
482             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
483             #video_url += u'?start=' + TODO:start_timestamp
484             # bracket_start is 13290, but we want 51670615
485             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
486                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
487
488             info = {
489                 'id': u'c' + chapter_id,
490                 'url': video_url,
491                 'ext': video_ext,
492                 'title': chapter_info['title'],
493                 'thumbnail': chapter_info['preview'],
494                 'description': chapter_info['description'],
495                 'uploader': chapter_info['channel']['display_name'],
496                 'uploader_id': chapter_info['channel']['name'],
497             }
498             return [info]
499         else:
500             video_id = mobj.group('videoid')
501             api = api_base + '/broadcast/by_archive/%s.json' % video_id
502
503         self.report_extraction(video_id)
504
505         info = []
506         offset = 0
507         limit = self._JUSTIN_PAGE_LIMIT
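        # Channel archives are paged: keep requesting pages of `limit` items
        # and stop as soon as a page comes back short (or the URL isn't paged).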
508         while True:
509             if paged:
510                 self.report_download_page(video_id, offset)
511             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
512             page_count, page_info = self._parse_page(page_url, video_id)
513             info.extend(page_info)
514             if not paged or page_count != limit:
515                 break
516             offset += limit
517         return info
518
519 class FunnyOrDieIE(InfoExtractor):
520     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
521
522     def _real_extract(self, url):
523         mobj = re.match(self._VALID_URL, url)
524         if mobj is None:
525             raise ExtractorError(u'invalid URL: %s' % url)
526
527         video_id = mobj.group('id')
528         webpage = self._download_webpage(url, video_id)
529
530         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
531             webpage, u'video URL', flags=re.DOTALL)
532
533         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
534             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
535
536         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
537             webpage, u'description', fatal=False, flags=re.DOTALL)
538
539         info = {
540             'id': video_id,
541             'url': video_url,
542             'ext': 'mp4',
543             'title': title,
544             'description': video_description,
545         }
546         return [info]
547
548 class SteamIE(InfoExtractor):
549     _VALID_URL = r"""http://store\.steampowered\.com/
550                 (agecheck/)?
551                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
552                 (?P<gameID>\d+)/?
553                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
554                 """
555     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
556     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
557
558     @classmethod
559     def suitable(cls, url):
560         """Receives a URL and returns True if suitable for this IE."""
561         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
562
563     def _real_extract(self, url):
564         m = re.match(self._VALID_URL, url, re.VERBOSE)
565         gameID = m.group('gameID')
566
567         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
568         webpage = self._download_webpage(videourl, gameID)
569
570         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
571             videourl = self._AGECHECK_TEMPLATE % gameID
572             self.report_age_confirmation()
573             webpage = self._download_webpage(videourl, gameID)
574
575         self.report_extraction(gameID)
576         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
577                                              webpage, 'game title')
578
579         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
580         mweb = re.finditer(urlRE, webpage)
581         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
582         titles = re.finditer(namesRE, webpage)
583         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
584         thumbs = re.finditer(thumbsRE, webpage)
585         videos = []
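        # The three finditer() scans above walk the page in document order, so
        # zipping them pairs each movie URL with its title and thumbnail.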
586         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
587             video_id = vid.group('videoID')
588             title = vtitle.group('videoName')
589             video_url = vid.group('videoURL')
590             video_thumb = thumb.group('thumbnail')
591             if not video_url:
592                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
593             info = {
594                 'id':video_id,
595                 'url':video_url,
596                 'ext': 'flv',
597                 'title': unescapeHTML(title),
598                 'thumbnail': video_thumb
599                   }
600             videos.append(info)
601         return [self.playlist_result(videos, gameID, game_title)]
602
603 class UstreamIE(InfoExtractor):
604     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
605     IE_NAME = u'ustream'
606
607     def _real_extract(self, url):
608         m = re.match(self._VALID_URL, url)
609         video_id = m.group('videoID')
610
611         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
612         webpage = self._download_webpage(url, video_id)
613
614         self.report_extraction(video_id)
615
616         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
617             webpage, u'title')
618
619         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
620             webpage, u'uploader', fatal=False, flags=re.DOTALL)
621
622         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
623             webpage, u'thumbnail', fatal=False)
624
625         info = {
626                 'id': video_id,
627                 'url': video_url,
628                 'ext': 'flv',
629                 'title': video_title,
630                 'uploader': uploader,
631                 'thumbnail': thumbnail,
632                }
633         return info
634
635 class WorldStarHipHopIE(InfoExtractor):
636     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
637     IE_NAME = u'WorldStarHipHop'
638
639     def _real_extract(self, url):
640         m = re.match(self._VALID_URL, url)
641         video_id = m.group('id')
642
643         webpage_src = self._download_webpage(url, video_id)
644
645         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
646             webpage_src, u'video URL')
647
648         if 'mp4' in video_url:
649             ext = 'mp4'
650         else:
651             ext = 'flv'
652
653         video_title = self._html_search_regex(r"<title>(.*)</title>",
654             webpage_src, u'title')
655
656         # Get the thumbnail; when it is missing (WSHH candy videos), pull the correct title from the candytitles span instead.
657         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
658             webpage_src, u'thumbnail', fatal=False)
659
660         if not thumbnail:
661             _title = r"""candytitles.*>(.*)</span>"""
662             mobj = re.search(_title, webpage_src)
663             if mobj is not None:
664                 video_title = mobj.group(1)
665
666         results = [{
667                     'id': video_id,
668                     'url' : video_url,
669                     'title' : video_title,
670                     'thumbnail' : thumbnail,
671                     'ext' : ext,
672                     }]
673         return results
674
675 class RBMARadioIE(InfoExtractor):
676     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
677
678     def _real_extract(self, url):
679         m = re.match(self._VALID_URL, url)
680         video_id = m.group('videoID')
681
682         webpage = self._download_webpage(url, video_id)
683
684         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
685             webpage, u'json data', flags=re.MULTILINE)
686
687         try:
688             data = json.loads(json_data)
689         except ValueError as e:
690             raise ExtractorError(u'Invalid JSON: ' + str(e))
691
692         video_url = data['akamai_url'] + '&cbr=256'
693         url_parts = compat_urllib_parse_urlparse(video_url)
694         video_ext = url_parts.path.rpartition('.')[2]
695         info = {
696                 'id': video_id,
697                 'url': video_url,
698                 'ext': video_ext,
699                 'title': data['title'],
700                 'description': data.get('teaser_text'),
701                 'location': data.get('country_of_origin'),
702                 'uploader': data.get('host', {}).get('name'),
703                 'uploader_id': data.get('host', {}).get('slug'),
704                 'thumbnail': data.get('image', {}).get('large_url_2x'),
705                 'duration': data.get('duration'),
706         }
707         return [info]
708
709
710 class YouPornIE(InfoExtractor):
711     """Information extractor for youporn.com."""
712     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
713
714     def _print_formats(self, formats):
715         """Print all available formats"""
716         print(u'Available formats:')
717         print(u'ext\t\tformat')
718         print(u'---------------------------------')
719         for format in formats:
720             print(u'%s\t\t%s'  % (format['ext'], format['format']))
721
722     def _specific(self, req_format, formats):
723         for x in formats:
724             if(x["format"]==req_format):
725                 return x
726         return None
727
728     def _real_extract(self, url):
729         mobj = re.match(self._VALID_URL, url)
730         if mobj is None:
731             raise ExtractorError(u'Invalid URL: %s' % url)
732         video_id = mobj.group('videoid')
733
734         req = compat_urllib_request.Request(url)
735         req.add_header('Cookie', 'age_verified=1')
736         webpage = self._download_webpage(req, video_id)
737
738         # Get JSON parameters
739         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
740         try:
741             params = json.loads(json_params)
742         except ValueError:
743             raise ExtractorError(u'Invalid JSON')
744
745         self.report_extraction(video_id)
746         try:
747             video_title = params['title']
748             upload_date = unified_strdate(params['release_date_f'])
749             video_description = params['description']
750             video_uploader = params['submitted_by']
751             thumbnail = params['thumbnails'][0]['image']
752         except KeyError as err:
753             raise ExtractorError(u'Missing JSON parameter: %s' % err)
754
755         # Get all of the formats available
756         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
757         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
758             webpage, u'download list').strip()
759
760         # Get all of the links from the page
761         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
762         links = re.findall(LINK_RE, download_list_html)
763         if(len(links) == 0):
764             raise ExtractorError(u'ERROR: no known formats available for video')
765
766         self.to_screen(u'Links found: %d' % len(links))
767
768         formats = []
769         for link in links:
770
771             # A link looks like this:
772             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
773             # A path looks like this:
774             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
775             video_url = unescapeHTML( link )
776             path = compat_urllib_parse_urlparse( video_url ).path
777             extension = os.path.splitext( path )[1][1:]
778             format = path.split('/')[4].split('_')[:2]
779             size = format[0]
780             bitrate = format[1]
781             format = "-".join( format )
782             # title = u'%s-%s-%s' % (video_title, size, bitrate)
783
784             formats.append({
785                 'id': video_id,
786                 'url': video_url,
787                 'uploader': video_uploader,
788                 'upload_date': upload_date,
789                 'title': video_title,
790                 'ext': extension,
791                 'format': format,
792                 'thumbnail': thumbnail,
793                 'description': video_description
794             })
795
796         if self._downloader.params.get('listformats', None):
797             self._print_formats(formats)
798             return
799
800         req_format = self._downloader.params.get('format', None)
801         self.to_screen(u'Format: %s' % req_format)
802
803         if req_format is None or req_format == 'best':
804             return [formats[0]]
805         elif req_format == 'worst':
806             return [formats[-1]]
807         elif req_format in ('-1', 'all'):
808             return formats
809         else:
810             format = self._specific(req_format, formats)
811             if format is None:
812                 raise ExtractorError(u'Requested format not available')
813             return [format]
814
815
816
817 class PornotubeIE(InfoExtractor):
818     """Information extractor for pornotube.com."""
819     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
820
821     def _real_extract(self, url):
822         mobj = re.match(self._VALID_URL, url)
823         if mobj is None:
824             raise ExtractorError(u'Invalid URL: %s' % url)
825
826         video_id = mobj.group('videoid')
827         video_title = mobj.group('title')
828
829         # Get webpage content
830         webpage = self._download_webpage(url, video_id)
831
832         # Get the video URL
833         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
834         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
835         video_url = compat_urllib_parse.unquote(video_url)
836
837         #Get the uploaded date
838         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
839         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
840         if upload_date: upload_date = unified_strdate(upload_date)
841
842         info = {'id': video_id,
843                 'url': video_url,
844                 'uploader': None,
845                 'upload_date': upload_date,
846                 'title': video_title,
847                 'ext': 'flv',
848                 'format': 'flv'}
849
850         return [info]
851
852 class YouJizzIE(InfoExtractor):
853     """Information extractor for youjizz.com."""
854     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
855
856     def _real_extract(self, url):
857         mobj = re.match(self._VALID_URL, url)
858         if mobj is None:
859             raise ExtractorError(u'Invalid URL: %s' % url)
860
861         video_id = mobj.group('videoid')
862
863         # Get webpage content
864         webpage = self._download_webpage(url, video_id)
865
866         # Get the video title
867         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
868             webpage, u'title').strip()
869
870         # Get the embed page
871         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
872         if result is None:
873             raise ExtractorError(u'ERROR: unable to extract embed page')
874
875         embed_page_url = result.group(0).strip()
876         video_id = result.group('videoid')
877
878         webpage = self._download_webpage(embed_page_url, video_id)
879
880         # Get the video URL
881         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
882             webpage, u'video URL')
883
884         info = {'id': video_id,
885                 'url': video_url,
886                 'title': video_title,
887                 'ext': 'flv',
888                 'format': 'flv',
889                 'player_url': embed_page_url}
890
891         return [info]
892
893 class EightTracksIE(InfoExtractor):
894     IE_NAME = '8tracks'
895     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
896
897     def _real_extract(self, url):
898         mobj = re.match(self._VALID_URL, url)
899         if mobj is None:
900             raise ExtractorError(u'Invalid URL: %s' % url)
901         playlist_id = mobj.group('id')
902
903         webpage = self._download_webpage(url, playlist_id)
904
905         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
906         data = json.loads(json_like)
907
908         session = str(random.randint(0, 1000000000))
909         mix_id = data['id']
910         track_count = data['tracks_count']
911         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
912         next_url = first_url
913         res = []
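        # Tracks are fetched one at a time; the id of the track just received
        # is passed back to request the next one, until 'at_last_track' is set.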
914         for i in itertools.count():
915             api_json = self._download_webpage(next_url, playlist_id,
916                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
917                 errnote=u'Failed to download song information')
918             api_data = json.loads(api_json)
919             track_data = api_data[u'set']['track']
920             info = {
921                 'id': track_data['id'],
922                 'url': track_data['track_file_stream_url'],
923                 'title': track_data['performer'] + u' - ' + track_data['name'],
924                 'raw_title': track_data['name'],
925                 'uploader_id': data['user']['login'],
926                 'ext': 'm4a',
927             }
928             res.append(info)
929             if api_data['set']['at_last_track']:
930                 break
931             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
932         return res
933
934 class KeekIE(InfoExtractor):
935     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
936     IE_NAME = u'keek'
937
938     def _real_extract(self, url):
939         m = re.match(self._VALID_URL, url)
940         video_id = m.group('videoID')
941
942         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
943         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
944         webpage = self._download_webpage(url, video_id)
945
946         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
947             webpage, u'title')
948
949         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
950             webpage, u'uploader', fatal=False)
951
952         info = {
953                 'id': video_id,
954                 'url': video_url,
955                 'ext': 'mp4',
956                 'title': video_title,
957                 'thumbnail': thumbnail,
958                 'uploader': uploader
959         }
960         return [info]
961
962 class TEDIE(InfoExtractor):
963     _VALID_URL=r'''http://www\.ted\.com/
964                    (
965                         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
966                         |
967                         ((?P<type_talk>talks)) # We have a simple talk
968                    )
969                    (/lang/(.*?))? # The url may contain the language
970                    /(?P<name>\w+) # Here goes the name and then ".html"
971                    '''
972
973     @classmethod
974     def suitable(cls, url):
975         """Receives a URL and returns True if suitable for this IE."""
976         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
977
978     def _real_extract(self, url):
979         m=re.match(self._VALID_URL, url, re.VERBOSE)
980         if m.group('type_talk'):
981             return [self._talk_info(url)]
982         else :
983             playlist_id=m.group('playlist_id')
984             name=m.group('name')
985             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
986             return [self._playlist_videos_info(url,name,playlist_id)]
987
988     def _playlist_videos_info(self,url,name,playlist_id=0):
989         '''Returns the videos of the playlist'''
990         video_RE=r'''
991                      <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
992                      ([.\s]*?)data-playlist_item_id="(\d+)"
993                      ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
994                      '''
995         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
996         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
997         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
998         m_names=re.finditer(video_name_RE,webpage)
999
1000         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
1001                                                  webpage, 'playlist title')
1002
1003         playlist_entries = []
1004         for m_video, m_name in zip(m_videos,m_names):
1005             video_id=m_video.group('video_id')
1006             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
1007             playlist_entries.append(self.url_result(talk_url, 'TED'))
1008         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
1009
1010     def _talk_info(self, url, video_id=0):
1011         """Return the video for the talk in the url"""
1012         m = re.match(self._VALID_URL, url,re.VERBOSE)
1013         video_name = m.group('name')
1014         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
1015         self.report_extraction(video_name)
1016         # If the url includes the language we get the title translated
1017         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
1018                                         webpage, 'title')
1019         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
1020                                     webpage, 'json data')
1021         info = json.loads(json_data)
1022         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
1023                                        webpage, 'description', flags = re.DOTALL)
1024         
1025         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
1026                                        webpage, 'thumbnail')
1027         info = {
1028                 'id': info['id'],
1029                 'url': info['htmlStreams'][-1]['file'],
1030                 'ext': 'mp4',
1031                 'title': title,
1032                 'thumbnail': thumbnail,
1033                 'description': desc,
1034                 }
1035         return info
1036
1037 class MySpassIE(InfoExtractor):
1038     _VALID_URL = r'http://www.myspass.de/.*'
1039
1040     def _real_extract(self, url):
1041         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1042
1043         # video id is the last path element of the URL
1044         # usually there is a trailing slash, so also try the second-to-last
1045         url_path = compat_urllib_parse_urlparse(url).path
1046         url_parent_path, video_id = os.path.split(url_path)
1047         if not video_id:
1048             _, video_id = os.path.split(url_parent_path)
1049
1050         # get metadata
1051         metadata_url = META_DATA_URL_TEMPLATE % video_id
1052         metadata_text = self._download_webpage(metadata_url, video_id)
1053         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
1054
1055         # extract values from metadata
1056         url_flv_el = metadata.find('url_flv')
1057         if url_flv_el is None:
1058             raise ExtractorError(u'Unable to extract download url')
1059         video_url = url_flv_el.text
1060         extension = os.path.splitext(video_url)[1][1:]
1061         title_el = metadata.find('title')
1062         if title_el is None:
1063             raise ExtractorError(u'Unable to extract title')
1064         title = title_el.text
1065         format_id_el = metadata.find('format_id')
1066         if format_id_el is None:
1067             format = extension
1068         else:
1069             format = format_id_el.text
1070         description_el = metadata.find('description')
1071         if description_el is not None:
1072             description = description_el.text
1073         else:
1074             description = None
1075         imagePreview_el = metadata.find('imagePreview')
1076         if imagePreview_el is not None:
1077             thumbnail = imagePreview_el.text
1078         else:
1079             thumbnail = None
1080         info = {
1081             'id': video_id,
1082             'url': video_url,
1083             'title': title,
1084             'ext': extension,
1085             'format': format,
1086             'thumbnail': thumbnail,
1087             'description': description
1088         }
1089         return [info]
1090
1091 class SpiegelIE(InfoExtractor):
1092     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1093
1094     def _real_extract(self, url):
1095         m = re.match(self._VALID_URL, url)
1096         video_id = m.group('videoID')
1097
1098         webpage = self._download_webpage(url, video_id)
1099
1100         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1101             webpage, u'title')
1102
1103         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1104         xml_code = self._download_webpage(xml_url, video_id,
1105                     note=u'Downloading XML', errnote=u'Failed to download XML')
1106
1107         idoc = xml.etree.ElementTree.fromstring(xml_code)
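        # The XML lists one child element per available variant; the last one
        # is assumed to be the highest quality and is the one downloaded here.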
1108         last_type = idoc[-1]
1109         filename = last_type.findall('./filename')[0].text
1110         duration = float(last_type.findall('./duration')[0].text)
1111
1112         video_url = 'http://video2.spiegel.de/flash/' + filename
1113         video_ext = filename.rpartition('.')[2]
1114         info = {
1115             'id': video_id,
1116             'url': video_url,
1117             'ext': video_ext,
1118             'title': video_title,
1119             'duration': duration,
1120         }
1121         return [info]
1122
1123 class LiveLeakIE(InfoExtractor):
1124
1125     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1126     IE_NAME = u'liveleak'
1127
1128     def _real_extract(self, url):
1129         mobj = re.match(self._VALID_URL, url)
1130         if mobj is None:
1131             raise ExtractorError(u'Invalid URL: %s' % url)
1132
1133         video_id = mobj.group('video_id')
1134
1135         webpage = self._download_webpage(url, video_id)
1136
1137         video_url = self._search_regex(r'file: "(.*?)",',
1138             webpage, u'video URL')
1139
1140         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1141             webpage, u'title').replace('LiveLeak.com -', '').strip()
1142
1143         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1144             webpage, u'description', fatal=False)
1145
1146         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1147             webpage, u'uploader', fatal=False)
1148
1149         info = {
1150             'id':  video_id,
1151             'url': video_url,
1152             'ext': 'mp4',
1153             'title': video_title,
1154             'description': video_description,
1155             'uploader': video_uploader
1156         }
1157
1158         return [info]
1159
1160
1161
1162 class TumblrIE(InfoExtractor):
1163     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1164
1165     def _real_extract(self, url):
1166         m_url = re.match(self._VALID_URL, url)
1167         video_id = m_url.group('id')
1168         blog = m_url.group('blog_name')
1169
1170         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1171         webpage = self._download_webpage(url, video_id)
1172
1173         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1174         video = re.search(re_video, webpage)
1175         if video is None:
1176            raise ExtractorError(u'Unable to extract video')
1177         video_url = video.group('video_url')
1178         ext = video.group('ext')
1179
1180         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1181             webpage, u'thumbnail', fatal=False)  # We pick the first poster
1182         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1183
1184         # This is the only place a title can be found; it's not complete,
1185         # but searching in other places doesn't work for all videos
1186         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1187             webpage, u'title', flags=re.DOTALL)
1188
1189         return [{'id': video_id,
1190                  'url': video_url,
1191                  'title': video_title,
1192                  'thumbnail': video_thumbnail,
1193                  'ext': ext
1194                  }]
1195
1196 class BandcampIE(InfoExtractor):
1197     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1198
1199     def _real_extract(self, url):
1200         mobj = re.match(self._VALID_URL, url)
1201         title = mobj.group('title')
1202         webpage = self._download_webpage(url, title)
1203         # We get the link to the free download page
1204         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1205         if m_download is None:
1206             raise ExtractorError(u'No free songs found')
1207
1208         download_link = m_download.group(1)
1209         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
1210                        webpage, re.MULTILINE|re.DOTALL).group('id')
1211
1212         download_webpage = self._download_webpage(download_link, id,
1213                                                   'Downloading free downloads page')
1214         # We get the dictionary of the track from some javascript code
1215         info = re.search(r'items: (.*?),$',
1216                          download_webpage, re.MULTILINE).group(1)
1217         info = json.loads(info)[0]
1218         # We pick mp3-320 for now, until format selection can be easily implemented.
1219         mp3_info = info[u'downloads'][u'mp3-320']
1220         # If we try to use this url it says the link has expired
1221         initial_url = mp3_info[u'url']
1222         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1223         m_url = re.match(re_url, initial_url)
1224         # We build the url we will use to get the final track url
1225         # This url is built by Bandcamp in the script download_bunde_*.js
1226         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1227         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1228         # If we could correctly generate the .rand field the url would be
1229         # in the "download_url" key
1230         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1231
1232         track_info = {'id':id,
1233                       'title' : info[u'title'],
1234                       'ext' :   'mp3',
1235                       'url' :   final_url,
1236                       'thumbnail' : info[u'thumb_url'],
1237                       'uploader' :  info[u'artist']
1238                       }
1239
1240         return [track_info]
1241
1242 class RedTubeIE(InfoExtractor):
1243     """Information Extractor for redtube"""
1244     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1245
1246     def _real_extract(self,url):
1247         mobj = re.match(self._VALID_URL, url)
1248         if mobj is None:
1249             raise ExtractorError(u'Invalid URL: %s' % url)
1250
1251         video_id = mobj.group('id')
1252         video_extension = 'mp4'        
1253         webpage = self._download_webpage(url, video_id)
1254
1255         self.report_extraction(video_id)
1256
1257         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1258             webpage, u'video URL')
1259
1260         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1261             webpage, u'title')
1262
1263         return [{
1264             'id':       video_id,
1265             'url':      video_url,
1266             'ext':      video_extension,
1267             'title':    video_title,
1268         }]
1269         
1270 class InaIE(InfoExtractor):
1271     """Information Extractor for Ina.fr"""
1272     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1273
1274     def _real_extract(self,url):
1275         mobj = re.match(self._VALID_URL, url)
1276
1277         video_id = mobj.group('id')
1278         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1279         video_extension = 'mp4'
1280         webpage = self._download_webpage(mrss_url, video_id)
1281
1282         self.report_extraction(video_id)
1283
1284         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1285             webpage, u'video URL')
1286
1287         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1288             webpage, u'title')
1289
1290         return [{
1291             'id':       video_id,
1292             'url':      video_url,
1293             'ext':      video_extension,
1294             'title':    video_title,
1295         }]
1296
1297 class HowcastIE(InfoExtractor):
1298     """Information Extractor for Howcast.com"""
1299     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1300
1301     def _real_extract(self, url):
1302         mobj = re.match(self._VALID_URL, url)
1303
1304         video_id = mobj.group('id')
1305         webpage_url = 'http://www.howcast.com/videos/' + video_id
1306         webpage = self._download_webpage(webpage_url, video_id)
1307
1308         self.report_extraction(video_id)
1309
1310         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1311             webpage, u'video URL')
1312
1313         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1314             webpage, u'title')
1315
1316         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1317             webpage, u'description', fatal=False)
1318
1319         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1320             webpage, u'thumbnail', fatal=False)
1321
1322         return [{
1323             'id':       video_id,
1324             'url':      video_url,
1325             'ext':      'mp4',
1326             'title':    video_title,
1327             'description': video_description,
1328             'thumbnail': thumbnail,
1329         }]
1330
1331 class VineIE(InfoExtractor):
1332     """Information Extractor for Vine.co"""
1333     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1334
1335     def _real_extract(self, url):
1336         mobj = re.match(self._VALID_URL, url)
1337
1338         video_id = mobj.group('id')
1339         webpage_url = 'https://vine.co/v/' + video_id
1340         webpage = self._download_webpage(webpage_url, video_id)
1341
1342         self.report_extraction(video_id)
1343
1344         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1345             webpage, u'video URL')
1346
1347         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1348             webpage, u'title')
1349
1350         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1351             webpage, u'thumbnail', fatal=False)
1352
1353         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1354             webpage, u'uploader', fatal=False, flags=re.DOTALL)
1355
1356         return [{
1357             'id':        video_id,
1358             'url':       video_url,
1359             'ext':       'mp4',
1360             'title':     video_title,
1361             'thumbnail': thumbnail,
1362             'uploader':  uploader,
1363         }]
1364
1365 class FlickrIE(InfoExtractor):
1366     """Information Extractor for Flickr videos"""
1367     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1368
1369     def _real_extract(self, url):
1370         mobj = re.match(self._VALID_URL, url)
1371
1372         video_id = mobj.group('id')
1373         video_uploader_id = mobj.group('uploader_id')
1374         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1375         webpage = self._download_webpage(webpage_url, video_id)
1376
1377         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1378
1379         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1380         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1381
1382         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1383             first_xml, u'node_id')
1384
1385         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1386         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1387
1388         self.report_extraction(video_id)
1389
1390         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1391         if mobj is None:
1392             raise ExtractorError(u'Unable to extract video url')
1393         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
1394
1395         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1396             webpage, u'video title')
1397
1398         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1399             webpage, u'description', fatal=False)
1400
1401         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1402             webpage, u'thumbnail', fatal=False)
1403
1404         return [{
1405             'id':          video_id,
1406             'url':         video_url,
1407             'ext':         'mp4',
1408             'title':       video_title,
1409             'description': video_description,
1410             'thumbnail':   thumbnail,
1411             'uploader_id': video_uploader_id,
1412         }]
1413
1414 class TeamcocoIE(InfoExtractor):
1415     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1416
1417     def _real_extract(self, url):
1418         mobj = re.match(self._VALID_URL, url)
1419         if mobj is None:
1420             raise ExtractorError(u'Invalid URL: %s' % url)
1421         url_title = mobj.group('url_title')
1422         webpage = self._download_webpage(url, url_title)
1423
1424         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1425             webpage, u'video id')
1426
1427         self.report_extraction(video_id)
1428
1429         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1430             webpage, u'title')
1431
1432         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1433             webpage, u'thumbnail', fatal=False)
1434
1435         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1436             webpage, u'description', fatal=False)
1437
1438         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1439         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1440
1441         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1442             data, u'video URL')
1443
1444         return [{
1445             'id':          video_id,
1446             'url':         video_url,
1447             'ext':         'mp4',
1448             'title':       video_title,
1449             'thumbnail':   thumbnail,
1450             'description': video_description,
1451         }]
1452
1453 class XHamsterIE(InfoExtractor):
1454     """Information Extractor for xHamster"""
1455     _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1456
1457     def _real_extract(self, url):
1458         mobj = re.match(self._VALID_URL, url)
1459
1460         video_id = mobj.group('id')
1461         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1462         webpage = self._download_webpage(mrss_url, video_id)
1463
1464         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1465         if mobj is None:
1466             raise ExtractorError(u'Unable to extract media URL')
1467         if len(mobj.group('server')) == 0:
1468             video_url = compat_urllib_parse.unquote(mobj.group('file'))
1469         else:
1470             video_url = mobj.group('server')+'/key='+mobj.group('file')
1471         video_extension = video_url.split('.')[-1]
1472
1473         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1474             webpage, u'title')
1475
1476         # Can't see the description anywhere in the UI
1477         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1478         #     webpage, u'description', fatal=False)
1479         # if video_description: video_description = unescapeHTML(video_description)
1480
1481         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1482         if mobj:
1483             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1484         else:
1485             video_upload_date = None
1486             self._downloader.report_warning(u'Unable to extract upload date')
1487
1488         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1489             webpage, u'uploader id', default=u'anonymous')
1490
1491         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1492             webpage, u'thumbnail', fatal=False)
1493
1494         return [{
1495             'id':       video_id,
1496             'url':      video_url,
1497             'ext':      video_extension,
1498             'title':    video_title,
1499             # 'description': video_description,
1500             'upload_date': video_upload_date,
1501             'uploader_id': video_uploader_id,
1502             'thumbnail': video_thumbnail
1503         }]
1504
1505 class HypemIE(InfoExtractor):
1506     """Information Extractor for hypem"""
1507     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1508
1509     def _real_extract(self, url):
1510         mobj = re.match(self._VALID_URL, url)
1511         if mobj is None:
1512             raise ExtractorError(u'Invalid URL: %s' % url)
1513         track_id = mobj.group(1)
1514
1515         data = { 'ax': 1, 'ts': time.time() }
1516         data_encoded = compat_urllib_parse.urlencode(data)
1517         complete_url = url + "?" + data_encoded
1518         request = compat_urllib_request.Request(complete_url)
1519         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage')
1520         cookie = urlh.headers.get('Set-Cookie', '')
1521
1522         self.report_extraction(track_id)
1523
1524         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1525             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1526         try:
1527             track_list = json.loads(html_tracks)
1528             track = track_list[u'tracks'][0]
1529         except ValueError:
1530             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1531
1532         key = track[u"key"]
1533         track_id = track[u"id"]
1534         artist = track[u"artist"]
1535         title = track[u"song"]
1536
1537         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1538         request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
1539         request.add_header('cookie', cookie)
1540         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1541         try:
1542             song_data = json.loads(song_data_json)
1543         except ValueError:
1544             raise ExtractorError(u'Hypemachine contained invalid JSON.')
1545         final_url = song_data[u"url"]
1546
1547         return [{
1548             'id':       track_id,
1549             'url':      final_url,
1550             'ext':      "mp3",
1551             'title':    title,
1552             'artist':   artist,
1553         }]
1554
1555 class Vbox7IE(InfoExtractor):
1556     """Information Extractor for Vbox7"""
1557     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1558
1559     def _real_extract(self, url):
1560         mobj = re.match(self._VALID_URL, url)
1561         if mobj is None:
1562             raise ExtractorError(u'Invalid URL: %s' % url)
1563         video_id = mobj.group(1)
1564
1565         redirect_page, urlh = self._download_webpage_handle(url, video_id)
1566         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1567         redirect_url = urlh.geturl() + new_location
1568         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
1569
1570         title = self._html_search_regex(r'<title>(.*)</title>',
1571             webpage, u'title').split('/')[0].strip()
1572
1573         ext = "flv"
1574         info_url = "http://vbox7.com/play/magare.do"
1575         data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
1576         info_request = compat_urllib_request.Request(info_url, data)
1577         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1578         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1579         if info_response is None:
1580             raise ExtractorError(u'Unable to extract the media url')
1581         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1582
1583         return [{
1584             'id':        video_id,
1585             'url':       final_url,
1586             'ext':       ext,
1587             'title':     title,
1588             'thumbnail': thumbnail_url,
1589         }]
1590
1591
1592 def gen_extractors():
1593     """ Return a list containing an instance of every supported extractor.
1594     The order matters: the first extractor that matches handles the URL (see the dispatch sketch below).
1595     """
1596     return [
1597         YoutubePlaylistIE(),
1598         YoutubeChannelIE(),
1599         YoutubeUserIE(),
1600         YoutubeSearchIE(),
1601         YoutubeIE(),
1602         MetacafeIE(),
1603         DailymotionIE(),
1604         GoogleSearchIE(),
1605         PhotobucketIE(),
1606         YahooIE(),
1607         YahooSearchIE(),
1608         DepositFilesIE(),
1609         FacebookIE(),
1610         BlipTVIE(),
1611         BlipTVUserIE(),
1612         VimeoIE(),
1613         MyVideoIE(),
1614         ComedyCentralIE(),
1615         EscapistIE(),
1616         CollegeHumorIE(),
1617         XVideosIE(),
1618         SoundcloudSetIE(),
1619         SoundcloudIE(),
1620         InfoQIE(),
1621         MixcloudIE(),
1622         StanfordOpenClassroomIE(),
1623         MTVIE(),
1624         YoukuIE(),
1625         XNXXIE(),
1626         YouJizzIE(),
1627         PornotubeIE(),
1628         YouPornIE(),
1629         GooglePlusIE(),
1630         ArteTvIE(),
1631         NBAIE(),
1632         WorldStarHipHopIE(),
1633         JustinTVIE(),
1634         FunnyOrDieIE(),
1635         SteamIE(),
1636         UstreamIE(),
1637         RBMARadioIE(),
1638         EightTracksIE(),
1639         KeekIE(),
1640         TEDIE(),
1641         MySpassIE(),
1642         SpiegelIE(),
1643         LiveLeakIE(),
1644         ARDIE(),
1645         ZDFIE(),
1646         TumblrIE(),
1647         BandcampIE(),
1648         RedTubeIE(),
1649         InaIE(),
1650         HowcastIE(),
1651         VineIE(),
1652         FlickrIE(),
1653         TeamcocoIE(),
1654         XHamsterIE(),
1655         HypemIE(),
1656         Vbox7IE(),
1657         GametrailersIE(),
1658         StatigramIE(),
1659         GenericIE()
1660     ]
1661
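# A minimal sketch, not part of youtube-dl's own control flow, of how the ordered
# list above is typically consumed: walk the extractors in order and let the first
# one whose suitable() check accepts the URL claim it. This is why GenericIE, which
# accepts almost any URL, must stay last. The helper name below is illustrative only.
def _first_suitable_extractor(url):
    """Return the first extractor instance whose suitable() accepts url, else None."""
    for ie in gen_extractors():
        if ie.suitable(url):
            return ie
    return None
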
1662 def get_info_extractor(ie_name):
1663     """Returns the info extractor class with the given ie_name"""
1664     return globals()[ie_name+'IE']
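
# For example, get_info_extractor('Youtube') resolves to the YoutubeIE class imported
# at the top of this module; note that the class itself is returned, not an instance.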