_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 import base64
   2 import datetime
   3 import itertools
   4 import netrc
   5 import os
   6 import re
   7 import socket
   8 import time
   9 import email.utils
  10 import xml.etree.ElementTree
  11 import random
  12 import math
  13 import operator
  14 import hashlib
  15 import binascii
  16 import urllib
  17
  18 from .utils import *
  19 from .extractor.common import InfoExtractor, SearchInfoExtractor
  20
  21 from .extractor.ard import ARDIE
  22 from .extractor.arte import ArteTvIE
  23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
  24 from .extractor.comedycentral import ComedyCentralIE
  25 from .extractor.dailymotion import DailymotionIE
  26 from .extractor.depositfiles import DepositFilesIE
  27 from .extractor.escapist import EscapistIE
  28 from .extractor.facebook import FacebookIE
  29 from .extractor.gametrailers import GametrailersIE
  30 from .extractor.generic import GenericIE
  31 from .extractor.googleplus import GooglePlusIE
  32 from .extractor.googlesearch import GoogleSearchIE
  33 from .extractor.metacafe import MetacafeIE
  34 from .extractor.myvideo import MyVideoIE
  35 from .extractor.statigram import StatigramIE
  36 from .extractor.photobucket import PhotobucketIE
  37 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
  38 from .extractor.vimeo import VimeoIE
  39 from .extractor.yahoo import YahooIE, YahooSearchIE
  40 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
  41 from .extractor.zdf import ZDFIE
  42
  43
  44
  45
  46
  47
  48
  49
  50
  51
  52
  53
  54
  55
  56
  57
  58
  59
  60
  61
  62
  63
  64
  65
  66
  67
  68
  69
  70
  71 class CollegeHumorIE(InfoExtractor):
  72     """Information extractor for collegehumor.com"""
  73
  74     _WORKING = False
  75     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
  76     IE_NAME = u'collegehumor'
  77
  78     def report_manifest(self, video_id):
  79         """Report information extraction."""
  80         self.to_screen(u'%s: Downloading XML manifest' % video_id)
  81
  82     def _real_extract(self, url):
  83         mobj = re.match(self._VALID_URL, url)
  84         if mobj is None:
  85             raise ExtractorError(u'Invalid URL: %s' % url)
  86         video_id = mobj.group('videoid')
  87
  88         info = {
  89             'id': video_id,
  90             'uploader': None,
  91             'upload_date': None,
  92         }
  93
  94         self.report_extraction(video_id)
  95         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
  96         try:
  97             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
  98         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  99             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 100
 101         mdoc = xml.etree.ElementTree.fromstring(metaXml)
 102         try:
 103             videoNode = mdoc.findall('./video')[0]
 104             info['description'] = videoNode.findall('./description')[0].text
 105             info['title'] = videoNode.findall('./caption')[0].text
 106             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
 107             manifest_url = videoNode.findall('./file')[0].text
 108         except IndexError:
 109             raise ExtractorError(u'Invalid metadata XML file')
 110
 111         manifest_url += '?hdcore=2.10.3'
 112         self.report_manifest(video_id)
 113         try:
 114             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
 115         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 116             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 117
 118         adoc = xml.etree.ElementTree.fromstring(manifestXml)
 119         try:
 120             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
 121             node_id = media_node.attrib['url']
 122             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
 123         except IndexError as err:
 124             raise ExtractorError(u'Invalid manifest file')
 125
 126         url_pr = compat_urllib_parse_urlparse(manifest_url)
 127         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
 128
 129         info['url'] = url
 130         info['ext'] = 'f4f'
 131         return [info]
 132
 133
 134 class XVideosIE(InfoExtractor):
 135     """Information extractor for xvideos.com"""
 136
 137     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
 138     IE_NAME = u'xvideos'
 139
 140     def _real_extract(self, url):
 141         mobj = re.match(self._VALID_URL, url)
 142         if mobj is None:
 143             raise ExtractorError(u'Invalid URL: %s' % url)
 144         video_id = mobj.group(1)
 145
 146         webpage = self._download_webpage(url, video_id)
 147
 148         self.report_extraction(video_id)
 149
 150         # Extract video URL
 151         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
 152             webpage, u'video URL'))
 153
 154         # Extract title
 155         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
 156             webpage, u'title')
 157
 158         # Extract video thumbnail
 159         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
 160             webpage, u'thumbnail', fatal=False)
 161
 162         info = {
 163             'id': video_id,
 164             'url': video_url,
 165             'uploader': None,
 166             'upload_date': None,
 167             'title': video_title,
 168             'ext': 'flv',
 169             'thumbnail': video_thumbnail,
 170             'description': None,
 171         }
 172
 173         return [info]
 174
 175
 176
 177
 178 class InfoQIE(InfoExtractor):
 179     """Information extractor for infoq.com"""
 180     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
 181
 182     def _real_extract(self, url):
 183         mobj = re.match(self._VALID_URL, url)
 184         if mobj is None:
 185             raise ExtractorError(u'Invalid URL: %s' % url)
 186
 187         webpage = self._download_webpage(url, video_id=url)
 188         self.report_extraction(url)
 189
 190         # Extract video URL
 191         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
 192         if mobj is None:
 193             raise ExtractorError(u'Unable to extract video url')
 194         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
 195         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
 196
 197         # Extract title
 198         video_title = self._search_regex(r'contentTitle = "(.*?)";',
 199             webpage, u'title')
 200
 201         # Extract description
 202         video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
 203             webpage, u'description', fatal=False)
 204
 205         video_filename = video_url.split('/')[-1]
 206         video_id, extension = video_filename.split('.')
 207
 208         info = {
 209             'id': video_id,
 210             'url': video_url,
 211             'uploader': None,
 212             'upload_date': None,
 213             'title': video_title,
 214             'ext': extension, # Extension is always(?) mp4, but seems to be flv
 215             'thumbnail': None,
 216             'description': video_description,
 217         }
 218
 219         return [info]
 220
 221 class MixcloudIE(InfoExtractor):
 222     """Information extractor for www.mixcloud.com"""
 223
 224     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
 225     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
 226     IE_NAME = u'mixcloud'
 227
 228     def report_download_json(self, file_id):
 229         """Report JSON download."""
 230         self.to_screen(u'Downloading json')
 231
 232     def get_urls(self, jsonData, fmt, bitrate='best'):
 233         """Get urls from 'audio_formats' section in json"""
 234         file_url = None
 235         try:
 236             bitrate_list = jsonData[fmt]
 237             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
 238                 bitrate = max(bitrate_list) # select highest
 239
 240             url_list = jsonData[fmt][bitrate]
 241         except TypeError: # we have no bitrate info.
 242             url_list = jsonData[fmt]
 243         return url_list
 244
 245     def check_urls(self, url_list):
 246         """Returns 1st active url from list"""
 247         for url in url_list:
 248             try:
 249                 compat_urllib_request.urlopen(url)
 250                 return url
 251             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 252                 url = None
 253
 254         return None
 255
 256     def _print_formats(self, formats):
 257         print('Available formats:')
 258         for fmt in formats.keys():
 259             for b in formats[fmt]:
 260                 try:
 261                     ext = formats[fmt][b][0]
 262                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
 263                 except TypeError: # we have no bitrate info
 264                     ext = formats[fmt][0]
 265                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
 266                     break
 267
 268     def _real_extract(self, url):
 269         mobj = re.match(self._VALID_URL, url)
 270         if mobj is None:
 271             raise ExtractorError(u'Invalid URL: %s' % url)
 272         # extract uploader & filename from url
 273         uploader = mobj.group(1).decode('utf-8')
 274         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
 275
 276         # construct API request
 277         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
 278         # retrieve .json file with links to files
 279         request = compat_urllib_request.Request(file_url)
 280         try:
 281             self.report_download_json(file_url)
 282             jsonData = compat_urllib_request.urlopen(request).read()
 283         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 284             raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
 285
 286         # parse JSON
 287         json_data = json.loads(jsonData)
 288         player_url = json_data['player_swf_url']
 289         formats = dict(json_data['audio_formats'])
 290
 291         req_format = self._downloader.params.get('format', None)
 292         bitrate = None
 293
 294         if self._downloader.params.get('listformats', None):
 295             self._print_formats(formats)
 296             return
 297
 298         if req_format is None or req_format == 'best':
 299             for format_param in formats.keys():
 300                 url_list = self.get_urls(formats, format_param)
 301                 # check urls
 302                 file_url = self.check_urls(url_list)
 303                 if file_url is not None:
 304                     break # got it!
 305         else:
 306             if req_format not in formats:
 307                 raise ExtractorError(u'Format is not available')
 308
 309             url_list = self.get_urls(formats, req_format)
 310             file_url = self.check_urls(url_list)
 311             format_param = req_format
 312
 313         return [{
 314             'id': file_id.decode('utf-8'),
 315             'url': file_url.decode('utf-8'),
 316             'uploader': uploader.decode('utf-8'),
 317             'upload_date': None,
 318             'title': json_data['name'],
 319             'ext': file_url.split('.')[-1].decode('utf-8'),
 320             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
 321             'thumbnail': json_data['thumbnail_url'],
 322             'description': json_data['description'],
 323             'player_url': player_url.decode('utf-8'),
 324         }]
 325
 326 class StanfordOpenClassroomIE(InfoExtractor):
 327     """Information extractor for Stanford's Open ClassRoom"""
 328
 329     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
 330     IE_NAME = u'stanfordoc'
 331
 332     def _real_extract(self, url):
 333         mobj = re.match(self._VALID_URL, url)
 334         if mobj is None:
 335             raise ExtractorError(u'Invalid URL: %s' % url)
 336
 337         if mobj.group('course') and mobj.group('video'): # A specific video
 338             course = mobj.group('course')
 339             video = mobj.group('video')
 340             info = {
 341                 'id': course + '_' + video,
 342                 'uploader': None,
 343                 'upload_date': None,
 344             }
 345
 346             self.report_extraction(info['id'])
 347             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
 348             xmlUrl = baseUrl + video + '.xml'
 349             try:
 350                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
 351             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 352                 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
 353             mdoc = xml.etree.ElementTree.fromstring(metaXml)
 354             try:
 355                 info['title'] = mdoc.findall('./title')[0].text
 356                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
 357             except IndexError:
 358                 raise ExtractorError(u'Invalid metadata XML file')
 359             info['ext'] = info['url'].rpartition('.')[2]
 360             return [info]
 361         elif mobj.group('course'): # A course page
 362             course = mobj.group('course')
 363             info = {
 364                 'id': course,
 365                 'type': 'playlist',
 366                 'uploader': None,
 367                 'upload_date': None,
 368             }
 369
 370             coursepage = self._download_webpage(url, info['id'],
 371                                         note='Downloading course info page',
 372                                         errnote='Unable to download course info page')
 373
 374             info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
 375
 376             info['description'] = self._html_search_regex('<description>([^<]+)</description>',
 377                 coursepage, u'description', fatal=False)
 378
 379             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
 380             info['list'] = [
 381                 {
 382                     'type': 'reference',
 383                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
 384                 }
 385                     for vpage in links]
 386             results = []
 387             for entry in info['list']:
 388                 assert entry['type'] == 'reference'
 389                 results += self.extract(entry['url'])
 390             return results
 391         else: # Root page
 392             info = {
 393                 'id': 'Stanford OpenClassroom',
 394                 'type': 'playlist',
 395                 'uploader': None,
 396                 'upload_date': None,
 397             }
 398
 399             self.report_download_webpage(info['id'])
 400             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
 401             try:
 402                 rootpage = compat_urllib_request.urlopen(rootURL).read()
 403             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 404                 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
 405
 406             info['title'] = info['id']
 407
 408             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
 409             info['list'] = [
 410                 {
 411                     'type': 'reference',
 412                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
 413                 }
 414                     for cpage in links]
 415
 416             results = []
 417             for entry in info['list']:
 418                 assert entry['type'] == 'reference'
 419                 results += self.extract(entry['url'])
 420             return results
 421
 422 class MTVIE(InfoExtractor):
 423     """Information extractor for MTV.com"""
 424
 425     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
 426     IE_NAME = u'mtv'
 427
 428     def _real_extract(self, url):
 429         mobj = re.match(self._VALID_URL, url)
 430         if mobj is None:
 431             raise ExtractorError(u'Invalid URL: %s' % url)
 432         if not mobj.group('proto'):
 433             url = 'http://' + url
 434         video_id = mobj.group('videoid')
 435
 436         webpage = self._download_webpage(url, video_id)
 437
 438         song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
 439             webpage, u'song name', fatal=False)
 440
 441         video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
 442             webpage, u'title')
 443
 444         mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
 445             webpage, u'mtvn_uri', fatal=False)
 446
 447         content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
 448             webpage, u'content id', fatal=False)
 449
 450         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
 451         self.report_extraction(video_id)
 452         request = compat_urllib_request.Request(videogen_url)
 453         try:
 454             metadataXml = compat_urllib_request.urlopen(request).read()
 455         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 456             raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
 457
 458         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
 459         renditions = mdoc.findall('.//rendition')
 460
 461         # For now, always pick the highest quality.
 462         rendition = renditions[-1]
 463
 464         try:
 465             _,_,ext = rendition.attrib['type'].partition('/')
 466             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
 467             video_url = rendition.find('./src').text
 468         except KeyError:
 469             raise ExtractorError('Invalid rendition field.')
 470
 471         info = {
 472             'id': video_id,
 473             'url': video_url,
 474             'uploader': performer,
 475             'upload_date': None,
 476             'title': video_title,
 477             'ext': ext,
 478             'format': format,
 479         }
 480
 481         return [info]
 482
 483
 484 class YoukuIE(InfoExtractor):
 485     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
 486
 487     def _gen_sid(self):
 488         nowTime = int(time.time() * 1000)
 489         random1 = random.randint(1000,1998)
 490         random2 = random.randint(1000,9999)
 491
 492         return "%d%d%d" %(nowTime,random1,random2)
 493
 494     def _get_file_ID_mix_string(self, seed):
 495         mixed = []
 496         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
 497         seed = float(seed)
 498         for i in range(len(source)):
 499             seed  =  (seed * 211 + 30031 ) % 65536
 500             index  =  math.floor(seed / 65536 * len(source) )
 501             mixed.append(source[int(index)])
 502             source.remove(source[int(index)])
 503         #return ''.join(mixed)
 504         return mixed
 505
 506     def _get_file_id(self, fileId, seed):
 507         mixed = self._get_file_ID_mix_string(seed)
 508         ids = fileId.split('*')
 509         realId = []
 510         for ch in ids:
 511             if ch:
 512                 realId.append(mixed[int(ch)])
 513         return ''.join(realId)
 514
 515     def _real_extract(self, url):
 516         mobj = re.match(self._VALID_URL, url)
 517         if mobj is None:
 518             raise ExtractorError(u'Invalid URL: %s' % url)
 519         video_id = mobj.group('ID')
 520
 521         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
 522
 523         jsondata = self._download_webpage(info_url, video_id)
 524
 525         self.report_extraction(video_id)
 526         try:
 527             config = json.loads(jsondata)
 528
 529             video_title =  config['data'][0]['title']
 530             seed = config['data'][0]['seed']
 531
 532             format = self._downloader.params.get('format', None)
 533             supported_format = list(config['data'][0]['streamfileids'].keys())
 534
 535             if format is None or format == 'best':
 536                 if 'hd2' in supported_format:
 537                     format = 'hd2'
 538                 else:
 539                     format = 'flv'
 540                 ext = u'flv'
 541             elif format == 'worst':
 542                 format = 'mp4'
 543                 ext = u'mp4'
 544             else:
 545                 format = 'flv'
 546                 ext = u'flv'
 547
 548
 549             fileid = config['data'][0]['streamfileids'][format]
 550             keys = [s['k'] for s in config['data'][0]['segs'][format]]
 551         except (UnicodeDecodeError, ValueError, KeyError):
 552             raise ExtractorError(u'Unable to extract info section')
 553
 554         files_info=[]
 555         sid = self._gen_sid()
 556         fileid = self._get_file_id(fileid, seed)
 557
 558         #column 8,9 of fileid represent the segment number
 559         #fileid[7:9] should be changed
 560         for index, key in enumerate(keys):
 561
 562             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
 563             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
 564
 565             info = {
 566                 'id': '%s_part%02d' % (video_id, index),
 567                 'url': download_url,
 568                 'uploader': None,
 569                 'upload_date': None,
 570                 'title': video_title,
 571                 'ext': ext,
 572             }
 573             files_info.append(info)
 574
 575         return files_info
 576
 577
 578 class XNXXIE(InfoExtractor):
 579     """Information extractor for xnxx.com"""
 580
 581     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
 582     IE_NAME = u'xnxx'
 583     VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
 584     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
 585     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'
 586
 587     def _real_extract(self, url):
 588         mobj = re.match(self._VALID_URL, url)
 589         if mobj is None:
 590             raise ExtractorError(u'Invalid URL: %s' % url)
 591         video_id = mobj.group(1)
 592
 593         # Get webpage content
 594         webpage = self._download_webpage(url, video_id)
 595
 596         video_url = self._search_regex(self.VIDEO_URL_RE,
 597             webpage, u'video URL')
 598         video_url = compat_urllib_parse.unquote(video_url)
 599
 600         video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
 601             webpage, u'title')
 602
 603         video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
 604             webpage, u'thumbnail', fatal=False)
 605
 606         return [{
 607             'id': video_id,
 608             'url': video_url,
 609             'uploader': None,
 610             'upload_date': None,
 611             'title': video_title,
 612             'ext': 'flv',
 613             'thumbnail': video_thumbnail,
 614             'description': None,
 615         }]
 616
 617
 618
 619 class NBAIE(InfoExtractor):
 620     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
 621     IE_NAME = u'nba'
 622
 623     def _real_extract(self, url):
 624         mobj = re.match(self._VALID_URL, url)
 625         if mobj is None:
 626             raise ExtractorError(u'Invalid URL: %s' % url)
 627
 628         video_id = mobj.group(1)
 629
 630         webpage = self._download_webpage(url, video_id)
 631
 632         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
 633
 634         shortened_video_id = video_id.rpartition('/')[2]
 635         title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
 636             webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
 637
 638         # It isn't there in the HTML it returns to us
 639         # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
 640
 641         description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
 642
 643         info = {
 644             'id': shortened_video_id,
 645             'url': video_url,
 646             'ext': 'mp4',
 647             'title': title,
 648             # 'uploader_date': uploader_date,
 649             'description': description,
 650         }
 651         return [info]
 652
 653 class JustinTVIE(InfoExtractor):
 654     """Information extractor for justin.tv and twitch.tv"""
 655     # TODO: One broadcast may be split into multiple videos. The key
 656     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
 657     # starts at 1 and increases. Can we treat all parts as one video?
 658
 659     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
 660         (?:
 661             (?P<channelid>[^/]+)|
 662             (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
 663             (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
 664         )
 665         /?(?:\#.*)?$
 666         """
 667     _JUSTIN_PAGE_LIMIT = 100
 668     IE_NAME = u'justin.tv'
 669
 670     def report_download_page(self, channel, offset):
 671         """Report attempt to download a single page of videos."""
 672         self.to_screen(u'%s: Downloading video information from %d to %d' %
 673                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
 674
 675     # Return count of items, list of *valid* items
 676     def _parse_page(self, url, video_id):
 677         webpage = self._download_webpage(url, video_id,
 678                                          u'Downloading video info JSON',
 679                                          u'unable to download video info JSON')
 680
 681         response = json.loads(webpage)
 682         if type(response) != list:
 683             error_text = response.get('error', 'unknown error')
 684             raise ExtractorError(u'Justin.tv API: %s' % error_text)
 685         info = []
 686         for clip in response:
 687             video_url = clip['video_file_url']
 688             if video_url:
 689                 video_extension = os.path.splitext(video_url)[1][1:]
 690                 video_date = re.sub('-', '', clip['start_time'][:10])
 691                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
 692                 video_id = clip['id']
 693                 video_title = clip.get('title', video_id)
 694                 info.append({
 695                     'id': video_id,
 696                     'url': video_url,
 697                     'title': video_title,
 698                     'uploader': clip.get('channel_name', video_uploader_id),
 699                     'uploader_id': video_uploader_id,
 700                     'upload_date': video_date,
 701                     'ext': video_extension,
 702                 })
 703         return (len(response), info)
 704
 705     def _real_extract(self, url):
 706         mobj = re.match(self._VALID_URL, url)
 707         if mobj is None:
 708             raise ExtractorError(u'invalid URL: %s' % url)
 709
 710         api_base = 'http://api.justin.tv'
 711         paged = False
 712         if mobj.group('channelid'):
 713             paged = True
 714             video_id = mobj.group('channelid')
 715             api = api_base + '/channel/archives/%s.json' % video_id
 716         elif mobj.group('chapterid'):
 717             chapter_id = mobj.group('chapterid')
 718
 719             webpage = self._download_webpage(url, chapter_id)
 720             m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
 721             if not m:
 722                 raise ExtractorError(u'Cannot find archive of a chapter')
 723             archive_id = m.group(1)
 724
 725             api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
 726             chapter_info_xml = self._download_webpage(api, chapter_id,
 727                                              note=u'Downloading chapter information',
 728                                              errnote=u'Chapter information download failed')
 729             doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
 730             for a in doc.findall('.//archive'):
 731                 if archive_id == a.find('./id').text:
 732                     break
 733             else:
 734                 raise ExtractorError(u'Could not find chapter in chapter information')
 735
 736             video_url = a.find('./video_file_url').text
 737             video_ext = video_url.rpartition('.')[2] or u'flv'
 738
 739             chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
 740             chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
 741                                    note='Downloading chapter metadata',
 742                                    errnote='Download of chapter metadata failed')
 743             chapter_info = json.loads(chapter_info_json)
 744
 745             bracket_start = int(doc.find('.//bracket_start').text)
 746             bracket_end = int(doc.find('.//bracket_end').text)
 747
 748             # TODO determine start (and probably fix up file)
 749             #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
 750             #video_url += u'?start=' + TODO:start_timestamp
 751             # bracket_start is 13290, but we want 51670615
 752             self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
 753                                             u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
 754
 755             info = {
 756                 'id': u'c' + chapter_id,
 757                 'url': video_url,
 758                 'ext': video_ext,
 759                 'title': chapter_info['title'],
 760                 'thumbnail': chapter_info['preview'],
 761                 'description': chapter_info['description'],
 762                 'uploader': chapter_info['channel']['display_name'],
 763                 'uploader_id': chapter_info['channel']['name'],
 764             }
 765             return [info]
 766         else:
 767             video_id = mobj.group('videoid')
 768             api = api_base + '/broadcast/by_archive/%s.json' % video_id
 769
 770         self.report_extraction(video_id)
 771
 772         info = []
 773         offset = 0
 774         limit = self._JUSTIN_PAGE_LIMIT
 775         while True:
 776             if paged:
 777                 self.report_download_page(video_id, offset)
 778             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
 779             page_count, page_info = self._parse_page(page_url, video_id)
 780             info.extend(page_info)
 781             if not paged or page_count != limit:
 782                 break
 783             offset += limit
 784         return info
 785
 786 class FunnyOrDieIE(InfoExtractor):
 787     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
 788
 789     def _real_extract(self, url):
 790         mobj = re.match(self._VALID_URL, url)
 791         if mobj is None:
 792             raise ExtractorError(u'invalid URL: %s' % url)
 793
 794         video_id = mobj.group('id')
 795         webpage = self._download_webpage(url, video_id)
 796
 797         video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
 798             webpage, u'video URL', flags=re.DOTALL)
 799
 800         title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
 801             r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
 802
 803         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
 804             webpage, u'description', fatal=False, flags=re.DOTALL)
 805
 806         info = {
 807             'id': video_id,
 808             'url': video_url,
 809             'ext': 'mp4',
 810             'title': title,
 811             'description': video_description,
 812         }
 813         return [info]
 814
 815 class SteamIE(InfoExtractor):
 816     _VALID_URL = r"""http://store\.steampowered\.com/
 817                 (agecheck/)?
 818                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
 819                 (?P<gameID>\d+)/?
 820                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
 821                 """
 822     _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
 823     _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
 824
 825     @classmethod
 826     def suitable(cls, url):
 827         """Receives a URL and returns True if suitable for this IE."""
 828         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 829
 830     def _real_extract(self, url):
 831         m = re.match(self._VALID_URL, url, re.VERBOSE)
 832         gameID = m.group('gameID')
 833
 834         videourl = self._VIDEO_PAGE_TEMPLATE % gameID
 835         webpage = self._download_webpage(videourl, gameID)
 836
 837         if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
 838             videourl = self._AGECHECK_TEMPLATE % gameID
 839             self.report_age_confirmation()
 840             webpage = self._download_webpage(videourl, gameID)
 841
 842         self.report_extraction(gameID)
 843         game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
 844                                              webpage, 'game title')
 845
 846         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
 847         mweb = re.finditer(urlRE, webpage)
 848         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
 849         titles = re.finditer(namesRE, webpage)
 850         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
 851         thumbs = re.finditer(thumbsRE, webpage)
 852         videos = []
 853         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
 854             video_id = vid.group('videoID')
 855             title = vtitle.group('videoName')
 856             video_url = vid.group('videoURL')
 857             video_thumb = thumb.group('thumbnail')
 858             if not video_url:
 859                 raise ExtractorError(u'Cannot find video url for %s' % video_id)
 860             info = {
 861                 'id':video_id,
 862                 'url':video_url,
 863                 'ext': 'flv',
 864                 'title': unescapeHTML(title),
 865                 'thumbnail': video_thumb
 866                   }
 867             videos.append(info)
 868         return [self.playlist_result(videos, gameID, game_title)]
 869
 870 class UstreamIE(InfoExtractor):
 871     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
 872     IE_NAME = u'ustream'
 873
 874     def _real_extract(self, url):
 875         m = re.match(self._VALID_URL, url)
 876         video_id = m.group('videoID')
 877
 878         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
 879         webpage = self._download_webpage(url, video_id)
 880
 881         self.report_extraction(video_id)
 882
 883         video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
 884             webpage, u'title')
 885
 886         uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
 887             webpage, u'uploader', fatal=False, flags=re.DOTALL)
 888
 889         thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
 890             webpage, u'thumbnail', fatal=False)
 891
 892         info = {
 893                 'id': video_id,
 894                 'url': video_url,
 895                 'ext': 'flv',
 896                 'title': video_title,
 897                 'uploader': uploader,
 898                 'thumbnail': thumbnail,
 899                }
 900         return info
 901
 902 class WorldStarHipHopIE(InfoExtractor):
 903     _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
 904     IE_NAME = u'WorldStarHipHop'
 905
 906     def _real_extract(self, url):
 907         m = re.match(self._VALID_URL, url)
 908         video_id = m.group('id')
 909
 910         webpage_src = self._download_webpage(url, video_id)
 911
 912         video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
 913             webpage_src, u'video URL')
 914
 915         if 'mp4' in video_url:
 916             ext = 'mp4'
 917         else:
 918             ext = 'flv'
 919
 920         video_title = self._html_search_regex(r"<title>(.*)</title>",
 921             webpage_src, u'title')
 922
 923         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
 924         thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
 925             webpage_src, u'thumbnail', fatal=False)
 926
 927         if not thumbnail:
 928             _title = r"""candytitles.*>(.*)</span>"""
 929             mobj = re.search(_title, webpage_src)
 930             if mobj is not None:
 931                 video_title = mobj.group(1)
 932
 933         results = [{
 934                     'id': video_id,
 935                     'url' : video_url,
 936                     'title' : video_title,
 937                     'thumbnail' : thumbnail,
 938                     'ext' : ext,
 939                     }]
 940         return results
 941
 942 class RBMARadioIE(InfoExtractor):
 943     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 944
 945     def _real_extract(self, url):
 946         m = re.match(self._VALID_URL, url)
 947         video_id = m.group('videoID')
 948
 949         webpage = self._download_webpage(url, video_id)
 950
 951         json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
 952             webpage, u'json data', flags=re.MULTILINE)
 953
 954         try:
 955             data = json.loads(json_data)
 956         except ValueError as e:
 957             raise ExtractorError(u'Invalid JSON: ' + str(e))
 958
 959         video_url = data['akamai_url'] + '&cbr=256'
 960         url_parts = compat_urllib_parse_urlparse(video_url)
 961         video_ext = url_parts.path.rpartition('.')[2]
 962         info = {
 963                 'id': video_id,
 964                 'url': video_url,
 965                 'ext': video_ext,
 966                 'title': data['title'],
 967                 'description': data.get('teaser_text'),
 968                 'location': data.get('country_of_origin'),
 969                 'uploader': data.get('host', {}).get('name'),
 970                 'uploader_id': data.get('host', {}).get('slug'),
 971                 'thumbnail': data.get('image', {}).get('large_url_2x'),
 972                 'duration': data.get('duration'),
 973         }
 974         return [info]
 975
 976
 977 class YouPornIE(InfoExtractor):
 978     """Information extractor for youporn.com."""
 979     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
 980
 981     def _print_formats(self, formats):
 982         """Print all available formats"""
 983         print(u'Available formats:')
 984         print(u'ext\t\tformat')
 985         print(u'---------------------------------')
 986         for format in formats:
 987             print(u'%s\t\t%s'  % (format['ext'], format['format']))
 988
 989     def _specific(self, req_format, formats):
 990         for x in formats:
 991             if(x["format"]==req_format):
 992                 return x
 993         return None
 994
 995     def _real_extract(self, url):
 996         mobj = re.match(self._VALID_URL, url)
 997         if mobj is None:
 998             raise ExtractorError(u'Invalid URL: %s' % url)
 999         video_id = mobj.group('videoid')
1000
1001         req = compat_urllib_request.Request(url)
1002         req.add_header('Cookie', 'age_verified=1')
1003         webpage = self._download_webpage(req, video_id)
1004
1005         # Get JSON parameters
1006         json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
1007         try:
1008             params = json.loads(json_params)
1009         except:
1010             raise ExtractorError(u'Invalid JSON')
1011
1012         self.report_extraction(video_id)
1013         try:
1014             video_title = params['title']
1015             upload_date = unified_strdate(params['release_date_f'])
1016             video_description = params['description']
1017             video_uploader = params['submitted_by']
1018             thumbnail = params['thumbnails'][0]['image']
1019         except KeyError:
1020             raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
1021
1022         # Get all of the formats available
1023         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
1024         download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
1025             webpage, u'download list').strip()
1026
1027         # Get all of the links from the page
1028         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
1029         links = re.findall(LINK_RE, download_list_html)
1030         if(len(links) == 0):
1031             raise ExtractorError(u'ERROR: no known formats available for video')
1032
1033         self.to_screen(u'Links found: %d' % len(links))
1034
1035         formats = []
1036         for link in links:
1037
1038             # A link looks like this:
1039             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
1040             # A path looks like this:
1041             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
1042             video_url = unescapeHTML( link )
1043             path = compat_urllib_parse_urlparse( video_url ).path
1044             extension = os.path.splitext( path )[1][1:]
1045             format = path.split('/')[4].split('_')[:2]
1046             size = format[0]
1047             bitrate = format[1]
1048             format = "-".join( format )
1049             # title = u'%s-%s-%s' % (video_title, size, bitrate)
1050
1051             formats.append({
1052                 'id': video_id,
1053                 'url': video_url,
1054                 'uploader': video_uploader,
1055                 'upload_date': upload_date,
1056                 'title': video_title,
1057                 'ext': extension,
1058                 'format': format,
1059                 'thumbnail': thumbnail,
1060                 'description': video_description
1061             })
1062
1063         if self._downloader.params.get('listformats', None):
1064             self._print_formats(formats)
1065             return
1066
1067         req_format = self._downloader.params.get('format', None)
1068         self.to_screen(u'Format: %s' % req_format)
1069
1070         if req_format is None or req_format == 'best':
1071             return [formats[0]]
1072         elif req_format == 'worst':
1073             return [formats[-1]]
1074         elif req_format in ('-1', 'all'):
1075             return formats
1076         else:
1077             format = self._specific( req_format, formats )
1078             if result is None:
1079                 raise ExtractorError(u'Requested format not available')
1080             return [format]
1081
1082
1083
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Scrape the video page and return ``[info_dict]``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the id and the title come straight from the URL.
        video_id = url_match.group('videoid')
        title_from_url = url_match.group('title')

        page = self._download_webpage(url, video_id)

        # The page embeds the flv address URL-quoted, so unquote it.
        quoted_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            page, u'video url')
        media_url = compat_urllib_parse.unquote(quoted_url)

        # The upload date lookup is non-fatal; normalise it only when the
        # lookup produced something.
        added_on = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            page, u'upload date', fatal=False)
        if added_on:
            added_on = unified_strdate(added_on)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': added_on,
            'title': title_from_url,
            'ext': 'flv',
            'format': 'flv',
        }]
1118
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the video page to its embed page and return ``[info_dict]``.

        Raises ExtractorError if ``url`` is invalid or the page contains no
        link to an embed page.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('videoid')

        # First page: supplies the title and the link to the embed page.
        landing_page = self._download_webpage(url, slug)
        title = self._html_search_regex(
            r'<title>(?P<title>.*)</title>', landing_page, u'title').strip()

        embed_match = re.search(
            r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', landing_page)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        # From here on the id is the numeric one found in the embed link,
        # replacing the slug taken from the original URL.
        embed_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        # Second page: the embed player, which names the media file.
        embed_page = self._download_webpage(embed_url, video_id)
        media_url = self._search_regex(
            r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
1159
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk through a mix track by track and return the list of tracks.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = url_match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(
            r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        mix = json.loads(mix_json)

        # A random number serves as the play-session token in the API URLs.
        session = str(random.randint(0, 1000000000))
        mix_id = mix['id']
        total_tracks = mix['tracks_count']

        api_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        tracks = []
        position = 0
        # Each response describes one track; the loop ends when the API
        # flags the current track as the last one.
        while True:
            position += 1
            response_text = self._download_webpage(api_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(position), total_tracks),
                errnote=u'Failed to download song information')
            response = json.loads(response_text)
            track = response[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': mix['user']['login'],
                'ext': 'm4a',
            })
            if response['set']['at_last_track']:
                break
            # The "next" request names the track that was just received.
            api_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
        return tracks
1200
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return ``[info_dict]`` for the keek at ``url``."""
        keek_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail addresses are built from the id alone; the page
        # is downloaded only for the title and the uploader name.
        media_url = u'http://cdn.keek.com/keek/video/%s' % keek_id
        thumb_url = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % keek_id
        page = self._download_webpage(url, keek_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')

        # The uploader lookup is non-fatal (fatal=False).
        user_name = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': keek_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_url,
            'uploader': user_name,
        }]
1228
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose pattern: must always be matched with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden here because _VALID_URL needs the re.VERBOSE flag.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on the URL."""
        url_match = re.match(self._VALID_URL, url, re.VERBOSE)
        if url_match.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = url_match.group('playlist_id')
        name = url_match.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_re = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_re = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        page = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        item_matches = re.finditer(item_re, page, re.VERBOSE)
        link_matches = re.finditer(link_re, page)

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            page, 'playlist title')

        # Item and link matches are paired positionally, so an entry is only
        # produced while both streams still have matches; the talk address
        # itself comes from the link match.
        entries = []
        for _item, link in zip(item_matches, link_matches):
            talk_url = 'http://www.ted.com%s' % link.group('talk_url')
            entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        talk_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        page = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % talk_name)
        self.report_extraction(talk_name)

        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>', page, 'title')

        details_json = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>', page, 'json data')
        talk_details = json.loads(details_json)

        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            page, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"', page, 'thumbnail')

        # The last entry of 'htmlStreams' is the one that gets downloaded.
        return {
            'id': talk_details['id'],
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
1303
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Fetch the video's metadata XML and return ``[info_dict]``.

        Raises ExtractorError when the metadata has no <url_flv> or no
        <title> element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag):
            # Text of the first direct child named `tag`, or None if absent.
            element = metadata.find(tag)
            if element is None:
                return None
            return element.text

        # Mandatory fields: download URL and title.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # The format label falls back to the file extension when the
        # metadata carries no <format_id> element.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format_label = extension
        else:
            format_label = format_id_el.text

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format_label,
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }]
1357
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Combine the page title with the flash XML data into ``[info_dict]``."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        # File name and duration live in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_text = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        variants = xml.etree.ElementTree.fromstring(xml_text)

        # Only the last child element of the document root is used.
        chosen = variants[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
1389
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # The scheme is optional and may be http or https. The previous
    # "http?://" made the "p" optional instead, so https URLs never matched.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape the view page at ``url`` and return ``[info_dict]``.

        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # The og:title carries a site prefix that is stripped off.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are looked up with fatal=False.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }

        return [info]
1426
1427
1428
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in tumblr.com posts."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Scrape the post page and return ``[info_dict]``.

        Raises ExtractorError when no video source is found on the page.
        """
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Always fetch the canonical /post/ page, whichever form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The page stores quotes as the literal text \x22. The blog name and
        # id come from the URL, so they are escaped before being placed
        # inside the pattern to keep them from acting as regex syntax.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail:
            # Drop the backslashes left over from the escaped markup.
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
1462
class BandcampIE(InfoExtractor):
    """Information extractor for free tracks on bandcamp.com."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the free-download URL of a track and return ``[info_dict]``.

        Raises ExtractorError when the track page offers no free download.
        """
        url_title = re.match(self._VALID_URL, url).group('title')
        track_page = self._download_webpage(url, url_title)

        # Step 1: the track page links to a free download page, if any.
        free_page_match = re.search(r'freeDownloadPage: "(.*?)"', track_page)
        if free_page_match is None:
            raise ExtractorError(u'No free songs found')
        free_page_url = free_page_match.group(1)

        tralbum_match = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                                  track_page, re.MULTILINE|re.DOTALL)
        track_id = tralbum_match.group('id')

        # Step 2: the free download page embeds the track data in JavaScript.
        free_page = self._download_webpage(free_page_url, track_id,
                                           'Downloading free downloads page')
        items_json = re.search(r'items: (.*?),$',
                               free_page, re.MULTILINE).group(1)
        track = json.loads(items_json)[0]

        # We pick mp3-320 for now, until format selection can be easily implemented.
        # Using this URL directly reports that the link has expired, so it is
        # only taken apart for its server, signature and timestamp.
        expired_url = track[u'downloads'][u'mp3-320'][u'url']
        expired_re = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        parts = re.match(expired_re, expired_url)

        # Step 3: rebuild the request the site's own download script makes.
        stat_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            parts.group('server'), parts.group('fsig'), track_id, parts.group('ts'))
        stat_response = self._download_webpage(stat_url, track_id, 'Requesting download url')

        # With a correctly generated .rand field the address would be under
        # the "download_url" key; with the fixed value it is in "retry_url".
        final_url = re.search(r'"retry_url":"(.*?)"', stat_response).group(1)

        return [{
            'id': track_id,
            'title': track[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': track[u'thumb_url'],
            'uploader': track[u'artist'],
        }]
1508
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list describing the mp4 video found at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The link to the file is taken from the mp4 <source> element
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [entry]
1536
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Extract the mp4 url and the title from the mrss notice of a video.

        Returns a list holding a single info dictionary.
        Raises ExtractorError if url does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The metadata is read from the mrss notice, not from the page at url
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # The dots of the host name are escaped so that they only match a literal '.'
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
1563
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Scrape the video page and return its info dictionary in a list."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page address is rebuilt from the id instead of reusing url
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # The two remaining fields are optional (fatal=False)
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        image = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': image,
        }
        return [entry]
1597
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Scrape the page of a vine and return its info dictionary in a list."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # The page address is rebuilt from the id instead of reusing url
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        stream_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # Thumbnail and uploader are optional (fatal=False)
        poster = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        author = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        entry = {
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': poster,
            'uploader': author,
        }
        return [entry]
1631
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Follow the two data documents of a Flickr video down to its stream.

        Returns a list holding a single info dictionary.
        Raises ExtractorError if the stream can't be found in the second document.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/' + uploader_id + '/' + video_id,
            video_id)

        # The secret read from the page is needed by both data requests
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the first document gives the node id of the video
        first_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self',
            video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Step 2: the playlist of that node holds the stream location
        second_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1',
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        app, fullpath = stream.group(1), stream.group(2)
        media_url = app + unescapeHTML(fullpath)

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        image = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': image,
            'uploader_id': uploader_id,
        }
        return [entry]
1680
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list describing the video found at url.

        Raises ExtractorError if url does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # The numeric id is only known once the page has been downloaded,
        # so the title part of the url identifies this first download
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        image = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location comes from a separate xml document
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        entry = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': image,
            'description': description,
        }
        return [entry]
1719
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot of the optional 'www.' prefix is escaped so that it only matches a literal '.'
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the media url and the metadata of a movie page.

        Returns a list holding a single info dictionary.
        Raises ExtractorError if url does not match _VALID_URL or if the
        media url can't be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Without a server the unquoted 'file' value is used as the whole url
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The description is not extracted: can't see it anywhere in the UI

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Year, month and day are concatenated into a YYYYMMDD string
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
1771
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the mp3 url and the metadata of a hypem track.

        Returns a list holding a single info dictionary.
        Raises ExtractorError if url does not match _VALID_URL or if one of
        the JSON documents is invalid or lacks an expected field.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # The cookie set by this page is sent back with the metadata request below
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (ValueError, LookupError, TypeError):
            # Besides malformed JSON, a document with an unexpected structure
            # (missing key, empty track list, wrong container type) ends up here
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The request body has to be a bytes object: Python 3 rejects str data
        request = compat_urllib_request.Request(serve_url, b"" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
            final_url = song_data[u"url"]
        except (ValueError, LookupError, TypeError):
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
1821
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Extract the media url, the title and the thumbnail of a video.

        Returns a list holding a single info dictionary.
        Raises ExtractorError if url does not match _VALID_URL or if the
        info response does not have the expected form.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only contains a javascript redirection
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the page title before the first '/' is kept
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # The POST data has to be a bytes object: Python 3 rejects str data
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is expected to be two '&' separated 'name=value' fields:
        # the media url first, then the thumbnail url
        try:
            (final_url, thumbnail_url) = [field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without '=' or a number of fields other than two
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
1857
1858
def gen_extractors():
    """ Return a list with one instance of each supported extractor.
    The position in the list is significant: the first extractor that
    matches a URL is the one handling it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        # GenericIE stays at the end of the list
        GenericIE,
    )
    # Instantiate in listing order so the resulting list keeps the priorities
    return [extractor_class() for extractor_class in extractor_classes]
1928
def get_info_extractor(ie_name):
    """Look up the info extractor class for ie_name among the module globals.

    The name of the class is ie_name followed by 'IE'; a KeyError is raised
    when this module has no global with that name.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]