Move GoogleSearchIE into its own file
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
25
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.generic import GenericIE
31 from .extractor.metacafe import MetacafeIE
32 from .extractor.statigram import StatigramIE
33 from .extractor.photobucket import PhotobucketIE
34 from .extractor.vimeo import VimeoIE
35 from .extractor.yahoo import YahooIE
36 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
37 from .extractor.zdf import ZDFIE
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the Yahoo video-search JSON endpoint (30 results per
        page) until n entries are collected or the service reports the last
        page, and returns a playlist-typed info dict.
        """
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Skip result snippets that carry no recognizable video
                    # URL (previously this crashed with AttributeError).
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop once enough entries were collected or the service says
            # this was the last page.  (Previously this referenced the loop
            # variable `i`, which is unbound when `results` is empty.)
            if len(res['entries']) >= n or m[u'last'] >= (m[u'total'] - 1):
                break

        return res
86
87
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a playlist of all videos posted by a blip.tv user.

        Resolves the user's numeric id from their profile page, then pages
        through the mobile AJAX episode-list endpoint (_PAGE_SIZE results
        per page) collecting video ids.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Fail with a clear message instead of an AttributeError when
            # the profile page does not expose the numeric user id.
            raise ExtractorError(u'Unable to extract user id for %s' % username)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Unescape first so the duplicate check compares the same
                # form that gets stored (previously the raw match was tested
                # against the unescaped list, letting duplicates through).
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
146
147
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        # The file identifier is the last path component of the URL;
        # rebuild the URL in the English locale so the markup is predictable.
        file_id = url.split('/')[-1]
        url = 'http://depositfiles.com/en/files/' + file_id

        # POST the form data that simulates pressing 'Free download'.
        form_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, form_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Locate the real download URL inside the form action.
        download_mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if download_mobj is None or download_mobj.group(1) is None:
            # No download form: try to surface the site's own explanation
            # (rate limit, premium-only, etc.) before giving up.
            reason_mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if reason_mobj is not None and reason_mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', reason_mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = download_mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # The human-readable title lives in a <b title="..."> attribute.
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
192
193
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook before extraction, if credentials are available.

        Credentials come either from the downloader's 'username'/'password'
        params or, with 'usenetrc', from the user's .netrc file.  All login
        failures are reported as warnings only; extraction then proceeds
        unauthenticated.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials found anywhere: skip the login step entirely.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means we are still on the login
            # page, i.e. authentication did not succeed.
            # NOTE(review): the warning text misspells "exceeded"; kept as-is
            # here since it is a runtime string.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video metadata from a Facebook video page.

        Locates the JSON blob the page hands to its Flash player, prefers the
        HD source over SD, and returns a single-entry info list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two literal JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON holding the actual video descriptors.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD when unavailable.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
288
289
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Used to pull the container extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video info from a blip.tv URL.

        Handles three URL shapes: api.swf fragments and /play/ URLs are first
        normalized (the latter via one recursive call), then the page's JSON
        representation is requested with an iTunes User-Agent.  A response
        served with a video/* Content-Type is treated as a direct download.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the real
            # file name; follow the redirect and re-run extraction on the
            # canonical http://blip.tv/a/a-<id> form.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the same URL for its JSON representation.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the client; pretending
        # to be iTunes yields a directly usable media URL.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Hand the already-open handle to the downloader so the
                    # response body is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' envelope.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                # Derive the container extension from the media URL.
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    # The download must present the same UA the JSON request
                    # used, or the media URL may not resolve.
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
387
388
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """Decrypt/encrypt `data` with the RC4 stream cipher under `key`.

        Standard RC4: a key-scheduling pass over a 256-entry state box,
        followed by XOR-ing the generated keystream with the data.
        """
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        """Return the hex MD5 digest of `s`, encoded to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        """Extract video info from a myvideo.de watch page.

        Two paths: if the page exposes a plain <source src=...> it is used
        directly; otherwise the player's encrypted XML config is fetched and
        RC4-decrypted (key derived from the double-base64 constant GK plus
        the video id) to recover the stream URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Double-base64-encoded site key used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: a direct <source src='...'> tag on the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            # NOTE(review): video_ext is computed but never used below; the
            # returned 'ext' is hard-coded to u'flv'.
            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Split the flashvars into query params; '_encxml' holds the URL of
        # the encrypted player XML.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id)).
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): `video_filepath` is never assigned anywhere in
            # this method, so this branch raises NameError for .f4m files;
            # presumably the path portion of the decrypted source URL was
            # intended here -- TODO confirm and fix.
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
537
538
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, ordered lowest quality last in the dicts below;
    # extraction picks the highest available unless --format overrides it.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate -> display resolution, for --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which matches without flags) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print a human-readable table of available format ids for
        # --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per part of a Daily Show / Colbert episode.

        Flow: normalize shortname/latest-episode URLs (following the site's
        redirect for the newest episode), find the mtvnservices media URI in
        the page, download the MRSS index listing the episode's parts, then
        fetch each part's config XML to choose a bitrate and rewrite the
        RTMP URL into a plain HTTP one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # ':tds' / ':colbert' style abbreviations map to the show's
        # full-episodes index page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty 'episode' group means "download the newest episode":
            # the index page redirects to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we landed on.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists every part (item) of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # Each part has its own config XML listing the renditions
            # (bitrate, src) available for it.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP download from the CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
705
706
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract a video from an escapistmagazine.com view page.

        Reads the page's meta tags for description/thumbnail/player URL,
        then downloads the player's config (JS posing as JSON) to get the
        actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The <meta name="title"> value looks like "Show : Episode"; keep
        # the episode part only.  (Fixed: the field name passed here used to
        # say u'player url', producing a misleading extraction error.)
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
766
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Two-step extraction: a metadata XML document first, then an Adobe f4m
    manifest from which the fragment URL is assembled.
    """

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            # Bug fix: attrib['url'] raises KeyError, not IndexError, so a
            # manifest whose <media> lacks a url attribute previously escaped
            # this handler and crashed instead of raising ExtractorError.
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            raise ExtractorError(u'Invalid manifest file')

        # Rebuild the fragment URL from the manifest host plus the media id.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
828
829
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The FLV location is percent-encoded inside a query parameter.
        flv_url = self._search_regex(r'flv_url=(.+?)&', page, u'video URL')
        video_url = compat_urllib_parse.unquote(flv_url)

        # Title comes from the <title> tag, minus the site suffix.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            page, u'title')

        # Thumbnail is optional; extraction failure is tolerated.
        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
870
871
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug are part of the URL itself.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink into the track's JSON description via the API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
        info = json.loads(info_json)

        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream table and pick the 128kbps MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')
        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
928
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The uploader and the set slug are both encoded in the URL.
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink into its JSON description.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)
        info = json.loads(info_json)

        if 'errors' in info:
            # Report every API error; nothing to extract in that case.
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        for track in info['tracks']:
            video_id = track['id']

            # Per-track stream lookup, same endpoint as the single-track IE.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
991
992
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded inside an inline script.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive the final id and the extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1035
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com

    Downloads the cloudcast JSON description and picks a working URL out of
    its 'audio_formats' table (format -> bitrate -> url list).
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print the available format/bitrate table for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # Bug fix: the regex groups are already text strings; the previous
        # .decode('utf-8') calls crashed on Python 3, where str has no
        # decode method (same for the decodes removed from the dict below).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): if no candidate URL is alive, file_url stays None and
        # the .split below raises TypeError — pre-existing behavior, unchanged.
        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1140
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom

    Handles three URL shapes, recursing from general to specific:
      * course + video  -> one downloadable lecture (metadata XML)
      * course only     -> re-extracts every linked VideoPage
      * site root       -> re-extracts every linked CoursePage
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        # Dispatch on which query parameters the URL carries; returns a
        # list of info dicts in every branch.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-lecture metadata lives in an XML file next to the videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                # <videoFile> holds a path relative to the course video dir.
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect every lecture link, deduplicated in page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recurse: each reference re-enters this extractor as a video URL.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect every course link, then recurse into each course page.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1236
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract a video from an MTV.com video page via the mediaGen XML."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # "mtv_an" names the performing artist; reuse it as the uploader.
        # Bug fix: the info dict below previously referenced an undefined
        # `performer` variable, raising NameError on every extraction.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # NOTE(review): mtvn_uri/content_id can be None (fatal=False), which
        # would make this concatenation raise TypeError — unchanged behavior.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1297
1298
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its file ids: the API returns a '*'-separated index
    string plus a numeric seed, and the client must rebuild the real id
    from a seed-shuffled alphabet (see _get_file_id). Videos are served
    as numbered segments, each requiring its own key.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds plus two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle `source` with a linear-congruential
        # generator keyed on `seed`: each step picks one remaining character.
        # The exact constants and float arithmetic must match the site's
        # player, so do not "simplify" this.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Translate the '*'-separated index string into characters of the
        # seed-shuffled alphabet to recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format onto Youku's stream names:
            # 'hd2' when available for best, 'mp4' for worst, 'flv' otherwise.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            # One key per segment of the chosen stream.
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1391
1392
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        # Fetch the page; everything we need is inline in the HTML.
        webpage = self._download_webpage(url, video_id)

        # The FLV URL is percent-encoded in the page source.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        # Thumbnail is best-effort only.
        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
1431
1432
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url, video_id = mobj.group(0), mobj.group(1)

        video_extension = 'flv'

        # Step 1: fetch the post page, which carries all the metadata.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Upload date, normalized to YYYYMMDD for use in filenames.
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # The first line of the post description serves as the title.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: follow the photo-viewer page that hosts the actual video.
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect every (resolution, url) pair offered on the page.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sorting puts the highest resolution last; keep only its URL.
        video_url = sorted(links)[-1][-1]

        # Unescape \uXXXX sequences embedded in the URL.
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
1506
class NBAIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Extract the direct CDN MP4 URL plus title/description for an NBA.com video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        # The video file lives on Turner's CDN at a path derived from the URL.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Use only the last path component as the id in the result.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
1540
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a channel page, an archived broadcast (/b/<id>), or a
    # highlight chapter (/c/<id>) on either domain.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100  # items requested per API page
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert its clips to info dicts.

        Returns a tuple (total items on page, list of info dicts);
        clips without a 'video_file_url' are silently skipped, so the
        list may be shorter than the count.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals errors with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; strip dashes to get YYYYMMDD.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL type: channel archive (paged), chapter, or single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through the archives endpoint below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the id of the archive it belongs to.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element matching archive_id; the for-else
            # raises only when no element matched (loop ran to completion).
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the newer kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast; fetched via the same paging loop
            # below, which exits after the first page since paged is False.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A page shorter than the limit means the archive is exhausted.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
1673
class FunnyOrDieIE(InfoExtractor):
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the MP4 source URL, title and description from a Funny or Die page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the <title> tag.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
1702
class SteamIE(InfoExtractor):
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so the default suitable()
        # (which compiles without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Collect every trailer listed on a Steam store game page as a playlist."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated games show a birth-date form; refetch through the
        # pre-filled agecheck URL to get the real page.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # Walk the three match iterators in lockstep: one movie entry,
        # one title span and one thumbnail per video.
        videos = []
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
1757
class UstreamIE(InfoExtractor):
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN video URL and scrape title/uploader/thumbnail from the page."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos live at a predictable CDN path keyed by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
1789
class WorldStarHipHopIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the flash player's file URL plus title and thumbnail."""
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is inferred from the stream URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
1829
class RBMARadioIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Pull show metadata out of the page's embedded `gon.show` JSON blob."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append the 256 kbps bitrate parameter to the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
1863
1864
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com.

    Bug fixes vs. the previous revision:
    - the final format check tested the undefined name `result`, raising
      NameError instead of the intended "format not available" error;
    - a bare `except:` around json.loads also swallowed SystemExit and
      KeyboardInterrupt;
    - the KeyError message concatenated a str with an exception object,
      which raises TypeError.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site gates content behind an age check; this cookie bypasses it.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare except: only a JSON decode failure
            # should be reported as invalid JSON.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as e:
            # str(e) avoids the TypeError from concatenating the exception
            # object itself onto the message string.
            raise ExtractorError('Missing JSON parameter: ' + str(e))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes size and bitrate, e.g.
            # "480p_370k_8004515" -> format "480p-370k".
            format = "-".join( path.split('/')[4].split('_')[:2] )

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Previously tested the undefined name `result`, which raised
            # NameError whenever a specific format was requested.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
1969
1970
1971
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV URL from the player config; title comes from the URL itself."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The stream URL is percent-encoded inside the player config.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date is optional; normalize it to YYYYMMDD when present.
        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
2006
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the watch page to its embed page, where the stream URL lives."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The watch page carries only the title; the stream URL is on a
        # separate embed page referenced from it.
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # Switch to the numeric id used by the embed page.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
2047
class EightTracksIE(InfoExtractor):
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Walk the 8tracks play API, collecting every track of the mix in order."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API wants a client-chosen session token.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Each response carries one track plus a flag telling us when the
        # mix is exhausted; the next request references the previous track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
2088
class KeekIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN video/thumbnail URLs and scrape title and uploader."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Video and thumbnail live at predictable CDN paths keyed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2116
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so matching needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on which alternative of _VALID_URL matched:
        # a single talk or a playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each entry delegates back to this extractor ('TED') via url_result,
        # so the talks are extracted individually later.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE(review): uses the *last* entry of htmlStreams — presumably
        # the preferred/highest-quality stream; verify against the site.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2191
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata endpoint.

    Bug fix: the format fallback previously referenced the undefined name
    ``ext`` (NameError); it now falls back to the file extension.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory,
        # everything else is optional
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the container extension when no format id is
            # present (previously referenced the undefined name `ext`).
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }]
2245
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # A companion XML document lists the available streams; the last
        # entry is picked, as in the original implementation.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2277
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL')

        # Strip the site name the page prepends to every title
        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"', webpage, u'title')
        title = title.replace('LiveLeak.com -', '').strip()

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)
        uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader
        }]
2314
2315
2316
class TumblrIE(InfoExtractor):
    """Extractor for videos embedded in tumblr posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id, blog = m_url.group('id'), m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url, ext = video.group('video_url'), video.group('ext')

        # We pick the first poster as the thumbnail
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>', webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
2350
class BandcampIE(InfoExtractor):
    """Information Extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id`, which shadowed the builtin of the same name
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': video_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
2396
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
2424         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # The MRSS feed carries both the media URL and the title
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
        }]
2451
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)
        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta tags may use either double or single quotes
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
2485
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>', webpage, u'uploader',
            fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
2519
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        # First request: resolve the internal node id for this video
        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Second request: the playlist XML carries the stream location
        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
2568
class TeamcocoIE(InfoExtractor):
    """Extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric video id only appears inside the page markup
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')
        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the actual stream URLs
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
2607
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty server means `file` already carries a full (quoted) URL
        if mobj.group('server'):
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        else:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj is None:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')
        else:
            # YYYYMMDD, assembled from the date components of the hint
            video_upload_date = ''.join(mobj.group('upload_date_Y', 'upload_date_m', 'upload_date_d'))

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
2659
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # The first request sets a cookie that the later serve request needs
        data_encoded = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(url + "?" + data_encoded)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
2709
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page only redirects (via javascript) to the real page
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        webpage = self._download_webpage(
            urlh.geturl() + new_location, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>', webpage, u'title')
        title = title.split('/')[0].strip()

        # The magare.do POST answers with two '&'-separated key=value pairs:
        # the media url first, then the thumbnail url
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
2745
2746
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GoogleSearchIE was moved into its own module and is no longer imported
    # at module level; import it locally so this list keeps resolving.
    # NOTE(review): confirm the module path matches the new file's location.
    from .extractor.googlesearch import GoogleSearchIE
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        GenericIE()
    ]
2816
def get_info_extractor(ie_name):
    """Look up the extractor class named ``<ie_name>IE`` in this module."""
    return globals()['%sIE' % ie_name]