b32bd3d9489cffae3fde197c7471e02eb1e6164b
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
31
32
33
34
35
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # The id and extension are encoded directly in the URL.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('id')
        video_extension = url_match.group('ext')

        # Everything else is scraped from the video webpage.
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Preferred path: a JSON media blob embedded in the page javascript.
        json_match = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if json_match is not None:
            media = json.loads(json_match.group('json'))
            return [{
                'id':       video_id,
                'url':      media[u'downloadUrl'],
                'uploader': media[u'username'],
                'upload_date':  datetime.date.fromtimestamp(media[u'creationDate']).strftime('%Y%m%d'),
                'title':    media[u'title'],
                'ext':      video_extension,
                'thumbnail': media[u'thumbUrl'],
            }]

        # Fallback: scrape the <link rel="video_src"> tag and the page title.
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        title_match = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': title_match.group(2).decode('utf-8'),
            'upload_date':  None,
            'title':    title_match.group(1).decode('utf-8'),
            'ext':      video_extension.decode('utf-8'),
        }]
92
93
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract video info from a screen.yahoo.com URL.

        Two extraction paths exist: when the page exposes a
        Media.CONTENT_ID we query the YQL streams endpoint; otherwise we
        fall back to the older cosmos.bcst.yahoo.com MRSS endpoints.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: query the MRSS metadata endpoint.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed date is mm/dd/YYYY; normalise to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # Fixed: test the match BEFORE dereferencing it. The original
            # called m_rest.group() first, so a failed match raised
            # AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The endpoint answers with a JSONP wrapper; strip the callback.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
161
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _verify_video_password(self, url, video_id, webpage):
        """Submit the user-supplied password for a protected video.

        Raises ExtractorError when no password was given on the command
        line, or (via _download_webpage) when the server rejects it.
        """
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            pass_url = url.replace('https','http')
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        # Response body is not needed; only a successful request matters.
        # (The original bound it to an unused local.)
        self._download_webpage(password_request, video_id,
                               u'Verifying the password',
                               u'Wrong password')

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page javascript.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Fixed: was a bare "except:", which also swallowed SystemExit and
            # KeyboardInterrupt. Only ordinary failures (IndexError from the
            # split, ValueError from json) should reach the fallbacks below.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYYMMDD) from the dateCreated meta tag.
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best available quality bucket; the for/else raises when
        # none of them contains a usable entry.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
286
287
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # Matches the French/German video listing pages.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # A trailing "index-<digits>.html" path component denotes a live-stream page.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return its raw body, mapping network and
        URL errors to ExtractorError."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and return a dict built from matchTuples.

        matchTuples is a list of (group_index, key, error_message); a
        non-matching page or an empty group raises ExtractorError with
        the corresponding message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Scrape chain: listing page -> videothek js -> rtmp path/player/url.
        # The language code sits four path components from the end of the URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): video_url is computed but never returned, so live
        # streams currently produce no download information (see
        # _real_extract, which returns nothing on this path) — confirm
        # whether this is intentional.

    def extractPlus7Stream(self, url):
        """Resolve a "+7" catch-up video through its chain of reference
        documents and return the final info dict."""
        # The language code sits three path components from the end here.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        # Final document carries id, title, date and the HD stream url.
        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # Dispatch on URL shape: live streams vs regular "+7" videos.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live path yields nothing (see NOTE in extractLiveStream).
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
407
408
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches any URL: this extractor must be tried after all others.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn that every specialised extractor fell through
        # (suppressed when running the test suite).
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unencoded spaces in Location headers.
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying as GET.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        # Resolve URL-shortener redirects first and delegate if one is found.
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
548
549
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        collected_ids = []
        page_index = 0
        limit = n

        # Pull 50-result pages from the GData API until we have enough ids
        # or the reported total caps us below n.
        while 50 * page_index < limit:
            self.report_download_page(query, page_index + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * page_index + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend(video['id'] for video in api_response['items'])

            limit = min(n, api_response['totalItems'])
            page_index += 1

        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                  for vid in collected_ids[:n]]
        return self.playlist_result(videos, query)
591
592
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Scrapes the Google Video result pages (10 hits per page) until n
        entries are collected or no further page is advertised.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)
                # Fixed: stop as soon as n results are collected; the
                # original appended the whole page and could return up to
                # a full page more than requested (YoutubeSearchIE above
                # truncates to n — keep the two consistent).
                if len(res['entries']) >= n:
                    return res

            # No "next page" link means the result set is exhausted.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
623
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the JSON search endpoint (30 hits per page) until n
        entries are collected or the service reports the last page.
        """

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Fixed: skip unparseable results instead of crashing
                    # with AttributeError on mobj.group below.
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Fixed: the original tested "pagenum * 30 + i >= n" here, which
            # raised NameError when a page contained no results at all (the
            # loop variable i was never bound).
            if len(res['entries']) >= n or m[u'last'] >= m[u'total'] - 1:
                break

        return res
657
658
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # The username is the single capturing group of _VALID_URL.
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = match.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric users_id is scraped from the user's page.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)


        # Fetch video ids page by page via the mobile AJAX endpoint. The
        # result size per query is capped (currently at 12 videos), so we
        # keep requesting pages until a short page signals the end.

        video_ids = []
        pagenum = 1

        while True:
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(page_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Collect the distinct video identifiers on this page.
            ids_in_page = []
            for link in re.finditer(r'href="/([^"]+)"', page):
                if link.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(link.group(1)))

            video_ids.extend(ids_in_page)

            # A page shorter than _PAGE_SIZE must be the last one, so we
            # can stop without issuing another request.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                   for video_id in video_ids]
        return [self.playlist_result(entries, playlist_title = username)]
717
718
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    Simulates pressing the 'Free download' button and scrapes the real
    file URL (or the restriction message explaining why it is blocked).
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale so the regexes below match
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # raw string: '\s' was an invalid escape sequence before
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
763
764
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with credentials from the options or .netrc, if provided.

        Logging in is best-effort: on any failure a warning is emitted
        and extraction proceeds anonymously.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            # No credentials available - proceed without logging in
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # If the response still contains the login form, authentication
            # did not succeed
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                # BUG FIX: 'exceded' -> 'exceeded' in the warning text
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video URL (HD preferred), title, duration, thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player configuration is embedded as JSON between these two
        # literal JavaScript snippets
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
859
860
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Used to pull the file extension off the media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        # Extract video info either from blip.tv's JSON API or, when the
        # server serves the media file directly, from the URL itself.
        # Flash-player and /play/ URLs are first resolved to a canonical
        # http://blip.tv/a/a-<id> URL and re-dispatched recursively.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        # a.blip.tv/api.swf#<id> URLs carry the video id in the fragment;
        # rewrite them to /play/ URLs handled just below
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ pages redirect to a URL whose fragment contains the
            # real file reference; recurse with the canonical URL
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Ask the server for a JSON description of the video
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the user agent
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                # The server returned the media itself, not JSON - build
                # the info dict from the URL alone
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try-block above; the response body
                # is the JSON description requested via skin=json
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp like '05-31-13 08:30AM' -> 'YYYYMMDD'
                # NOTE(review): '%H:%M%p' mixes 24-hour %H with %p; it
                # parses, but %I may have been intended - confirm
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
958
959
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """Decrypt data with the RC4 stream cipher under the given key."""
        # Key-scheduling algorithm (KSA)
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Pseudo-random generation, XORed with the data
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of s, encoded to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Extract a myvideo.de video, decrypting the player XML if needed."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded secret used to derive the RC4 key
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page carries a plain <source> element
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml: the real parameters are hidden in encrypted XML
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data
        # The response is 'key=<hex>'; the hex payload is RC4-encrypted
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # RC4 key = md5(b64decode(b64decode(GK)) + md5(video_id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rsplit: file names may contain additional dots before the
            # extension
            ppath, prefix = video_file.rsplit('.', 1)
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            # BUG FIX: 'video_filepath' was referenced here without ever
            # being assigned (NameError for every f4m video).  The base
            # path is available in the decrypted data as path='...', the
            # same field the non-RTMP branch above reads.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'video path')
            video_filepath = compat_urllib_parse.unquote(video_filepath)
            video_playpath = ''
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
1108
1109
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, lowest to highest quality
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> frame size, informational (used by _print_formats only)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose style, so the inherited
        # suitable() (which compiles without re.VERBOSE) cannot be used
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        """Print each format code with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return a list of info dicts, one per part of the episode."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Shortcut forms (:tds, :colbert, ...) map to the show's
        # full-episodes page, which redirects to the newest episode
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # Follow the server redirect to the newest episode and
            # re-parse its (now specific) URL
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS feed lists one <item> per part of the episode
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # The per-part configuration lists one <rendition> per bitrate
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                # NOTE(review): returns None (not a results list) and skips
                # the remaining parts - presumably intentional for the
                # --list-formats option; confirm callers tolerate None
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP URL into a plain HTTP download URL
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
1276
1277
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video URL from the player's JavaScript config file."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # BUG FIX: the field name passed here was u'player url' (copy-paste
        # from the extraction above); it is used in error/log messages and
        # must describe the field actually being extracted
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Index 1 holds the actual video entry
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
1337
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken: this extractor is known not to work against the site
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        # Resolve an f4f fragment URL via the moogaloop metadata XML and
        # the Adobe HDS (f4m) manifest it points to.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        # Title, description, thumbnail and the manifest location come
        # from the moogaloop player XML
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        # hdcore parameter is expected by HTTP Dynamic Streaming servers
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        # The f4m manifest supplies the media node URL and the real id
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        # Build the URL of the first fragment of the stream
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
1399
1400
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The downloadable FLV location is stored URL-encoded in the page
        flv_url = self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(flv_url)

        # Strip the trailing "- XVID..." suffix off the page title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # The thumbnail is optional; do not abort when it is missing
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
1441
1442
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader name and the track slug are part of the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the API track representation
        page_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + page_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        track = json.loads(info_json)
        track_id = track['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions to obtain the actual mp3 URL
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)

        return [{
            'id':       track['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title':    track['title'],
            'ext':      u'mp3',
            'description': track['description'],
        }]
1499
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader name and the set slug are part of the URL
        uploader, slug_title = mobj.group(1), mobj.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the page URL into the API representation of the whole set
        page_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + page_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        videos = []
        # One stream-definition request per track in the set
        for track in info['tracks']:
            track_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            streams = json.loads(stream_json)

            videos.append({
                'id':       track_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
1562
1563
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is stored base64-encoded in the page javascript
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title lives in a javascript assignment, not in markup
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the URL
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
1606
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        'best' (or an unknown bitrate) selects the highest available
        bitrate; when the format carries no bitrate table at all, the
        bare url list is returned as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # probe the next candidate url
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE: earlier revisions called str.decode('utf-8') on these values,
        # which only works on Python 2 and raises AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe every format until a live url is found
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
1711
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video, a course page, or the
        site root. Course and root pages expand into 'reference' entries
        that are re-extracted recursively via self.extract()."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Raw string: the previous plain-string pattern relied on the
            # invalid escape sequence '\?'
            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Raw string for the same invalid-escape reason as above
            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
1807
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # The mtv_an meta carries the artist name; it is used both as the
        # uploader and to build the "Artist - Song" title. (The previous
        # revision referenced an undefined `performer` variable in the info
        # dict below, which raised NameError on every extraction.)
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')

        if song_name:
            video_title = performer + ' - ' + song_name
        else:
            video_title = performer

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
1868
1869
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its real file ids: the id string from the page
    metadata must be decoded with a per-video numeric seed before the
    per-segment download URLs can be built (see _get_file_id).
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Generate a session id: current time in ms plus two random suffixes."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character alphabet shuffled by Youku's seeded PRNG.

        The linear-congruential update (seed * 211 + 30031) % 65536 mirrors
        the site player's own shuffle, so the resulting ordering matches the
        indices embedded in the '*'-separated file id string.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Each round consumes one character from `source`; the exact
            # arithmetic (including float division) must not be altered.
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the obfuscated fileId: every '*'-separated token is an
        index into the seed-shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Build one info dict per flv/mp4 segment of a Youku video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the requested format onto Youku's stream names:
            # 'best' prefers hd2 when available, 'worst' takes mp4,
            # anything else falls back to flv.
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
1962
1963
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Pull the flv URL, title and thumbnail out of an xnxx video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        # The media URL is percent-encoded inside the flash parameters
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
2002
2003
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the highest-resolution video link from a Google+ post.

        All patterns below are raw strings; the previous plain-string
        patterns relied on invalid escape sequences (backslash before
        'd', ':' and '='), which raise DeprecationWarning on modern Python.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex(r'title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name="Description" content="(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes
        pattern = r'\d+,\d+,(\d+),"(http://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
2077
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group(1)
        webpage = self._download_webpage(url, video_id)

        # The mp4 lives on Turner's CDN at a path derived from the page path
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
2111
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a whole channel, a single archived broadcast (/b/<id>)
    # or a chapter of a broadcast (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when listing a channel's archives through the API.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns (total number of clips in the response, list of info
        dicts for the clips that actually carry a video file URL).  The
        raw count is what the caller uses to decide whether more pages
        remain.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # On failure the API returns a JSON object (with an 'error' key)
        # instead of a list of clips.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped but still counted in
            # the returned total.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time begins with an ISO date; keep YYYYMMDD only.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole channel: page through every archived broadcast.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter of a broadcast: resolve it to its parent archive
            # and return that single file.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element whose id matches the chapter's
            # archive; the for/else raises if none matches.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Title/description/uploader come from the newer Twitch
            # "kraken" API rather than the XML document.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single archived broadcast.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A page shorter than the limit means the archive list ended.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
2244
class FunnyOrDieIE(InfoExtractor):
    """Extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)
        video_id = match.group('id')

        webpage = self._download_webpage(url, video_id)

        # The stream URL sits in the second <source> tag of the player.
        video_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player headline; fall back to the page <title>.
        title = self._html_search_regex(
            (r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
             r'<title>(?P<title>[^<]+?)</title>'),
            webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
2273
class SteamIE(InfoExtractor):
    """Extractor for trailer videos on the Steam store."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains comments, so it must be matched verbose.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        game_id = match.group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        webpage = self._download_webpage(page_url, game_id)

        # Some games sit behind a birth-date form; re-request through the
        # age-check URL, which carries a fixed date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        # Movie entries, display names and thumbnails appear in the same
        # order on the page, so walk the three match streams in lockstep.
        videos = []
        for movie, name, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(name.group('videoName')),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
2328
class UstreamIE(InfoExtractor):
    """Extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The FLV can be fetched straight from the CDN by id; the page is
        # only needed for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
2360
class WorldStarHipHopIE(InfoExtractor):
    """Extractor for worldstarhiphop.com / worldstarcandy.com video pages."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is guessed from the file URL; everything else is flv.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            candy_match = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if candy_match is not None:
                video_title = candy_match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
2400
class RBMARadioIE(InfoExtractor):
    """Extractor for Red Bull Music Academy Radio shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # All show metadata is embedded as a JSON assignment to gon.show.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s variant from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
2434
2435
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' key equals
        req_format, or None if there is no such entry."""
        for x in formats:
            if x["format"] == req_format:
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site requires an age-verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # Narrowed from a bare except: json.loads raises ValueError.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # BUG FIX: the old code concatenated the exception object
            # itself ('...' + sys.exc_info()[1]), raising TypeError and
            # masking the actual missing key.
            raise ExtractorError('Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # The 4th path component encodes "<size>_<bitrate>_<id>".
            size, bitrate = path.split('/')[4].split('_')[:2]
            format = "-".join((size, bitrate))

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific(req_format, formats)
            # BUG FIX: this branch previously tested the undefined name
            # 'result', raising NameError instead of this error message.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
2540
2541
2542
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')
        video_title = match.group('title')

        webpage = self._download_webpage(url, video_id)

        # The FLV URL is percent-encoded inside the player setup code.
        video_url = self._search_regex(r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Upload date, when present, looks like "Added 12/31/2012 by".
        upload_date = self._html_search_regex(r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }]
2577
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual stream lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
2618
class EightTracksIE(InfoExtractor):
    """Extractor for 8tracks.com mixes; yields one entry per track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded in the page as a JS assignment.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # Play sessions are keyed by a random client-chosen id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track of the mix.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
2659
class KeekIE(InfoExtractor):
    """Extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs can be built directly from the id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }]
2687
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Matches either a playlist URL or a single talk URL, optionally with
    # a /lang/<code> segment before the name.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL contains comments, so it must be matched verbose.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch to the talk or playlist handler based on the URL type.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk entries and their title links appear in the same order on
        # the page, so the two finditer streams are zipped together below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            # Hand each talk back to this extractor as a separate URL.
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # NOTE: 'info' is deliberately rebound here from the parsed JSON
        # object to the result dict built from it.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
2762
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the URL path and all metadata comes from
    the site's XML metadata service.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this fallback referenced the undefined name 'ext'
            # (NameError); fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        # description and thumbnail are optional
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
2816
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Stream data lives in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the document.
        chosen = idoc[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
2848
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site-name prefix from the og:title value.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }]
2885
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (ardmediathek.de,
    mediathek.daserste.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline; used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # JavaScript call registering one stream: media type, quality level,
    # an RTMP url (may be empty) and the video url / play path.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract the title and the highest-quality default stream.

        Returns a single-element list with the info dictionary; raises
        ExtractorError when no streams are published (age-restricted
        "fsk" videos that are only available after 8 pm).
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams registered at all: expected only for "fsk"
            # (age-restricted) pages.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: the rtmp url is the connection url, the video url
            # (always "mp4:"-prefixed here) becomes the play path.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
2924
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek (zdf.de)."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # BUG FIX: the list built from re.finditer is never None, so the
        # old `if streams is None` check could not fire; test emptiness.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now.
        # BUG FIX: stream_ used to be left unbound (raising NameError)
        # when neither quality matched; initialise it so the intended
        # ExtractorError below is raised instead.
        stream_ = None
        for s in streams:        # find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:        # find veryhigh - dsl2000mbit (preferred over 300)
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        # The stream url points at an intermediate page that contains the
        # actual mms:// (or rtsp://) media url.
        media_link = self._download_webpage(stream_['video_url'], video_id, 'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Derive the container extension from the media url.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
2982
class TumblrIE(InfoExtractor):
    """Extracts videos from Tumblr post pages."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        blog = match.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        m_video = re.search(re_video, webpage)
        if m_video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = m_video.group('video_url')
        ext = m_video.group('ext')

        # We pick the first poster as the thumbnail.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title; it's not complete,
        # but searching in other places doesn't work for all videos.
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
3016
class BandcampIE(InfoExtractor):
    """Information extractor for freely downloadable Bandcamp tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # FIX: renamed the local from `id` to avoid shadowing the builtin.
        video_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE | re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, video_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url;
        # this url is built by Bandcamp in the download_bundle_*.js script.
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), video_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, video_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': video_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }]
3062
class RedTubeIE(InfoExtractor):
    """Information extractor for redtube.com."""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 <source> tag carries the direct media URL.
        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
3090         
class InaIE(InfoExtractor):
    """Information extractor for Ina.fr."""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Metadata (including the mp4 url) comes from the player's MRSS feed.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }]
3117
class HowcastIE(InfoExtractor):
    """Information extractor for Howcast.com."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Re-fetch via the canonical URL form.
        webpage = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
3151
class VineIE(InfoExtractor):
    """Information extractor for Vine.co."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        # Re-fetch via the canonical https URL form.
        webpage = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3185
class FlickrIE(InfoExtractor):
    """Information extractor for Flickr videos."""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        video_id = match.group('id')
        video_uploader_id = match.group('uploader_id')

        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret authorizes the XML API requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # The playlist XML names the streaming app and the full path;
        # their concatenation is the final media URL.
        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
3234
class TeamcocoIE(InfoExtractor):
    """Extracts videos from teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id lives in the <article> tag, not in the URL.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', webpage, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # A separate XML document lists the media files.
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
3273
class XHamsterIE(InfoExtractor):
    """Information extractor for xHamster."""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        m = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        # An empty server means 'file' already holds a (quoted) full URL;
        # otherwise the URL is composed from server and file key.
        if len(m.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(m.group('file'))
        else:
            video_url = m.group('server') + '/key=' + m.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        m = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if m:
            # Compose a YYYYMMDD upload date from the hint attribute.
            video_upload_date = m.group('upload_date_Y') + m.group('upload_date_m') + m.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
3325
class HypemIE(InfoExtractor):
    """Information extractor for hypem.com tracks."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Fetch with a timestamped query string; keep the cookie the
        # server sets, it is needed for the /serve request below.
        data_encoded = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(url + "?" + data_encoded)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE | re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        # Second request resolves key + id into the final media url.
        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
3375
class Vbox7IE(InfoExtractor):
    """Information extractor for vbox7.com."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page redirects via a JavaScript window.location
        # assignment; follow it manually.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        # POST to the magare.do endpoint to obtain the media and
        # thumbnail URLs.
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # The response is two &-separated key=value pairs: media url
        # first, thumbnail second.
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id': video_id,
            'url': final_url,
            'ext': "flv",
            'title': title,
            'thumbnail': thumbnail_url,
        }]
3411
class GametrailersIE(InfoExtractor):
    """Information extractor for gametrailers.com videos, reviews and
    full episodes."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # BUG FIX: this used to raise the misspelled name `ExtractError`,
        # which itself blew up with a NameError; also the list from
        # finditer is never None, so only emptiness needs checking.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
3462
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    return [
        # More specific YouTube URL forms (playlists, channels, users,
        # search) are listed before YoutubeIE so they get first match.
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVIE(),
        BlipTVUserIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        # The set (playlist) variant precedes the single-track extractor.
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        ZDFIE(),
        TumblrIE(),
        BandcampIE(),
        RedTubeIE(),
        InaIE(),
        HowcastIE(),
        VineIE(),
        FlickrIE(),
        TeamcocoIE(),
        XHamsterIE(),
        HypemIE(),
        Vbox7IE(),
        GametrailersIE(),
        StatigramIE(),
        # GenericIE is the catch-all fallback and must stay last.
        GenericIE()
    ]
3532
def get_info_extractor(ie_name):
    """Return the extractor class named `ie_name` + 'IE' from this module.

    Raises KeyError if no such class exists.
    """
    return globals()['%sIE' % ie_name]