2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.metacafe import MetacafeIE
29 from .extractor.statigram import StatigramIE
30 from .extractor.photobucket import PhotobucketIE
31 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com.

    Two extraction paths exist: an MRSS/regex path keyed on the numeric id
    in the URL, and a YQL/JSON path used when the page embeds a long
    ``CONTENT_ID`` (see ``_real_extract``).
    """
    # Video pages look like http://screen.yahoo.com/<slug>-<numeric id>.html
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        # Presence of a YUI CONTENT_ID selects the alternate JSON path below.
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Scrape title/description/date/thumbnail out of the MRSS response.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize the MM/DD/YYYY date to the YYYYMMDD youtube-dl convention.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')
        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            # YQL query against yahoo.media.video.streams, wrapped in a JSONP callback.
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper before parsing.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
            'id': video_id,
            'url': video_url,
            # rtmp-style play path (the downloader combines it with 'url').
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs, including player/pro/group/album forms.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _verify_video_password(self, url, video_id, webpage):
        """POST the user-supplied --password (with the page's xsrft token) to
        unlock a password-protected video. Raises ExtractorError when no
        password was given."""
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        # The CSRF token is embedded in the page's inline JS.
        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            pass_url = url.replace('https','http')
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        pass_web = self._download_webpage(password_request, video_id,
                                          u'Verifying the password',
                                          u'Wrong password')

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        # Pro and direct-link URLs must be normalized to the canonical page.
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player bootstrap.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Distinguish the known failure modes before giving up.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                # Retry now that the password cookie is set.
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the best quality bucket that is non-empty.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both the live-stream pages (``_LIVE_URL``) and the "Plus 7"
    catch-up pages; extraction is driven by ``grep_webpage``, which fetches a
    page and pulls named groups out of it with a regex.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body (bytes)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return a dict built from
        *matchTuples* — a list of ``(group_index, key, error_message)``.
        Raises ExtractorError when the regex or any group fails to match."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        # Language is encoded in the URL path (…/fr/… or …/de/…).
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): as in the original, the computed URL is not returned —
        # live streams end up not being downloaded.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain page -> videoref XML -> per-language XML and
        return an info dict for the HD stream."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor matches: follows URL-shortener
    redirects, then probes the page for a handful of common video-embedding
    patterns (JW Player flashvars, Twitter cards, Open Graph).
    """

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with only the handlers we need, in order.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline  | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    # GData v2 API, 50 results per page, JSON-C output.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        # 'limit' may shrink below n once the API reports fewer total items.
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    # The "next" pagination link; its absence means the last page was reached.
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        for pagenum in itertools.count(1):
            # 10 results per page; 'start' is the 0-based result offset.
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            # Stop once enough results were collected or no further pages exist.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0):
            # 30 results per page; 'b' is the 0-based result offset.
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            # The endpoint returns JSON: 'm' holds pagination metadata,
            # 'results' a list of HTML snippets.
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            for (i, r) in enumerate(results):
                if (pagenum * 30) +i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results were gathered or the index says this is the last page.
            if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Videos returned per AJAX page by blip.tv (see loop below).
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        # The numeric users_id needed by the AJAX endpoint is embedded in the page.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in before extraction if credentials were supplied.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' machine entry in ~/.netrc. Login failures only emit a
        warning: extraction of public videos can still proceed.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a player whose fragment carries the
            # real file id; resolve it and recurse on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload may or may not be wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Two paths: a plain <source src=...> page, or an RC4-encrypted XML blob
    (the key is derived from a double-base64 constant and the video id).
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self,data, key):
        """Standard RC4: KSA over `key`, then PRGA XOR over `data` (bytes);
        returns a str."""
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self,s):
        # Hex digest as bytes, ready to feed back into the key derivation.
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Double-base64-encoded key material for the RC4 key derivation.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: the page carries a direct <source src=...>.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      video_ext,
            }]

        # Otherwise the stream info lives in an encrypted XML blob whose URL
        # is assembled from the page's flashvars.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Fetch and decrypt: key = md5(b64decode(b64decode(GK)) + md5(video_id)).
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            # rtmp play path: "<ext>:<path-without-ext>"
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): this excerpt does not show where video_filepath is
            # defined; restored from the decrypted data — TODO confirm upstream.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'video path')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
# NOTE(review): numbered listing with elided lines — the code below is NOT
# contiguous; several guard/else/try lines are missing from this view.
# Extractor for Daily Show / Colbert Report episodes and clips. It resolves
# shortname/redirect URLs, locates an mtvnservices media URI in the page,
# downloads an MRSS index, then a per-part mediaGen config listing renditions,
# and finally rewrites the RTMP URL into a plain HTTP one.
1055 class ComedyCentralIE(InfoExtractor):
1056 """Information extractor for The Daily Show and Colbert Report """
1058 # urls can be abbreviations like :thedailyshow or :colbert
1059 # urls for episodes like:
1060 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
1061 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
1062 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: named groups shortname/showname/episode/clipID/cntitle/date/tdstitle.
1063 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
1064 |(https?://)?(www\.)?
1065 (?P<showname>thedailyshow|colbertnation)\.com/
1066 (full-episodes/(?P<episode>.*)|
1068 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
1069 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrate identifiers, lowest-quality last is NOT the case: list is descending,
# but selection below takes turls[-1] (highest bitrate after sorting upstream).
1072 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
1074 _video_extensions = {
1082 _video_dimensions = {
# Override needed because _VALID_URL requires re.VERBOSE, which the default
# suitable() in the base class does not pass.
1092 def suitable(cls, url):
1093 """Receives a URL and returns True if suitable for this IE."""
1094 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1096 def _print_formats(self, formats):
1097 print('Available formats:')
1099 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
1102 def _real_extract(self, url):
1103 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1105 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortname forms (":tds", ":colbert") are rewritten to the newest full episode.
1107 if mobj.group('shortname'):
1108 if mobj.group('shortname') in ('tds', 'thedailyshow'):
1109 url = u'http://www.thedailyshow.com/full-episodes/'
1111 url = u'http://www.colbertnation.com/full-episodes/'
1112 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1113 assert mobj is not None
1115 if mobj.group('clip'):
1116 if mobj.group('showname') == 'thedailyshow':
1117 epTitle = mobj.group('tdstitle')
1119 epTitle = mobj.group('cntitle')
1122 dlNewest = not mobj.group('episode')
1124 epTitle = mobj.group('showname')
1126 epTitle = mobj.group('episode')
1128 self.report_extraction(epTitle)
# Follow redirects so "newest episode" URLs resolve to a concrete episode.
1129 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
1131 url = htmlHandle.geturl()
1132 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1134 raise ExtractorError(u'Invalid redirected URL: ' + url)
1135 if mobj.group('episode') == '':
1136 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
1137 epTitle = mobj.group('episode')
1139 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
1141 if len(mMovieParams) == 0:
1142 # The Colbert Report embeds the information in a without
1143 # a URL prefix; so extract the alternate reference
1144 # and then add the URL prefix manually.
1146 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
1147 if len(altMovieParams) == 0:
1148 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
1150 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
1152 uri = mMovieParams[0][1]
1153 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
1154 indexXml = self._download_webpage(indexUrl, epTitle,
1155 u'Downloading show index',
1156 u'unable to download episode index')
1160 idoc = xml.etree.ElementTree.fromstring(indexXml)
1161 itemEls = idoc.findall('.//item')
# One <item> per episode part; each part becomes its own info dict.
1162 for partNum,itemEl in enumerate(itemEls):
1163 mediaId = itemEl.findall('./guid')[0].text
1164 shortMediaId = mediaId.split(':')[-1]
1165 showId = mediaId.split(':')[-2].replace('.com', '')
1166 officialTitle = itemEl.findall('./title')[0].text
1167 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
1169 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
1170 compat_urllib_parse.urlencode({'uri': mediaId}))
1171 configXml = self._download_webpage(configUrl, epTitle,
1172 u'Downloading configuration for %s' % shortMediaId)
1174 cdoc = xml.etree.ElementTree.fromstring(configXml)
1176 for rendition in cdoc.findall('.//rendition'):
1177 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
1181 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
1184 if self._downloader.params.get('listformats', None):
1185 self._print_formats([i[0] for i in turls])
1188 # For now, just pick the highest bitrate
1189 format,rtmp_video_url = turls[-1]
1191 # Get the format arg from the arg stream
1192 req_format = self._downloader.params.get('format', None)
1194 # Select format if we can find one
1197 format, rtmp_video_url = f, v
# Transform the rtmpe:// CDN URL into a direct HTTP mp4 URL on llnwd.net.
1200 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1202 raise ExtractorError(u'Cannot transform RTMP url')
1203 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1204 video_url = base + m.group('finalid')
1206 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1211 'upload_date': officialDate,
1216 'description': officialTitle,
1218 results.append(info)
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for escapistmagazine.com: scrapes meta tags for description,
# thumbnail, player URL and title, then fetches the player's config "JSON"
# (actually JavaScript with single quotes) to obtain the media URL.
1223 class EscapistIE(InfoExtractor):
1224 """Information extractor for The Escapist """
1226 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
1227 IE_NAME = u'escapist'
1229 def _real_extract(self, url):
1230 mobj = re.match(self._VALID_URL, url)
1232 raise ExtractorError(u'Invalid URL: %s' % url)
1233 showName = mobj.group('showname')
1234 videoId = mobj.group('episode')
1236 self.report_extraction(videoId)
1237 webpage = self._download_webpage(url, videoId)
1239 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
1240 webpage, u'description', fatal=False)
1242 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
1243 webpage, u'thumbnail', fatal=False)
1245 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
1246 webpage, u'player url')
1248 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
# NOTE(review): the search description below says u'player url' but this regex
# extracts the page title — looks like a copy-paste slip in the error label.
1249 webpage, u'player url').split(' : ')[-1]
# The config URL is URL-encoded inside the player URL's query string.
1251 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
1252 configUrl = compat_urllib_parse.unquote(configUrl)
1254 configJSON = self._download_webpage(configUrl, videoId,
1255 u'Downloading configuration',
1256 u'unable to download configuration')
1258 # Technically, it's JavaScript, not JSON
1259 configJSON = configJSON.replace("'", '"')
1262 config = json.loads(configJSON)
1263 except (ValueError,) as err:
1264 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
1266 playlist = config['playlist']
# Index 1, not 0 — presumably entry 0 is an ad or intro; TODO confirm.
1267 videoUrl = playlist[1]['url']
1272 'uploader': showName,
1273 'upload_date': None,
1276 'thumbnail': imgUrl,
1277 'description': videoDesc,
1278 'player_url': playerUrl,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for collegehumor.com: downloads a moogaloop metadata XML for the
# title/description/thumbnail and an f4m manifest to build the final segment URL.
1283 class CollegeHumorIE(InfoExtractor):
1284 """Information extractor for collegehumor.com"""
1287 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
1288 IE_NAME = u'collegehumor'
1290 def report_manifest(self, video_id):
1291 """Report information extraction."""
1292 self.to_screen(u'%s: Downloading XML manifest' % video_id)
1294 def _real_extract(self, url):
1295 mobj = re.match(self._VALID_URL, url)
1297 raise ExtractorError(u'Invalid URL: %s' % url)
1298 video_id = mobj.group('videoid')
1303 'upload_date': None,
1306 self.report_extraction(video_id)
1307 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
1309 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1310 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1311 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1313 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on missing elements; the elided handler
# converts that into the 'Invalid metadata XML file' error below.
1315 videoNode = mdoc.findall('./video')[0]
1316 info['description'] = videoNode.findall('./description')[0].text
1317 info['title'] = videoNode.findall('./caption')[0].text
1318 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
1319 manifest_url = videoNode.findall('./file')[0].text
1321 raise ExtractorError(u'Invalid metadata XML file')
# hdcore parameter is required by the Adobe HDS (f4m) delivery endpoint.
1323 manifest_url += '?hdcore=2.10.3'
1324 self.report_manifest(video_id)
1326 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
1327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1328 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1330 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m manifest uses the Adobe f4m XML namespace; media/@url + id build the path.
1332 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
1333 node_id = media_node.attrib['url']
1334 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
1335 except IndexError as err:
1336 raise ExtractorError(u'Invalid manifest file')
1338 url_pr = compat_urllib_parse_urlparse(manifest_url)
1339 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for xvideos.com: video URL, title and thumbnail are scraped
# directly from the watch page with regexes.
1346 class XVideosIE(InfoExtractor):
1347 """Information extractor for xvideos.com"""
1349 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1350 IE_NAME = u'xvideos'
1352 def _real_extract(self, url):
1353 mobj = re.match(self._VALID_URL, url)
1355 raise ExtractorError(u'Invalid URL: %s' % url)
1356 video_id = mobj.group(1)
1358 webpage = self._download_webpage(url, video_id)
1360 self.report_extraction(video_id)
# flv_url is percent-encoded inside the page's flashvars.
1363 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1364 webpage, u'video URL'))
1367 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1370 # Extract video thumbnail
1371 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1372 webpage, u'thumbnail', fatal=False)
1378 'upload_date': None,
1379 'title': video_title,
1381 'thumbnail': video_thumbnail,
1382 'description': None,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for a single soundcloud.com track: resolves the permalink via the
# public resolve.json API, then asks the streams endpoint for the MP3 URL.
1388 class SoundcloudIE(InfoExtractor):
1389 """Information extractor for soundcloud.com
1390 To access the media, the uid of the song and a stream token
1391 must be extracted from the page source and the script must make
1392 a request to media.soundcloud.com/crossdomain.xml. Then
1393 the media can be grabbed by requesting from an url composed
1394 of the stream token and uid
1397 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1398 IE_NAME = u'soundcloud'
1400 def report_resolve(self, video_id):
1401 """Report information extraction."""
1402 self.to_screen(u'%s: Resolving id' % video_id)
1404 def _real_extract(self, url):
1405 mobj = re.match(self._VALID_URL, url)
1407 raise ExtractorError(u'Invalid URL: %s' % url)
1409 # extract uploader (which is in the url)
1410 uploader = mobj.group(1)
1411 # extract simple title (uploader + slug of song title)
1412 slug_title = mobj.group(2)
1413 simple_title = uploader + u'-' + slug_title
1414 full_title = '%s/%s' % (uploader, slug_title)
1416 self.report_resolve(full_title)
# client_id is a fixed public API key embedded in the extractor.
1418 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1419 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1420 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1422 info = json.loads(info_json)
1423 video_id = info['id']
1424 self.report_extraction(full_title)
1426 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1427 stream_json = self._download_webpage(streams_url, full_title,
1428 u'Downloading stream definitions',
1429 u'unable to download stream definitions')
1431 streams = json.loads(stream_json)
# 128 kbps MP3 is the only stream variant this extractor uses.
1432 mediaURL = streams['http_mp3_128_url']
1433 upload_date = unified_strdate(info['created_at'])
1438 'uploader': info['user']['username'],
1439 'upload_date': upload_date,
1440 'title': info['title'],
1442 'description': info['description'],
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for soundcloud.com sets (playlists). Same resolve + streams flow as
# SoundcloudIE, repeated once per track in the set.
1445 class SoundcloudSetIE(InfoExtractor):
1446 """Information extractor for soundcloud.com sets
1447 To access the media, the uid of the song and a stream token
1448 must be extracted from the page source and the script must make
1449 a request to media.soundcloud.com/crossdomain.xml. Then
1450 the media can be grabbed by requesting from an url composed
1451 of the stream token and uid
1454 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1455 IE_NAME = u'soundcloud:set'
1457 def report_resolve(self, video_id):
1458 """Report information extraction."""
1459 self.to_screen(u'%s: Resolving id' % video_id)
1461 def _real_extract(self, url):
1462 mobj = re.match(self._VALID_URL, url)
1464 raise ExtractorError(u'Invalid URL: %s' % url)
1466 # extract uploader (which is in the url)
1467 uploader = mobj.group(1)
1468 # extract simple title (uploader + slug of song title)
1469 slug_title = mobj.group(2)
1470 simple_title = uploader + u'-' + slug_title
1471 full_title = '%s/sets/%s' % (uploader, slug_title)
1473 self.report_resolve(full_title)
1475 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1476 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1477 info_json = self._download_webpage(resolv_url, full_title)
1480 info = json.loads(info_json)
# API-level errors arrive as an 'errors' list rather than an HTTP failure.
1481 if 'errors' in info:
1482 for err in info['errors']:
1483 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1486 self.report_extraction(full_title)
1487 for track in info['tracks']:
1488 video_id = track['id']
1490 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1491 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1493 self.report_extraction(video_id)
1494 streams = json.loads(stream_json)
1495 mediaURL = streams['http_mp3_128_url']
1500 'uploader': track['user']['username'],
1501 'upload_date': unified_strdate(track['created_at']),
1502 'title': track['title'],
1504 'description': track['description'],
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for infoq.com: the real media id is base64-encoded in a JS variable
# (jsclassref); decoding it yields the RTMP path.
1509 class InfoQIE(InfoExtractor):
1510 """Information extractor for infoq.com"""
1511 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
1513 def _real_extract(self, url):
1514 mobj = re.match(self._VALID_URL, url)
1516 raise ExtractorError(u'Invalid URL: %s' % url)
1518 webpage = self._download_webpage(url, video_id=url)
1519 self.report_extraction(url)
1522 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
1524 raise ExtractorError(u'Unable to extract video url')
# base64-decode then percent-decode to recover the media path.
1525 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
1526 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
1529 video_title = self._search_regex(r'contentTitle = "(.*?)";',
1532 # Extract description
1533 video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
1534 webpage, u'description', fatal=False)
# The filename portion of the RTMP path doubles as id + extension.
1536 video_filename = video_url.split('/')[-1]
1537 video_id, extension = video_filename.split('.')
1543 'upload_date': None,
1544 'title': video_title,
1545 'ext': extension, # Extension is always(?) mp4, but seems to be flv
1547 'description': video_description,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for mixcloud.com (marked _WORKING = False). Fetches the cloudcast
# JSON, walks the 'audio_formats' dict and probes candidate URLs until one
# responds.
1552 class MixcloudIE(InfoExtractor):
1553 """Information extractor for www.mixcloud.com"""
1555 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1556 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1557 IE_NAME = u'mixcloud'
1559 def report_download_json(self, file_id):
1560 """Report JSON download."""
1561 self.to_screen(u'Downloading json')
1563 def get_urls(self, jsonData, fmt, bitrate='best'):
1564 """Get urls from 'audio_formats' section in json"""
1567 bitrate_list = jsonData[fmt]
1568 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
1569 bitrate = max(bitrate_list) # select highest
1571 url_list = jsonData[fmt][bitrate]
1572 except TypeError: # we have no bitrate info.
1573 url_list = jsonData[fmt]
1576 def check_urls(self, url_list):
1577 """Returns 1st active url from list"""
# Probe each candidate with a real request; first that opens wins.
1578 for url in url_list:
1580 compat_urllib_request.urlopen(url)
1582 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1587 def _print_formats(self, formats):
1588 print('Available formats:')
1589 for fmt in formats.keys():
1590 for b in formats[fmt]:
1592 ext = formats[fmt][b][0]
1593 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1594 except TypeError: # we have no bitrate info
1595 ext = formats[fmt][0]
1596 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1599 def _real_extract(self, url):
1600 mobj = re.match(self._VALID_URL, url)
1602 raise ExtractorError(u'Invalid URL: %s' % url)
1603 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a regex group is Python-2-only; this
# raises AttributeError on Python 3 str — consistent with _WORKING = False.
1604 uploader = mobj.group(1).decode('utf-8')
1605 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1607 # construct API request
1608 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1609 # retrieve .json file with links to files
1610 request = compat_urllib_request.Request(file_url)
1612 self.report_download_json(file_url)
1613 jsonData = compat_urllib_request.urlopen(request).read()
1614 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1615 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1618 json_data = json.loads(jsonData)
1619 player_url = json_data['player_swf_url']
1620 formats = dict(json_data['audio_formats'])
1622 req_format = self._downloader.params.get('format', None)
1625 if self._downloader.params.get('listformats', None):
1626 self._print_formats(formats)
# No requested format: scan all formats for the first reachable URL.
1629 if req_format is None or req_format == 'best':
1630 for format_param in formats.keys():
1631 url_list = self.get_urls(formats, format_param)
1633 file_url = self.check_urls(url_list)
1634 if file_url is not None:
1637 if req_format not in formats:
1638 raise ExtractorError(u'Format is not available')
1640 url_list = self.get_urls(formats, req_format)
1641 file_url = self.check_urls(url_list)
1642 format_param = req_format
1645 'id': file_id.decode('utf-8'),
1646 'url': file_url.decode('utf-8'),
1647 'uploader': uploader.decode('utf-8'),
1648 'upload_date': None,
1649 'title': json_data['name'],
1650 'ext': file_url.split('.')[-1].decode('utf-8'),
1651 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1652 'thumbnail': json_data['thumbnail_url'],
1653 'description': json_data['description'],
1654 'player_url': player_url.decode('utf-8'),
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for Stanford Open Classroom. Three modes depending on URL groups:
# a single video (course+video), a course page (course only), or the root page;
# the latter two return lists of 'reference' entries that are re-extracted.
1657 class StanfordOpenClassroomIE(InfoExtractor):
1658 """Information extractor for Stanford's Open ClassRoom"""
1660 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1661 IE_NAME = u'stanfordoc'
1663 def _real_extract(self, url):
1664 mobj = re.match(self._VALID_URL, url)
1666 raise ExtractorError(u'Invalid URL: %s' % url)
1668 if mobj.group('course') and mobj.group('video'): # A specific video
1669 course = mobj.group('course')
1670 video = mobj.group('video')
1672 'id': course + '_' + video,
1674 'upload_date': None,
1677 self.report_extraction(info['id'])
1678 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1679 xmlUrl = baseUrl + video + '.xml'
1681 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1682 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1683 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1684 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1686 info['title'] = mdoc.findall('./title')[0].text
1687 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1689 raise ExtractorError(u'Invalid metadata XML file')
1690 info['ext'] = info['url'].rpartition('.')[2]
1692 elif mobj.group('course'): # A course page
1693 course = mobj.group('course')
1698 'upload_date': None,
1701 coursepage = self._download_webpage(url, info['id'],
1702 note='Downloading course info page',
1703 errnote='Unable to download course info page')
1705 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1707 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1708 coursepage, u'description', fatal=False)
# Collect VideoPage links; orderedSet dedupes while keeping page order.
1710 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1713 'type': 'reference',
1714 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursive extraction: each referenced video page goes back through extract().
1718 for entry in info['list']:
1719 assert entry['type'] == 'reference'
1720 results += self.extract(entry['url'])
1724 'id': 'Stanford OpenClassroom',
1727 'upload_date': None,
1730 self.report_download_webpage(info['id'])
1731 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1733 rootpage = compat_urllib_request.urlopen(rootURL).read()
1734 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1735 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1737 info['title'] = info['id']
1739 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1742 'type': 'reference',
1743 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1748 for entry in info['list']:
1749 assert entry['type'] == 'reference'
1750 results += self.extract(entry['url'])
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for mtv.com video pages: meta tags supply song/artist/uri, then the
# mediaGen XML lists renditions; the last (highest quality) rendition is used.
1753 class MTVIE(InfoExtractor):
1754 """Information extractor for MTV.com"""
1756 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1759 def _real_extract(self, url):
1760 mobj = re.match(self._VALID_URL, url)
1762 raise ExtractorError(u'Invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; normalize to http:// for the download.
1763 if not mobj.group('proto'):
1764 url = 'http://' + url
1765 video_id = mobj.group('videoid')
1767 webpage = self._download_webpage(url, video_id)
1769 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1770 webpage, u'song name', fatal=False)
1772 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1775 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1776 webpage, u'mtvn_uri', fatal=False)
1778 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1779 webpage, u'content id', fatal=False)
1781 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1782 self.report_extraction(video_id)
1783 request = compat_urllib_request.Request(videogen_url)
1785 metadataXml = compat_urllib_request.urlopen(request).read()
1786 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1787 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
1789 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
1790 renditions = mdoc.findall('.//rendition')
1792 # For now, always pick the highest quality.
1793 rendition = renditions[-1]
# type attr is a MIME type like "video/mp4"; partition keeps the subtype.
1796 _,_,ext = rendition.attrib['type'].partition('/')
1797 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
1798 video_url = rendition.find('./src').text
1800 raise ExtractorError('Invalid rendition field.')
1805 'uploader': performer,
1806 'upload_date': None,
1807 'title': video_title,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for v.youku.com. Youku obfuscates segment file ids with a seeded
# pseudo-random shuffle of a fixed alphabet; each segment URL also needs a
# per-segment key ('k') and a client-generated sid.
1815 class YoukuIE(InfoExtractor):
1816 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp + two random numbers, as one digit string.
1819 nowTime = int(time.time() * 1000)
1820 random1 = random.randint(1000,1998)
1821 random2 = random.randint(1000,9999)
1823 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet driven by the server-supplied seed;
# mirrors the LCG used by Youku's player (seed*211+30031 mod 65536).
1825 def _get_file_ID_mix_string(self, seed):
1827 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
1829 for i in range(len(source)):
1830 seed = (seed * 211 + 30031 ) % 65536
1831 index = math.floor(seed / 65536 * len(source) )
1832 mixed.append(source[int(index)])
1833 source.remove(source[int(index)])
1834 #return ''.join(mixed)
# Translate the '*'-separated obfuscated id through the shuffled alphabet.
1837 def _get_file_id(self, fileId, seed):
1838 mixed = self._get_file_ID_mix_string(seed)
1839 ids = fileId.split('*')
1843 realId.append(mixed[int(ch)])
1844 return ''.join(realId)
1846 def _real_extract(self, url):
1847 mobj = re.match(self._VALID_URL, url)
1849 raise ExtractorError(u'Invalid URL: %s' % url)
1850 video_id = mobj.group('ID')
1852 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
1854 jsondata = self._download_webpage(info_url, video_id)
1856 self.report_extraction(video_id)
1858 config = json.loads(jsondata)
1860 video_title = config['data'][0]['title']
1861 seed = config['data'][0]['seed']
1863 format = self._downloader.params.get('format', None)
1864 supported_format = list(config['data'][0]['streamfileids'].keys())
1866 if format is None or format == 'best':
1867 if 'hd2' in supported_format:
1872 elif format == 'worst':
1880 fileid = config['data'][0]['streamfileids'][format]
1881 keys = [s['k'] for s in config['data'][0]['segs'][format]]
1882 except (UnicodeDecodeError, ValueError, KeyError):
1883 raise ExtractorError(u'Unable to extract info section')
1886 sid = self._gen_sid()
1887 fileid = self._get_file_id(fileid, seed)
1889 #column 8,9 of fileid represent the segment number
1890 #fileid[7:9] should be changed
# One download URL per segment; segment number is spliced in as hex at [8:10].
1891 for index, key in enumerate(keys):
1893 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
1894 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
1897 'id': '%s_part%02d' % (video_id, index),
1898 'url': download_url,
1900 'upload_date': None,
1901 'title': video_title,
1904 files_info.append(info)
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for video.xnxx.com: URL, title and thumbnail are scraped from the
# page with the class-level regex constants below.
1909 class XNXXIE(InfoExtractor):
1910 """Information extractor for xnxx.com"""
1912 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns (flashvars-style parameters and the <title> tag).
1914 VIDEO_URL_RE = r'flv_url=(.*?)&'
1915 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
1916 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
1918 def _real_extract(self, url):
1919 mobj = re.match(self._VALID_URL, url)
1921 raise ExtractorError(u'Invalid URL: %s' % url)
1922 video_id = mobj.group(1)
1924 # Get webpage content
1925 webpage = self._download_webpage(url, video_id)
1927 video_url = self._search_regex(self.VIDEO_URL_RE,
1928 webpage, u'video URL')
# flv_url is percent-encoded in the page.
1929 video_url = compat_urllib_parse.unquote(video_url)
1931 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
1934 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
1935 webpage, u'thumbnail', fatal=False)
1941 'upload_date': None,
1942 'title': video_title,
1944 'thumbnail': video_thumbnail,
1945 'description': None,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for Google+ video posts: scrapes the post page for date/uploader/
# title, follows the photo viewer page, then picks the highest-resolution
# redirector.googlevideo.com link.
1949 class GooglePlusIE(InfoExtractor):
1950 """Information extractor for plus.google.com."""
1952 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
1953 IE_NAME = u'plus.google'
1955 def _real_extract(self, url):
1956 # Extract id from URL
1957 mobj = re.match(self._VALID_URL, url)
1959 raise ExtractorError(u'Invalid URL: %s' % url)
1961 post_url = mobj.group(0)
1962 video_id = mobj.group(1)
1964 video_extension = 'flv'
1966 # Step 1, Retrieve post webpage to extract further information
1967 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
1969 self.report_extraction(video_id)
1971 # Extract update date
1972 upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
1973 webpage, u'upload date', fatal=False)
1975 # Convert timestring to a format suitable for filename
# Expects 'YYYY-MM-DD' in the page; reformatted to the YYYYMMDD convention.
1976 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
1977 upload_date = upload_date.strftime('%Y%m%d')
1980 uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
1981 webpage, u'uploader', fatal=False)
1984 # Get the first line for title
1985 video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
1986 webpage, 'title', default=u'NA')
1988 # Step 2, Stimulate clicking the image box to launch video
1989 video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
1990 webpage, u'video page URL')
1991 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
1993 # Extract video links on video page
1994 """Extract video links of all sizes"""
1995 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
1996 mobj = re.findall(pattern, webpage)
1998 raise ExtractorError(u'Unable to extract video links')
2000 # Sort in resolution
2001 links = sorted(mobj)
2003 # Choose the lowest of the sort, i.e. highest resolution
2004 video_url = links[-1]
2005 # Only get the url. The resolution part in the tuple has no use anymore
2006 video_url = video_url[-1]
2007 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError branch re-encodes.
2009 video_url = video_url.decode("unicode_escape")
2010 except AttributeError: # Python 3
2011 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
2017 'uploader': uploader,
2018 'upload_date': upload_date,
2019 'title': video_title,
2020 'ext': video_extension,
# NOTE(review): numbered listing with elided lines — code below is not contiguous.
# Extractor for nba.com: the MP4 URL is built directly from the path id
# (no API call); title and description are scraped from meta tags.
2023 class NBAIE(InfoExtractor):
2024 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
2027 def _real_extract(self, url):
2028 mobj = re.match(self._VALID_URL, url)
2030 raise ExtractorError(u'Invalid URL: %s' % url)
2032 video_id = mobj.group(1)
2034 webpage = self._download_webpage(url, video_id)
# video_id starts with '/', so this concatenates into a valid CDN path.
2036 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
2038 shortened_video_id = video_id.rpartition('/')[2]
2039 title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
2040 webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
2042 # It isn't there in the HTML it returns to us
2043 # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
2045 description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
2048 'id': shortened_video_id,
2052 # 'uploader_date': uploader_date,
2053 'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Matches a bare channel page, an archived broadcast (/b/<id>) or a chapter (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
    # The justin.tv API returns at most this many clips per request.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert each clip entry into an info dict."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # On failure the API returns a dict carrying an 'error' message instead of a list.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time begins with an ISO date; strip the dashes to get YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        # NOTE(review): several guard/else/return lines are not visible in this excerpt.
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            # Whole channel: page through its broadcast archives.
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element whose id matches the chapter's archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

                'id': u'c' + chapter_id,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we've reached the end of the archive list.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the player headline first; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

            'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages (one page may hold many trailers)."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose (whitespace/comment) syntax, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): 'gameID' is not in the visible pattern above — a pattern line
        # appears truncated in this excerpt; verify against the full source.
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age-gated page: re-request through the agecheck URL with a fixed birth date.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # Pair each movie URL with its title and thumbnail in page order.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Recorded videos are addressable on the CDN directly by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player config passes the media file to a Flash object via addVariable.
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # NOTE(review): the branch bodies after this test are not visible in this excerpt.
        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

            'title' : video_title,
            'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment: gon.show={...};
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        # NOTE(review): the `try:` header for this parse is not visible in this excerpt.
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps rendition from Akamai.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the formats entry whose 'format' equals req_format.
        # NOTE(review): the enclosing loop and return lines are not visible in this excerpt.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Bypass the age gate with a pre-set cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
            params = json.loads(json_params)
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component like '480p_370k_...' -> keep resolution and bitrate.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'thumbnail': thumbnail,
                'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are sorted best-first, so 'best' is the head and 'worst' the tail.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
                raise ExtractorError(u'Requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded in the player's JavaScript config.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The embed page's numeric id replaces the slug-style id from the watch URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (each mix is a playlist of tracks)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JS assignment: PAGE.mix = {...};
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # The play API requires an arbitrary per-session id.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        # NOTE(review): `mix_id` is used below but its assignment is not visible here —
        # presumably taken from `data`; verify against the full source.
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        # Tracks are fetched one at a time; the API tells us when the mix ends.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Both the video and its thumbnail are addressed on the CDN directly by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        (/lang/(.*?))? # The url may contain the language
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose syntax, so re.VERBOSE is required to match it.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: single talk vs. playlist of talks.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the assignment line for this verbose pattern (video_RE) is
        # not visible in this excerpt.
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        playlist_entries = []
        # Pair each talk id with its title/link match, in page order.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')
            # Last entry in htmlStreams carries the stream actually used.
            'url': info['htmlStreams'][-1]['file'],
            'thumbnail': thumbnail,
            'description': desc,
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the emptiness check preceding this fallback is not visible in this excerpt.
            _, video_id = os.path.split(url_parent_path)

        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = format_id_el.text
        # Description and preview image are optional in the metadata.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
        # A per-video XML manifest lists the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Take the last <type> entry of the manifest (presumably the best quality — verify).
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Strip the site prefix LiveLeak adds to every og:title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
            video_id = numid.group(1)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            # No streams and an "fsk" marker means age-restricted, evening-only content.
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        for s in streams: #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
        for s in streams: #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
            raise ExtractorError(u'No stream found.')

        # The selected URL points at a redirect page; fetch it to obtain the real link.
        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer an mms:// link; fall back to rtsp:// if none is present.
        mobj = re.search(self._MMS_STREAM, media_link)
            mobj = re.search(self._RTSP_STREAM, media_link)
            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{'id': video_id,
class TumblrIE(InfoExtractor):
    """Information extractor for videos hosted on tumblr.com blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL is embedded in inline JS with \x22-escaped quotes.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp track downloads."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # NOTE: this local shadows the `id` builtin (pre-existing style in this file).
        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                       webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascrip code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id':id,
                      'title' : info[u'title'],
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' : info[u'artist']
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard line is not visible in this excerpt.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The mp4 URL is exposed directly as an HTML5 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            'ext': video_extension,
            'title': video_title,
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # The per-video MRSS feed carries both the mp4 URL and the title.
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            'ext': video_extension,
            'title': video_title,
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Rebuild the canonical page URL from the numeric id.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # Howcast's meta tags may use either double or single quotes around content.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
3097 class VineIE(InfoExtractor):
3098 """Information Extractor for Vine.co"""
3099 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3104 video_id = mobj.group('id')
3105 webpage_url = 'https://vine.co/v/' + video_id
3106 webpage = self._download_webpage(webpage_url, video_id)
3108 self.report_extraction(video_id)
# The Twitter player-card meta tag carries the raw stream URL.
3110 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
3111 webpage, u'video URL')
3113 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional second group swallows any query string so only the clean image URL is captured.
3116 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
3117 webpage, u'thumbnail', fatal=False)
# Uploader name sits in an <h2> inside the "user" div; DOTALL lets .*? span newlines.
3119 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
3120 webpage, u'uploader', fatal=False, flags=re.DOTALL)
3126 'title': video_title,
3127 'thumbnail': thumbnail,
3128 'uploader': uploader,
3131 class FlickrIE(InfoExtractor):
3132 """Information Extractor for Flickr videos"""
3133 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
3135 def _real_extract(self, url):
3136 mobj = re.match(self._VALID_URL, url)
3138 video_id = mobj.group('id')
3139 video_uploader_id = mobj.group('uploader_id')
# Rebuild a canonical photo-page URL from the captured uploader/id parts.
3140 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
3141 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo "secret" token is required by both XML endpoints below.
3143 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# Two-step resolution: the first XML yields a node id, the second the stream info.
3145 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
3146 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
3148 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
3149 first_xml, u'node_id')
3151 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
3152 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
3154 self.report_extraction(video_id)
# Final URL = APP base + HTML-unescaped FULLPATH from the <STREAM> element.
3156 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
3158 raise ExtractorError(u'Unable to extract video url')
3159 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# og: meta tags; the (?:"..."|'...') alternation tolerates either quoting style.
3161 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
3162 webpage, u'video title')
3164 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
3165 webpage, u'description', fatal=False)
3167 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
3168 webpage, u'thumbnail', fatal=False)
3174 'title': video_title,
3175 'description': video_description,
3176 'thumbnail': thumbnail,
3177 'uploader_id': video_uploader_id,
3180 class TeamcocoIE(InfoExtractor):
# Information extractor for teamcoco.com. The URL only carries a slug, so the
# numeric video id is scraped from the page before hitting the data feed.
3181 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
3183 def _real_extract(self, url):
3184 mobj = re.match(self._VALID_URL, url)
3186 raise ExtractorError(u'Invalid URL: %s' % url)
3187 url_title = mobj.group('url_title')
3188 webpage = self._download_webpage(url, url_title)
# The slug page embeds the numeric id in the <article class="video"> tag.
3190 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
3191 webpage, u'video id')
3193 self.report_extraction(video_id)
3195 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
3198 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
3199 webpage, u'thumbnail', fatal=False)
3201 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
3202 webpage, u'description', fatal=False)
# The CVP XML feed keyed by the numeric id carries the actual media URLs.
3204 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
3205 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Take the "high"-quality <file> entry from the feed.
3207 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
3214 'title': video_title,
3215 'thumbnail': thumbnail,
3216 'description': video_description,
3219 class XHamsterIE(InfoExtractor):
3220 """Information Extractor for xHamster"""
3221 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
3223 def _real_extract(self,url):
3224 mobj = re.match(self._VALID_URL, url)
3226 video_id = mobj.group('id')
3227 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
3228 webpage = self._download_webpage(mrss_url, video_id)
# Player config: 'srv' is an optional server prefix, 'file' the media path.
3230 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
3232 raise ExtractorError(u'Unable to extract media URL')
3233 if len(mobj.group('server')) == 0:
# No server prefix: 'file' is already a (URL-encoded) full URL.
3234 video_url = compat_urllib_parse.unquote(mobj.group('file'))
3236 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension is taken from the final URL component after the last dot.
3237 video_extension = video_url.split('.')[-1]
3239 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
3242 # Can't see the description anywhere in the UI
3243 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
3244 # webpage, u'description', fatal=False)
3245 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is parsed out of a tooltip hint attribute and re-joined as YYYYMMDD.
3247 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
3249 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
3251 video_upload_date = None
3252 self._downloader.report_warning(u'Unable to extract upload date')
# Falls back to u'anonymous' when no uploader link is present.
3254 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
3255 webpage, u'uploader id', default=u'anonymous')
3257 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
3258 webpage, u'thumbnail', fatal=False)
3263 'ext': video_extension,
3264 'title': video_title,
3265 # 'description': video_description,
3266 'upload_date': video_upload_date,
3267 'uploader_id': video_uploader_id,
3268 'thumbnail': video_thumbnail
3271 class HypemIE(InfoExtractor):
3272 """Information Extractor for hypem"""
3273 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
3275 def _real_extract(self, url):
3276 mobj = re.match(self._VALID_URL, url)
3278 raise ExtractorError(u'Invalid URL: %s' % url)
3279 track_id = mobj.group(1)
# The 'ax'/'ts' query parameters mimic the site's own AJAX request.
3281 data = { 'ax': 1, 'ts': time.time() }
3282 data_encoded = compat_urllib_parse.urlencode(data)
3283 complete_url = url + "?" + data_encoded
3284 request = compat_urllib_request.Request(complete_url)
# Keep the response handle: its session cookie is required by the /serve call below.
3285 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
3286 cookie = urlh.headers.get('Set-Cookie', '')
3288 self.report_extraction(track_id)
# Track metadata is a JSON blob embedded in a <script id="displayList-data"> tag.
3290 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
3291 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
3293 track_list = json.loads(html_tracks)
3294 track = track_list[u'tracks'][0]
3296 raise ExtractorError(u'Hypemachine contained invalid JSON.')
3299 track_id = track[u"id"]
3300 artist = track[u"artist"]
3301 title = track[u"song"]
# NOTE(review): 'key' has no visible assignment in this excerpt — presumably it is
# read from the same track dict in an elided line; confirm against the full source.
3303 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
3304 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
3305 request.add_header('cookie', cookie)
3306 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
3308 song_data = json.loads(song_data_json)
3310 raise ExtractorError(u'Hypemachine contained invalid JSON.')
3311 final_url = song_data[u"url"]
3321 class Vbox7IE(InfoExtractor):
3322 """Information Extractor for Vbox7"""
3323 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
3325 def _real_extract(self,url):
3326 mobj = re.match(self._VALID_URL, url)
3328 raise ExtractorError(u'Invalid URL: %s' % url)
3329 video_id = mobj.group(1)
# The play page is a JavaScript redirect; follow it manually by scraping
# window.location and appending it to the URL actually reached.
3331 redirect_page, urlh = self._download_webpage_handle(url, video_id)
3332 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
3333 redirect_url = urlh.geturl() + new_location
3334 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# The page <title> is "<video title>/<suffix>" — keep only the part before the slash.
3336 title = self._html_search_regex(r'<title>(.*)</title>',
3337 webpage, u'title').split('/')[0].strip()
# POSTing as3=1&vid=<id> to magare.do returns '&'-separated key=value pairs.
3340 info_url = "http://vbox7.com/play/magare.do"
3341 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
3342 info_request = compat_urllib_request.Request(info_url, data)
3343 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
3344 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
3345 if info_response is None:
3346 raise ExtractorError(u'Unable to extract the media url')
# Split the key=value pairs and keep just the values (media URL, thumbnail URL).
3347 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
3354 'thumbnail': thumbnail_url,
3357 class GametrailersIE(InfoExtractor):
3358 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
3360 def _real_extract(self, url):
3361 mobj = re.match(self._VALID_URL, url)
3363 raise ExtractorError(u'Invalid URL: %s' % url)
3364 video_id = mobj.group('id')
3365 video_type = mobj.group('type')
3366 webpage = self._download_webpage(url, video_id)
3367 if video_type == 'full-episodes':
3368 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
3370 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
3371 mgid = self._search_regex(mgid_re, webpage, u'mgid')
3372 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
3374 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
3375 video_id, u'Downloading video info')
3376 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
3377 video_id, u'Downloading video urls info')
3379 self.report_extraction(video_id)
3380 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
3381 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
3383 <url>(?P<thumb>.*?)</url>.*
3386 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
3388 raise ExtractorError(u'Unable to extract video info')
3389 video_title = m_info.group('title')
3390 video_description = m_info.group('description')
3391 video_thumb = m_info.group('thumb')
3393 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
3394 if m_urls is None or len(m_urls) == 0:
3395 raise ExtractError(u'Unable to extrat video url')
3396 # They are sorted from worst to best quality
3397 video_url = m_urls[-1].group('url')
3399 return {'url': video_url,
3401 'title': video_title,
3402 # Videos are actually flv not mp4
3404 'thumbnail': video_thumb,
3405 'description': video_description,
3408 def gen_extractors():
3409 """ Return a list of an instance of every supported extractor.
3410 The order does matter; the first extractor matched is the one handling the URL.
# NOTE(review): the bulk of the extractor list (and the docstring/list
# scaffolding) is elided in this excerpt; only a few entries are visible.
3413 YoutubePlaylistIE(),
3438 StanfordOpenClassroomIE(),
3448 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Resolve *ie_name* to its extractor class.

    Looks up the class named ``<ie_name>IE`` in this module's namespace
    (e.g. ``'Youtube'`` -> ``YoutubeIE``). Raises KeyError when no such
    extractor class is defined.
    """
    return globals()['%sIE' % ie_name]