_ Git - youtube-dl/blob - youtube_dl/InfoExtractors.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import base64
   7 import datetime
   8 import itertools
   9 import netrc
  10 import os
  11 import re
  12 import socket
  13 import time
  14 import email.utils
  15 import xml.etree.ElementTree
  16 import random
  17 import math
  18 import operator
  19 import hashlib
  20 import binascii
  21 import urllib
  22
  23 from .utils import *
  24
  25
  26 from .extractor.common import InfoExtractor, SearchInfoExtractor
  27 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
  28
  29
  30
  31 class MetacafeIE(InfoExtractor):
  32     """Information Extractor for metacafe.com."""
  33
  34     _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
  35     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
  36     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
  37     IE_NAME = u'metacafe'
  38
  39     def report_disclaimer(self):
  40         """Report disclaimer retrieval."""
  41         self.to_screen(u'Retrieving disclaimer')
  42
  43     def _real_initialize(self):
  44         # Retrieve disclaimer
  45         request = compat_urllib_request.Request(self._DISCLAIMER)
  46         try:
  47             self.report_disclaimer()
  48             disclaimer = compat_urllib_request.urlopen(request).read()
  49         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  50             raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))
  51
  52         # Confirm age
  53         disclaimer_form = {
  54             'filters': '0',
  55             'submit': "Continue - I'm over 18",
  56             }
  57         request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
  58         try:
  59             self.report_age_confirmation()
  60             disclaimer = compat_urllib_request.urlopen(request).read()
  61         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  62             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
  63
  64     def _real_extract(self, url):
  65         # Extract id and simplified title from URL
  66         mobj = re.match(self._VALID_URL, url)
  67         if mobj is None:
  68             raise ExtractorError(u'Invalid URL: %s' % url)
  69
  70         video_id = mobj.group(1)
  71
  72         # Check if video comes from YouTube
  73         mobj2 = re.match(r'^yt-(.*)$', video_id)
  74         if mobj2 is not None:
  75             return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
  76
  77         # Retrieve video webpage to extract further information
  78         webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
  79
  80         # Extract URL, uploader and title from webpage
  81         self.report_extraction(video_id)
  82         mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
  83         if mobj is not None:
  84             mediaURL = compat_urllib_parse.unquote(mobj.group(1))
  85             video_extension = mediaURL[-3:]
  86
  87             # Extract gdaKey if available
  88             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
  89             if mobj is None:
  90                 video_url = mediaURL
  91             else:
  92                 gdaKey = mobj.group(1)
  93                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
  94         else:
  95             mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
  96             if mobj is None:
  97                 raise ExtractorError(u'Unable to extract media URL')
  98             vardict = compat_parse_qs(mobj.group(1))
  99             if 'mediaData' not in vardict:
 100                 raise ExtractorError(u'Unable to extract media URL')
 101             mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 102             if mobj is None:
 103                 raise ExtractorError(u'Unable to extract media URL')
 104             mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 105             video_extension = mediaURL[-3:]
 106             video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 107
 108         mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
 109         if mobj is None:
 110             raise ExtractorError(u'Unable to extract title')
 111         video_title = mobj.group(1).decode('utf-8')
 112
 113         mobj = re.search(r'submitter=(.*?);', webpage)
 114         if mobj is None:
 115             raise ExtractorError(u'Unable to extract uploader nickname')
 116         video_uploader = mobj.group(1)
 117
 118         return [{
 119             'id':       video_id.decode('utf-8'),
 120             'url':      video_url.decode('utf-8'),
 121             'uploader': video_uploader.decode('utf-8'),
 122             'upload_date':  None,
 123             'title':    video_title,
 124             'ext':      video_extension.decode('utf-8'),
 125         }]
 126
 127 class DailymotionIE(InfoExtractor):
 128     """Information Extractor for Dailymotion"""
 129
 130     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
 131     IE_NAME = u'dailymotion'
 132
 133     def _real_extract(self, url):
 134         # Extract id and simplified title from URL
 135         mobj = re.match(self._VALID_URL, url)
 136         if mobj is None:
 137             raise ExtractorError(u'Invalid URL: %s' % url)
 138
 139         video_id = mobj.group(1).split('_')[0].split('?')[0]
 140
 141         video_extension = 'mp4'
 142
 143         # Retrieve video webpage to extract further information
 144         request = compat_urllib_request.Request(url)
 145         request.add_header('Cookie', 'family_filter=off')
 146         webpage = self._download_webpage(request, video_id)
 147
 148         # Extract URL, uploader and title from webpage
 149         self.report_extraction(video_id)
 150         mobj = re.search(r'\s*var flashvars = (.*)', webpage)
 151         if mobj is None:
 152             raise ExtractorError(u'Unable to extract media URL')
 153         flashvars = compat_urllib_parse.unquote(mobj.group(1))
 154
 155         for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
 156             if key in flashvars:
 157                 max_quality = key
 158                 self.to_screen(u'Using %s' % key)
 159                 break
 160         else:
 161             raise ExtractorError(u'Unable to extract video URL')
 162
 163         mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
 164         if mobj is None:
 165             raise ExtractorError(u'Unable to extract video URL')
 166
 167         video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
 168
 169         # TODO: support choosing qualities
 170
 171         mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
 172         if mobj is None:
 173             raise ExtractorError(u'Unable to extract title')
 174         video_title = unescapeHTML(mobj.group('title'))
 175
 176         video_uploader = None
 177         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
 178                                              # Looking for official user
 179                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
 180                                             webpage, 'video uploader')
 181
 182         video_upload_date = None
 183         mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
 184         if mobj is not None:
 185             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 186
 187         return [{
 188             'id':       video_id,
 189             'url':      video_url,
 190             'uploader': video_uploader,
 191             'upload_date':  video_upload_date,
 192             'title':    video_title,
 193             'ext':      video_extension,
 194         }]
 195
 196
 197 class PhotobucketIE(InfoExtractor):
 198     """Information extractor for photobucket.com."""
 199
 200     # TODO: the original _VALID_URL was:
 201     # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
 202     # Check if it's necessary to keep the old extracion process
 203     _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
 204     IE_NAME = u'photobucket'
 205
 206     def _real_extract(self, url):
 207         # Extract id from URL
 208         mobj = re.match(self._VALID_URL, url)
 209         if mobj is None:
 210             raise ExtractorError(u'Invalid URL: %s' % url)
 211
 212         video_id = mobj.group('id')
 213
 214         video_extension = mobj.group('ext')
 215
 216         # Retrieve video webpage to extract further information
 217         webpage = self._download_webpage(url, video_id)
 218
 219         # Extract URL, uploader, and title from webpage
 220         self.report_extraction(video_id)
 221         # We try first by looking the javascript code:
 222         mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
 223         if mobj is not None:
 224             info = json.loads(mobj.group('json'))
 225             return [{
 226                 'id':       video_id,
 227                 'url':      info[u'downloadUrl'],
 228                 'uploader': info[u'username'],
 229                 'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
 230                 'title':    info[u'title'],
 231                 'ext':      video_extension,
 232                 'thumbnail': info[u'thumbUrl'],
 233             }]
 234
 235         # We try looking in other parts of the webpage
 236         video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
 237             webpage, u'video URL')
 238
 239         mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
 240         if mobj is None:
 241             raise ExtractorError(u'Unable to extract title')
 242         video_title = mobj.group(1).decode('utf-8')
 243         video_uploader = mobj.group(2).decode('utf-8')
 244
 245         return [{
 246             'id':       video_id.decode('utf-8'),
 247             'url':      video_url.decode('utf-8'),
 248             'uploader': video_uploader,
 249             'upload_date':  None,
 250             'title':    video_title,
 251             'ext':      video_extension.decode('utf-8'),
 252         }]
 253
 254
 255 class YahooIE(InfoExtractor):
 256     """Information extractor for screen.yahoo.com."""
 257     _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
 258
 259     def _real_extract(self, url):
 260         mobj = re.match(self._VALID_URL, url)
 261         if mobj is None:
 262             raise ExtractorError(u'Invalid URL: %s' % url)
 263         video_id = mobj.group('id')
 264         webpage = self._download_webpage(url, video_id)
 265         m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
 266
 267         if m_id is None:
 268             # TODO: Check which url parameters are required
 269             info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
 270             webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
 271             info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
 272                         <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
 273                         <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
 274                         <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
 275                         '''
 276             self.report_extraction(video_id)
 277             m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
 278             if m_info is None:
 279                 raise ExtractorError(u'Unable to extract video info')
 280             video_title = m_info.group('title')
 281             video_description = m_info.group('description')
 282             video_thumb = m_info.group('thumb')
 283             video_date = m_info.group('date')
 284             video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
 285
 286             # TODO: Find a way to get mp4 videos
 287             rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
 288             webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
 289             m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
 290             video_url = m_rest.group('url')
 291             video_path = m_rest.group('path')
 292             if m_rest is None:
 293                 raise ExtractorError(u'Unable to extract video url')
 294
 295         else: # We have to use a different method if another id is defined
 296             long_id = m_id.group('new_id')
 297             info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
 298             webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
 299             json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
 300             info = json.loads(json_str)
 301             res = info[u'query'][u'results'][u'mediaObj'][0]
 302             stream = res[u'streams'][0]
 303             video_path = stream[u'path']
 304             video_url = stream[u'host']
 305             meta = res[u'meta']
 306             video_title = meta[u'title']
 307             video_description = meta[u'description']
 308             video_thumb = meta[u'thumbnail']
 309             video_date = None # I can't find it
 310
 311         info_dict = {
 312                      'id': video_id,
 313                      'url': video_url,
 314                      'play_path': video_path,
 315                      'title':video_title,
 316                      'description': video_description,
 317                      'thumbnail': video_thumb,
 318                      'upload_date': video_date,
 319                      'ext': 'flv',
 320                      }
 321         return info_dict
 322
 323 class VimeoIE(InfoExtractor):
 324     """Information extractor for vimeo.com."""
 325
 326     # _VALID_URL matches Vimeo URLs
 327     _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
 328     IE_NAME = u'vimeo'
 329
 330     def _verify_video_password(self, url, video_id, webpage):
 331         password = self._downloader.params.get('password', None)
 332         if password is None:
 333             raise ExtractorError(u'This video is protected by a password, use the --password option')
 334         token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
 335         data = compat_urllib_parse.urlencode({'password': password,
 336                                               'token': token})
 337         # I didn't manage to use the password with https
 338         if url.startswith('https'):
 339             pass_url = url.replace('https','http')
 340         else:
 341             pass_url = url
 342         password_request = compat_urllib_request.Request(pass_url+'/password', data)
 343         password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
 344         password_request.add_header('Cookie', 'xsrft=%s' % token)
 345         pass_web = self._download_webpage(password_request, video_id,
 346                                           u'Verifying the password',
 347                                           u'Wrong password')
 348
 349     def _real_extract(self, url, new_video=True):
 350         # Extract ID from URL
 351         mobj = re.match(self._VALID_URL, url)
 352         if mobj is None:
 353             raise ExtractorError(u'Invalid URL: %s' % url)
 354
 355         video_id = mobj.group('id')
 356         if not mobj.group('proto'):
 357             url = 'https://' + url
 358         if mobj.group('direct_link') or mobj.group('pro'):
 359             url = 'https://vimeo.com/' + video_id
 360
 361         # Retrieve video webpage to extract further information
 362         request = compat_urllib_request.Request(url, None, std_headers)
 363         webpage = self._download_webpage(request, video_id)
 364
 365         # Now we begin extracting as much information as we can from what we
 366         # retrieved. First we extract the information common to all extractors,
 367         # and latter we extract those that are Vimeo specific.
 368         self.report_extraction(video_id)
 369
 370         # Extract the config JSON
 371         try:
 372             config = webpage.split(' = {config:')[1].split(',assets:')[0]
 373             config = json.loads(config)
 374         except:
 375             if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
 376                 raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')
 377
 378             if re.search('If so please provide the correct password.', webpage):
 379                 self._verify_video_password(url, video_id, webpage)
 380                 return self._real_extract(url)
 381             else:
 382                 raise ExtractorError(u'Unable to extract info section')
 383
 384         # Extract title
 385         video_title = config["video"]["title"]
 386
 387         # Extract uploader and uploader_id
 388         video_uploader = config["video"]["owner"]["name"]
 389         video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
 390
 391         # Extract video thumbnail
 392         video_thumbnail = config["video"]["thumbnail"]
 393
 394         # Extract video description
 395         video_description = get_element_by_attribute("itemprop", "description", webpage)
 396         if video_description: video_description = clean_html(video_description)
 397         else: video_description = u''
 398
 399         # Extract upload date
 400         video_upload_date = None
 401         mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
 402         if mobj is not None:
 403             video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
 404
 405         # Vimeo specific: extract request signature and timestamp
 406         sig = config['request']['signature']
 407         timestamp = config['request']['timestamp']
 408
 409         # Vimeo specific: extract video codec and quality information
 410         # First consider quality, then codecs, then take everything
 411         # TODO bind to format param
 412         codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
 413         files = { 'hd': [], 'sd': [], 'other': []}
 414         for codec_name, codec_extension in codecs:
 415             if codec_name in config["video"]["files"]:
 416                 if 'hd' in config["video"]["files"][codec_name]:
 417                     files['hd'].append((codec_name, codec_extension, 'hd'))
 418                 elif 'sd' in config["video"]["files"][codec_name]:
 419                     files['sd'].append((codec_name, codec_extension, 'sd'))
 420                 else:
 421                     files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
 422
 423         for quality in ('hd', 'sd', 'other'):
 424             if len(files[quality]) > 0:
 425                 video_quality = files[quality][0][2]
 426                 video_codec = files[quality][0][0]
 427                 video_extension = files[quality][0][1]
 428                 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
 429                 break
 430         else:
 431             raise ExtractorError(u'No known codec found')
 432
 433         video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
 434                     %(video_id, sig, timestamp, video_quality, video_codec.upper())
 435
 436         return [{
 437             'id':       video_id,
 438             'url':      video_url,
 439             'uploader': video_uploader,
 440             'uploader_id': video_uploader_id,
 441             'upload_date':  video_upload_date,
 442             'title':    video_title,
 443             'ext':      video_extension,
 444             'thumbnail':    video_thumbnail,
 445             'description':  video_description,
 446         }]
 447
 448
 449 class ArteTvIE(InfoExtractor):
 450     """arte.tv information extractor."""
 451
 452     _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
 453     _LIVE_URL = r'index-[0-9]+\.html$'
 454
 455     IE_NAME = u'arte.tv'
 456
 457     def fetch_webpage(self, url):
 458         request = compat_urllib_request.Request(url)
 459         try:
 460             self.report_download_webpage(url)
 461             webpage = compat_urllib_request.urlopen(request).read()
 462         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 463             raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
 464         except ValueError as err:
 465             raise ExtractorError(u'Invalid URL: %s' % url)
 466         return webpage
 467
 468     def grep_webpage(self, url, regex, regexFlags, matchTuples):
 469         page = self.fetch_webpage(url)
 470         mobj = re.search(regex, page, regexFlags)
 471         info = {}
 472
 473         if mobj is None:
 474             raise ExtractorError(u'Invalid URL: %s' % url)
 475
 476         for (i, key, err) in matchTuples:
 477             if mobj.group(i) is None:
 478                 raise ExtractorError(err)
 479             else:
 480                 info[key] = mobj.group(i)
 481
 482         return info
 483
 484     def extractLiveStream(self, url):
 485         video_lang = url.split('/')[-4]
 486         info = self.grep_webpage(
 487             url,
 488             r'src="(.*?/videothek_js.*?\.js)',
 489             0,
 490             [
 491                 (1, 'url', u'Invalid URL: %s' % url)
 492             ]
 493         )
 494         http_host = url.split('/')[2]
 495         next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
 496         info = self.grep_webpage(
 497             next_url,
 498             r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
 499                 '(http://.*?\.swf).*?' +
 500                 '(rtmp://.*?)\'',
 501             re.DOTALL,
 502             [
 503                 (1, 'path',   u'could not extract video path: %s' % url),
 504                 (2, 'player', u'could not extract video player: %s' % url),
 505                 (3, 'url',    u'could not extract video url: %s' % url)
 506             ]
 507         )
 508         video_url = u'%s/%s' % (info.get('url'), info.get('path'))
 509
 510     def extractPlus7Stream(self, url):
 511         video_lang = url.split('/')[-3]
 512         info = self.grep_webpage(
 513             url,
 514             r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
 515             0,
 516             [
 517                 (1, 'url', u'Invalid URL: %s' % url)
 518             ]
 519         )
 520         next_url = compat_urllib_parse.unquote(info.get('url'))
 521         info = self.grep_webpage(
 522             next_url,
 523             r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
 524             0,
 525             [
 526                 (1, 'url', u'Could not find <video> tag: %s' % url)
 527             ]
 528         )
 529         next_url = compat_urllib_parse.unquote(info.get('url'))
 530
 531         info = self.grep_webpage(
 532             next_url,
 533             r'<video id="(.*?)".*?>.*?' +
 534                 '<name>(.*?)</name>.*?' +
 535                 '<dateVideo>(.*?)</dateVideo>.*?' +
 536                 '<url quality="hd">(.*?)</url>',
 537             re.DOTALL,
 538             [
 539                 (1, 'id',    u'could not extract video id: %s' % url),
 540                 (2, 'title', u'could not extract video title: %s' % url),
 541                 (3, 'date',  u'could not extract video date: %s' % url),
 542                 (4, 'url',   u'could not extract video url: %s' % url)
 543             ]
 544         )
 545
 546         return {
 547             'id':           info.get('id'),
 548             'url':          compat_urllib_parse.unquote(info.get('url')),
 549             'uploader':     u'arte.tv',
 550             'upload_date':  unified_strdate(info.get('date')),
 551             'title':        info.get('title').decode('utf-8'),
 552             'ext':          u'mp4',
 553             'format':       u'NA',
 554             'player_url':   None,
 555         }
 556
 557     def _real_extract(self, url):
 558         video_id = url.split('/')[-1]
 559         self.report_extraction(video_id)
 560
 561         if re.search(self._LIVE_URL, video_id) is not None:
 562             self.extractLiveStream(url)
 563             return
 564         else:
 565             info = self.extractPlus7Stream(url)
 566
 567         return [info]
 568
 569
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Its _VALID_URL matches any URL. It first follows HTTP redirects, then
    searches the downloaded page for a direct media URL using a series of
    increasingly broad patterns.
    """

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that the generic extractor is in use.

        The warning is suppressed when the downloader's 'test' parameter is set.
        """
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check whether url redirects elsewhere (e.g. a URL shortener).

        Issues a HEAD request through a purpose-built opener. Returns the
        final URL if it differs from url, otherwise False. Raises
        ExtractorError if the opener returns no response.
        """
        class HeadRequest(compat_urllib_request.Request):
            """Request subclass whose HTTP method is HEAD."""
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Escape spaces so the redirect target is a usable URL
                    newurl = newurl.replace(' ', '%20')
                    # Drop the body-related headers from the original request
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Consume and close the 405 response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # A plain Request is used here, so the retry is not a HEAD
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        # Same URL after the request: nothing to follow
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Try to find a direct media URL in an arbitrary web page.

        If url redirects, a one-element list with a url_result for the new
        URL is returned. Otherwise the page is downloaded and searched, and
        a one-element list holding the info dict is returned.

        Raises ExtractorError when the URL is invalid or no media URL is found.
        """
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        # Provisional id, used for progress messages until the media URL is known
        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        # From here on the id is derived from the media URL's file name
        video_id = os.path.basename(video_url)

        # Split the file name into extension (without the dot) and base name
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
 709
 710
 711 class YoutubeSearchIE(SearchInfoExtractor):
 712     """Information Extractor for YouTube search queries."""
 713     _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
 714     _MAX_RESULTS = 1000
 715     IE_NAME = u'youtube:search'
 716     _SEARCH_KEY = 'ytsearch'
 717
 718     def report_download_page(self, query, pagenum):
 719         """Report attempt to download search page with given number."""
 720         self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
 721
 722     def _get_n_results(self, query, n):
 723         """Get a specified number of results for a query"""
 724
 725         video_ids = []
 726         pagenum = 0
 727         limit = n
 728
 729         while (50 * pagenum) < limit:
 730             self.report_download_page(query, pagenum+1)
 731             result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
 732             request = compat_urllib_request.Request(result_url)
 733             try:
 734                 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
 735             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 736                 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
 737             api_response = json.loads(data)['data']
 738
 739             if not 'items' in api_response:
 740                 raise ExtractorError(u'[youtube] No video results')
 741
 742             new_ids = list(video['id'] for video in api_response['items'])
 743             video_ids += new_ids
 744
 745             limit = min(n, api_response['totalItems'])
 746             pagenum += 1
 747
 748         if len(video_ids) > n:
 749             video_ids = video_ids[:n]
 750         videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
 751         return self.playlist_result(videos, query)
 752
 753
 754 class GoogleSearchIE(SearchInfoExtractor):
 755     """Information Extractor for Google Video search queries."""
 756     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
 757     _MAX_RESULTS = 1000
 758     IE_NAME = u'video.google:search'
 759     _SEARCH_KEY = 'gvsearch'
 760
 761     def _get_n_results(self, query, n):
 762         """Get a specified number of results for a query"""
 763
 764         res = {
 765             '_type': 'playlist',
 766             'id': query,
 767             'entries': []
 768         }
 769
 770         for pagenum in itertools.count(1):
 771             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
 772             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
 773                                              note='Downloading result page ' + str(pagenum))
 774
 775             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
 776                 e = {
 777                     '_type': 'url',
 778                     'url': mobj.group(1)
 779                 }
 780                 res['entries'].append(e)
 781
 782             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
 783                 return res
 784
 785 class YahooSearchIE(SearchInfoExtractor):
 786     """Information Extractor for Yahoo! Video search queries."""
 787
 788     _MAX_RESULTS = 1000
 789     IE_NAME = u'screen.yahoo:search'
 790     _SEARCH_KEY = 'yvsearch'
 791
 792     def _get_n_results(self, query, n):
 793         """Get a specified number of results for a query"""
 794
 795         res = {
 796             '_type': 'playlist',
 797             'id': query,
 798             'entries': []
 799         }
 800         for pagenum in itertools.count(0):
 801             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
 802             webpage = self._download_webpage(result_url, query,
 803                                              note='Downloading results page '+str(pagenum+1))
 804             info = json.loads(webpage)
 805             m = info[u'm']
 806             results = info[u'results']
 807
 808             for (i, r) in enumerate(results):
 809                 if (pagenum * 30) +i >= n:
 810                     break
 811                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
 812                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
 813                 res['entries'].append(e)
 814             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
 815                 break
 816
 817         return res
 818
 819
 820 class BlipTVUserIE(InfoExtractor):
 821     """Information Extractor for blip.tv users."""
 822
 823     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
 824     _PAGE_SIZE = 12
 825     IE_NAME = u'blip.tv:user'
 826
 827     def _real_extract(self, url):
 828         # Extract username
 829         mobj = re.match(self._VALID_URL, url)
 830         if mobj is None:
 831             raise ExtractorError(u'Invalid URL: %s' % url)
 832
 833         username = mobj.group(1)
 834
 835         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
 836
 837         page = self._download_webpage(url, username, u'Downloading user page')
 838         mobj = re.search(r'data-users-id="([^"]+)"', page)
 839         page_base = page_base % mobj.group(1)
 840
 841
 842         # Download video ids using BlipTV Ajax calls. Result size per
 843         # query is limited (currently to 12 videos) so we need to query
 844         # page by page until there are no video ids - it means we got
 845         # all of them.
 846
 847         video_ids = []
 848         pagenum = 1
 849
 850         while True:
 851             url = page_base + "&page=" + str(pagenum)
 852             page = self._download_webpage(url, username,
 853                                           u'Downloading video ids from page %d' % pagenum)
 854
 855             # Extract video identifiers
 856             ids_in_page = []
 857
 858             for mobj in re.finditer(r'href="/([^"]+)"', page):
 859                 if mobj.group(1) not in ids_in_page:
 860                     ids_in_page.append(unescapeHTML(mobj.group(1)))
 861
 862             video_ids.extend(ids_in_page)
 863
 864             # A little optimization - if current page is not
 865             # "full", ie. does not contain PAGE_SIZE video ids then
 866             # we can assume that this page is the last one - there
 867             # are no more ids on further pages - no need to query
 868             # again.
 869
 870             if len(ids_in_page) < self._PAGE_SIZE:
 871                 break
 872
 873             pagenum += 1
 874
 875         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
 876         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
 877         return [self.playlist_result(url_entries, playlist_title = username)]
 878
 879
 880 class DepositFilesIE(InfoExtractor):
 881     """Information extractor for depositfiles.com"""
 882
 883     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
 884
 885     def _real_extract(self, url):
 886         file_id = url.split('/')[-1]
 887         # Rebuild url in english locale
 888         url = 'http://depositfiles.com/en/files/' + file_id
 889
 890         # Retrieve file webpage with 'Free download' button pressed
 891         free_download_indication = { 'gateway_result' : '1' }
 892         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
 893         try:
 894             self.report_download_webpage(file_id)
 895             webpage = compat_urllib_request.urlopen(request).read()
 896         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 897             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
 898
 899         # Search for the real file URL
 900         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
 901         if (mobj is None) or (mobj.group(1) is None):
 902             # Try to figure out reason of the error.
 903             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
 904             if (mobj is not None) and (mobj.group(1) is not None):
 905                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
 906                 raise ExtractorError(u'%s' % restriction_message)
 907             else:
 908                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
 909
 910         file_url = mobj.group(1)
 911         file_extension = os.path.splitext(file_url)[1][1:]
 912
 913         # Search for file title
 914         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
 915
 916         return [{
 917             'id':       file_id.decode('utf-8'),
 918             'url':      file_url.decode('utf-8'),
 919             'uploader': None,
 920             'upload_date':  None,
 921             'title':    file_title,
 922             'ext':      file_extension.decode('utf-8'),
 923         }]
 924
 925
 926 class FacebookIE(InfoExtractor):
 927     """Information Extractor for Facebook"""
 928
 929     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
 930     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
 931     _NETRC_MACHINE = 'facebook'
 932     IE_NAME = u'facebook'
 933
 934     def report_login(self):
 935         """Report attempt to log in."""
 936         self.to_screen(u'Logging in')
 937
 938     def _real_initialize(self):
 939         if self._downloader is None:
 940             return
 941
 942         useremail = None
 943         password = None
 944         downloader_params = self._downloader.params
 945
 946         # Attempt to use provided username and password or .netrc data
 947         if downloader_params.get('username', None) is not None:
 948             useremail = downloader_params['username']
 949             password = downloader_params['password']
 950         elif downloader_params.get('usenetrc', False):
 951             try:
 952                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
 953                 if info is not None:
 954                     useremail = info[0]
 955                     password = info[2]
 956                 else:
 957                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
 958             except (IOError, netrc.NetrcParseError) as err:
 959                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
 960                 return
 961
 962         if useremail is None:
 963             return
 964
 965         # Log in
 966         login_form = {
 967             'email': useremail,
 968             'pass': password,
 969             'login': 'Log+In'
 970             }
 971         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
 972         try:
 973             self.report_login()
 974             login_results = compat_urllib_request.urlopen(request).read()
 975             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
 976                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
 977                 return
 978         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 979             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
 980             return
 981
 982     def _real_extract(self, url):
 983         mobj = re.match(self._VALID_URL, url)
 984         if mobj is None:
 985             raise ExtractorError(u'Invalid URL: %s' % url)
 986         video_id = mobj.group('ID')
 987
 988         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
 989         webpage = self._download_webpage(url, video_id)
 990
 991         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
 992         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
 993         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
 994         if not m:
 995             raise ExtractorError(u'Cannot parse data')
 996         data = dict(json.loads(m.group(1)))
 997         params_raw = compat_urllib_parse.unquote(data['params'])
 998         params = json.loads(params_raw)
 999         video_data = params['video_data'][0]
1000         video_url = video_data.get('hd_src')
1001         if not video_url:
1002             video_url = video_data['sd_src']
1003         if not video_url:
1004             raise ExtractorError(u'Cannot find video URL')
1005         video_duration = int(video_data['video_duration'])
1006         thumbnail = video_data['thumbnail_src']
1007
1008         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
1009             webpage, u'title')
1010
1011         info = {
1012             'id': video_id,
1013             'title': video_title,
1014             'url': video_url,
1015             'ext': 'mp4',
1016             'duration': video_duration,
1017             'thumbnail': thumbnail,
1018         }
1019         return [info]
1020
1021
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        ``api.swf#<id>`` and ``/play/`` URLs are first resolved to the
        canonical ``http://blip.tv/a/a-<file id>`` form (the method then
        calls itself with that URL).  The JSON description of the post is
        requested next; when the server answers with a ``video/*`` content
        type instead, the URL is treated as a direct download.

        Raises ExtractorError for a URL not matching ``_VALID_URL``, when the
        info page cannot be downloaded or read, or when it cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # The player URL redirects; the file id is the last path segment
            # of the ``file`` value in the redirect target's fragment.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Only byte strings need decoding; calling .decode() on text
                # fails on Python 3, where str has no such method.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The description is either wrapped in a 'Post' key or is
                # the top-level object itself.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Only the date part survives the reformatting below.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1119
1120
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """XOR *data* with the keystream derived from *key*; return a str.

        Both arguments are read element by element through ``compat_ord``.
        """
        # Key scheduling: permute the 256-entry box according to the key.
        x = 0
        box = list(range(256))
        for i in range(256):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        # Keystream generation, XORed onto the data one element at a time.
        x = 0
        y = 0
        out = []
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out.append(chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]))
        # Joining once avoids rebuilding the string for every character.
        return ''.join(out)

    def __md5(self, s):
        """Return the hexadecimal MD5 digest of *s*, encoded to bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary of the video.

        A plain ``<source src=...>`` on the watch page is used when present.
        Otherwise the encrypted player XML referenced by the page's
        ``flashvars`` is downloaded and decrypted, and the stream URL, play
        path and HLS playlist are read from it.

        Raises ExtractorError for a URL not matching ``_VALID_URL`` and when
        the flashvars or the stream URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        mobj = re.search(r"source src='(.+?)[.]([^.]+)'", webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            # Whatever extension the page advertises, the .flv is fetched.
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # try encxml
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        # Every flashvar becomes a query parameter of the XML request,
        # except _encxml, which is the (quoted) base URL of that request.
        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall(r"(.+?):'(.+?)',?", sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data: the hex payload follows the first '=' of the reply
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        # Key = md5(doubly base64-decoded GK + md5(video id))
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search(r"connectionurl='(.*?)'", dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search(r"path='(http.*?)' source='(.*?)'", dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex(r"source='(.*?)'", dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # ``video_filepath`` was never assigned, so this branch always
            # raised NameError.  It is taken from the ``path='...'``
            # attribute, which the non-rtmp branch above also prepends to
            # ``source``.
            # NOTE(review): confirm against a live f4m response.
            video_filepath = compat_urllib_parse.unquote(self._search_regex(
                r"path='(.*?)'", dec_data, u'video file path'))
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex(r"swfobject.embedSWF\('(.+?)'", webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
1269
1270
1271 class ComedyCentralIE(InfoExtractor):
1272     """Information extractor for The Daily Show and Colbert Report """
1273
1274     # urls can be abbreviations like :thedailyshow or :colbert
1275     # urls for episodes like:
1276     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
1277     #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
1278     #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
1279     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
1280                       |(https?://)?(www\.)?
1281                           (?P<showname>thedailyshow|colbertnation)\.com/
1282                          (full-episodes/(?P<episode>.*)|
1283                           (?P<clip>
1284                               (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
1285                               |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
1286                      $"""
1287
1288     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
1289
1290     _video_extensions = {
1291         '3500': 'mp4',
1292         '2200': 'mp4',
1293         '1700': 'mp4',
1294         '1200': 'mp4',
1295         '750': 'mp4',
1296         '400': 'mp4',
1297     }
1298     _video_dimensions = {
1299         '3500': '1280x720',
1300         '2200': '960x540',
1301         '1700': '768x432',
1302         '1200': '640x360',
1303         '750': '512x288',
1304         '400': '384x216',
1305     }
1306
1307     @classmethod
1308     def suitable(cls, url):
1309         """Receives a URL and returns True if suitable for this IE."""
1310         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1311
1312     def _print_formats(self, formats):
1313         print('Available formats:')
1314         for x in formats:
1315             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
1316
1317
1318     def _real_extract(self, url):
1319         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1320         if mobj is None:
1321             raise ExtractorError(u'Invalid URL: %s' % url)
1322
1323         if mobj.group('shortname'):
1324             if mobj.group('shortname') in ('tds', 'thedailyshow'):
1325                 url = u'http://www.thedailyshow.com/full-episodes/'
1326             else:
1327                 url = u'http://www.colbertnation.com/full-episodes/'
1328             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1329             assert mobj is not None
1330
1331         if mobj.group('clip'):
1332             if mobj.group('showname') == 'thedailyshow':
1333                 epTitle = mobj.group('tdstitle')
1334             else:
1335                 epTitle = mobj.group('cntitle')
1336             dlNewest = False
1337         else:
1338             dlNewest = not mobj.group('episode')
1339             if dlNewest:
1340                 epTitle = mobj.group('showname')
1341             else:
1342                 epTitle = mobj.group('episode')
1343
1344         self.report_extraction(epTitle)
1345         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
1346         if dlNewest:
1347             url = htmlHandle.geturl()
1348             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1349             if mobj is None:
1350                 raise ExtractorError(u'Invalid redirected URL: ' + url)
1351             if mobj.group('episode') == '':
1352                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
1353             epTitle = mobj.group('episode')
1354
1355         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
1356
1357         if len(mMovieParams) == 0:
1358             # The Colbert Report embeds the information in a without
1359             # a URL prefix; so extract the alternate reference
1360             # and then add the URL prefix manually.
1361
1362             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
1363             if len(altMovieParams) == 0:
1364                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
1365             else:
1366                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
1367
1368         uri = mMovieParams[0][1]
1369         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
1370         indexXml = self._download_webpage(indexUrl, epTitle,
1371                                           u'Downloading show index',
1372                                           u'unable to download episode index')
1373
1374         results = []
1375
1376         idoc = xml.etree.ElementTree.fromstring(indexXml)
1377         itemEls = idoc.findall('.//item')
1378         for partNum,itemEl in enumerate(itemEls):
1379             mediaId = itemEl.findall('./guid')[0].text
1380             shortMediaId = mediaId.split(':')[-1]
1381             showId = mediaId.split(':')[-2].replace('.com', '')
1382             officialTitle = itemEl.findall('./title')[0].text
1383             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
1384
1385             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
1386                         compat_urllib_parse.urlencode({'uri': mediaId}))
1387             configXml = self._download_webpage(configUrl, epTitle,
1388                                                u'Downloading configuration for %s' % shortMediaId)
1389
1390             cdoc = xml.etree.ElementTree.fromstring(configXml)
1391             turls = []
1392             for rendition in cdoc.findall('.//rendition'):
1393                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
1394                 turls.append(finfo)
1395
1396             if len(turls) == 0:
1397                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
1398                 continue
1399
1400             if self._downloader.params.get('listformats', None):
1401                 self._print_formats([i[0] for i in turls])
1402                 return
1403
1404             # For now, just pick the highest bitrate
1405             format,rtmp_video_url = turls[-1]
1406
1407             # Get the format arg from the arg stream
1408             req_format = self._downloader.params.get('format', None)
1409
1410             # Select format if we can find one
1411             for f,v in turls:
1412                 if f == req_format:
1413                     format, rtmp_video_url = f, v
1414                     break
1415
1416             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1417             if not m:
1418                 raise ExtractorError(u'Cannot transform RTMP url')
1419             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1420             video_url = base + m.group('finalid')
1421
1422             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1423             info = {
1424                 'id': shortMediaId,
1425                 'url': video_url,
1426                 'uploader': showId,
1427                 'upload_date': officialDate,
1428                 'title': effTitle,
1429                 'ext': 'mp4',
1430                 'format': format,
1431                 'thumbnail': None,
1432                 'description': officialTitle,
1433             }
1434             results.append(info)
1435
1436         return results
1437
1438
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist (escapistmagazine.com).

    Metadata is read from the <meta> tags of the video page.  The og:video
    player URL carries a ``config=`` parameter that points at a
    configuration file; the media URL is taken from that file's playlist.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract the video referenced by *url*.

        Returns a list holding a single info dictionary.

        Raises ExtractorError when *url* does not match _VALID_URL, when the
        configuration file cannot be parsed as JSON, or when it does not
        contain the expected playlist entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        # Description and thumbnail are looked up with fatal=False; the
        # player URL and the title use the helper's default behaviour.
        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # Only the part after the last ' : ' separator is kept as the title.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The media URL is read from the second playlist entry.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to find the video URL in the configuration file')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
1498
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report that the XML manifest is being downloaded."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract the video referenced by *url*.

        Downloads the moogaloop metadata XML for the video id found in the
        URL, then the manifest it points to, and builds the URL of the first
        fragment from the manifest contents.

        Returns a list holding a single info dictionary.

        Raises ExtractorError when *url* does not match _VALID_URL, when a
        download fails, or when an expected XML element or attribute is
        missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download XML manifest: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # From here on video_id is the id stored in the manifest;
            # info['id'] keeps the id taken from the page URL.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: element missing; KeyError: <media> has no url attribute.
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # The manifest id without its last two characters is part of the path.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
1560
1561
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the video referenced by *url*.

        Returns a list holding a single info dictionary.  Raises
        ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (stored percent-encoded in the page)
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail.  The search helper hands back the captured
        # group, so the group has to span the whole URL and not just the
        # trailing file name.
        video_thumbnail = self._search_regex(r'(http://(?:img.*?\.)xvideos\.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
1602
1603
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track page URL is resolved through the api.soundcloud.com
    resolve.json endpoint to obtain the track metadata, including its id.
    The stream definitions for that id are then fetched from api.sndcdn.com
    and the ``http_mp3_128_url`` entry is used as the media URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the id of *video_id* is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract the track referenced by *url*.

        Returns a list holding a single info dictionary.

        Raises ExtractorError when *url* does not match _VALID_URL or when
        the resolve response carries an ``errors`` list.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # slug of the song title (also in the url)
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical track URL from the matched parts before resolving it.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        if 'errors' in info:
            # Surface the reported errors instead of failing on a missing key below.
            messages = u'; '.join(compat_str(err['error_message']) for err in info['errors'])
            raise ExtractorError(u'unable to download video webpage: %s' % messages)

        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
1660
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets.

    The set URL is resolved through the api.soundcloud.com resolve.json
    endpoint; the stream definitions of every track listed in the response
    are then fetched from api.sndcdn.com, yielding one info dictionary per
    track.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the id of *video_id* is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _extract_track(self, track):
        """Download the stream definitions of *track* and build its info dictionary."""
        track_id = track['id']

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

        self.report_extraction(track_id)
        media_url = json.loads(stream_json)['http_mp3_128_url']

        return {
            'id':       track_id,
            'url':      media_url,
            'uploader': track['user']['username'],
            'upload_date':  unified_strdate(track['created_at']),
            'title':    track['title'],
            'ext':      u'mp3',
            'description': track['description'],
        }

    def _real_extract(self, url):
        """Extract every track of the set referenced by *url*.

        Returns a list of info dictionaries, one per track.  When the
        resolve response carries an ``errors`` list, each error is reported
        through the downloader and None is returned.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the slug of the set title come from the URL.
        uploader, slug_title = match.group(1), match.group(2)
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        return [self._extract_track(track) for track in info['tracks']]
1723
1724
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the video referenced by *url*.

        The page stores the media path base64-encoded (and percent-encoded)
        in a ``jsclassref`` assignment; the decoded path is appended to the
        rtmpe base URL.

        Returns a list holding a single info dictionary.

        Raises ExtractorError when *url* does not match _VALID_URL, when the
        ``jsclassref`` value is missing, or when the media file name has no
        extension.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Split on the LAST dot only, so file names containing further dots
        # no longer break the two-value unpacking.
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to determine the extension of %s' % video_filename)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
1767
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding it as UTF-8 if it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a url list or,
        when there is no bitrate info, the url list itself.  With a mapping,
        the entry for *bitrate* is returned; None, 'best' or an unknown
        bitrate select the highest key.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                response = compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            # The response is only a probe; release the connection right away.
            response.close()
            return url

        return None

    def _print_formats(self, formats):
        """Print the available formats, one line per format/bitrate pair."""
        print('Available formats:')
        for fmt in formats:
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the cloudcast referenced by *url*.

        Returns a list holding a single info dictionary, or None when only
        the list of formats was requested.

        Raises ExtractorError when *url* does not match _VALID_URL, when the
        API response cannot be retrieved or parsed, when the requested
        format is not available, or when none of the candidate media URLs
        can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        slug = self._to_text(mobj.group(2))
        file_id = uploader + u'-' + slug

        # construct API request from the matched parts, so the result does
        # not depend on whether the URL ends with a slash
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (uploader, slug)
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (the raw response is a byte string)
        try:
            json_data = json.loads(self._to_text(jsonData))
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: %s' % compat_str(err))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            # Take the first format that offers a reachable URL.
            for format_param in formats:
                file_url = self.check_urls(self.get_urls(formats, format_param))
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            format_param = req_format
            file_url = self.check_urls(self.get_urls(formats, req_format))

        if file_url is None:
            raise ExtractorError(u'Unable to find a working media URL')
        file_url = self._to_text(file_url)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]
1872
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _extract_references(self, info):
        """Run the extractor on every reference in info['list'] and concatenate the results."""
        results = []
        for entry in info['list']:
            assert entry['type'] == 'reference'
            results += self.extract(entry['url'])
        return results

    def _real_extract(self, url):
        """Extract a single video, a whole course, or every course.

        Which of the three applies depends on the ``course`` and ``video``
        groups matched in *url*: both present selects one video, only
        ``course`` selects all video pages linked from the course page, and
        neither selects all course pages linked from the home page.

        Returns a list of info dictionaries.

        Raises ExtractorError when *url* does not match _VALID_URL, when a
        download fails, or when the video metadata XML lacks an expected
        element.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            return self._extract_references(info)
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            # Fetch the page through the same helper as the course page, so
            # the text pattern below is matched against the value that
            # helper returns instead of the raw bytes read from the socket.
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = self._download_webpage(rootURL, info['id'],
                                        errnote=u'Unable to download course info page')

            info['title'] = info['id']

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            return self._extract_references(info)
1968
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract the video referenced by *url*.

        The video page supplies the title, the ``mtvn_uri`` and the default
        playlist id; these are used to request the mediaGen metadata XML,
        whose last <rendition> element provides the media URL.

        Returns a list holding a single info dictionary.

        Raises ExtractorError when *url* does not match _VALID_URL, when the
        metadata cannot be downloaded, when it lists no rendition, or when
        the chosen rendition lacks an expected attribute or its <src>.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # Both values are concatenated into the mediaGen URL below, so they
        # are looked up as required fields rather than with fatal=False.
        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri')

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id')

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            raise ExtractorError(u'No rendition found in video metadata')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # KeyError: missing attribute; AttributeError: no <src> child.
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            # No uploader value is extracted in this method.
            # NOTE(review): the mtv_an meta tag may hold the artist name;
            # confirm before using it here.
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
2029
2030
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com

    A video is delivered as several segments; one info dictionary is
    returned per segment.
    """

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return the ``sid`` string used in download URLs.

        It is the current time in milliseconds followed by two random
        integers.
        """
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character alphabet shuffled deterministically from *seed*.

        The result is a list of single characters.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # Every character occurs once, so removing by position is
            # the same as removing by value.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode *fileId*, a '*'-separated list of indices into the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract all segments of the video referenced by *url*.

        Returns a list of info dictionaries, one per segment.

        Raises ExtractorError when *url* does not match _VALID_URL or when
        the playlist JSON cannot be parsed or lacks an expected entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            video_format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if video_format is None or video_format == 'best':
                if 'hd2' in supported_format:
                    video_format = 'hd2'
                else:
                    video_format = 'flv'
                ext = u'flv'
            elif video_format == 'worst':
                video_format = 'mp4'
                ext = u'mp4'
            else:
                # Any other requested format falls back to flv.
                video_format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][video_format]
            keys = [s['k'] for s in config['data'][0]['segs'][video_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError):
            raise ExtractorError(u'Unable to extract info section')

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 (0-based) of fileid hold the segment number,
        # so fileid[8:10] is replaced with the index as two hex digits.
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
2123
2124
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    IE_NAME = u'xnxx'
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Patterns applied to the downloaded video page.
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The video URL and the title are mandatory; the thumbnail is looked
        up non-fatally. Raises ExtractorError when the URL does not match
        _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        page = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded inside the page.
        flv_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, page, u'video URL'))
        title = self._html_search_regex(self.VIDEO_TITLE_RE, page, u'title')
        thumb = self._search_regex(
            self.VIDEO_THUMB_RE, page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': flv_url,
            'uploader': None,
            'upload_date': None,
            'title': title,
            'ext': 'flv',
            'thumbnail': thumb,
            'description': None,
        }
        return [result]
2163
2164
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Extract the video attached to a Google+ post.

        The post page provides the metadata (upload date, uploader, title)
        and a link to the photo/video page. Among the
        redirector.googlevideo.com links found on that second page, the one
        whose captured size field is numerically largest is selected.

        Returns a one-element list with the info dictionary.
        Raises ExtractorError if the URL does not match _VALID_URL, the
        video page link is missing, or no video link is found.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract upload date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        # (raw string: same pattern, without invalid string escapes)
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes from the video page
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if len(links) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Each link is a (size, url) tuple of strings. The size must be
        # compared as a number: compared as text, "720" would rank above
        # "1080". The URL only serves as a deterministic tie-breaker.
        best_link = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only the URL is needed from here on
        video_url = best_link[1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
2238
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The media URL is built from the path captured by _VALID_URL; title
        and description come from meta tags of the page. Raises
        ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_path = match.group(1)
        page = self._download_webpage(url, video_path)

        # The last path component doubles as id and as fallback title.
        short_id = video_path.rpartition('/')[2]

        og_title = self._html_search_regex(
            r'<meta property="og:title" content="(.*?)"',
            page, 'title', default=short_id)
        title = og_title.replace('NBA.com: ', '')

        # No upload date is extracted: it isn't there in the HTML the
        # server returns to us.
        description = self._html_search_regex(
            r'<meta name="description" (?:content|value)="(.*?)" />',
            page, 'description', fatal=False)

        return [{
            'id': short_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_path + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
2272
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        last = offset + self._JUSTIN_PAGE_LIMIT
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, last))

    def _parse_page(self, url, video_id):
        """Download one API page and convert its clips to info dicts.

        Returns a tuple (number of items in the response, list of info
        dictionaries for the items that have a video file URL).
        Raises ExtractorError if the API answers with something other than
        a list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)

        entries = []
        for clip in response:
            file_url = clip['video_file_url']
            if not file_url:
                # Items without a file are counted but not returned.
                continue
            extension = os.path.splitext(file_url)[1][1:]
            # Keep the date part of start_time and drop its dashes.
            upload_date = re.sub('-', '', clip['start_time'][:10])
            uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            entries.append({
                'id': clip_id,
                'url': file_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', uploader_id),
                'uploader_id': uploader_id,
                'upload_date': upload_date,
                'ext': extension,
            })
        return (len(response), entries)

    def _extract_chapter(self, url, chapter_id, api_base):
        """Build the info dictionary for a chapter URL.

        The archive id is read from the chapter page, matched against the
        archives listed in the chapter XML, and combined with the metadata
        returned by the twitch.tv videos API.
        """
        webpage = self._download_webpage(url, chapter_id)
        found = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
        if not found:
            raise ExtractorError(u'Cannot find archive of a chapter')
        archive_id = found.group(1)

        api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
        chapter_info_xml = self._download_webpage(api, chapter_id,
                                         note=u'Downloading chapter information',
                                         errnote=u'Chapter information download failed')
        doc = xml.etree.ElementTree.fromstring(chapter_info_xml)

        archive = None
        for candidate in doc.findall('.//archive'):
            if archive_id == candidate.find('./id').text:
                archive = candidate
                break
        if archive is None:
            raise ExtractorError(u'Could not find chapter in chapter information')

        video_url = archive.find('./video_file_url').text
        video_ext = video_url.rpartition('.')[2] or u'flv'

        chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
        chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                               note='Downloading chapter metadata',
                               errnote='Download of chapter metadata failed')
        chapter_info = json.loads(chapter_info_json)

        bracket_start = int(doc.find('.//bracket_start').text)
        bracket_end = int(doc.find('.//bracket_end').text)

        # TODO determine start (and probably fix up file)
        #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
        #video_url += u'?start=' + TODO:start_timestamp
        # bracket_start is 13290, but we want 51670615
        self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                        u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

        channel = chapter_info['channel']
        return {
            'id': u'c' + chapter_id,
            'url': video_url,
            'ext': video_ext,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': channel['display_name'],
            'uploader_id': channel['name'],
        }

    def _real_extract(self, url):
        """Return the list of info dictionaries for a channel, video or chapter URL.

        Channel URLs are fetched page by page from the archive API; video
        URLs need a single request; chapter URLs are handled separately.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            return [self._extract_chapter(url, mobj.group('chapterid'), api_base)]
        else:
            paged = False
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        page_size = self._JUSTIN_PAGE_LIMIT
        results = []
        for page_no in itertools.count():
            offset = page_no * page_size
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, page_size))
            page_count, page_info = self._parse_page(page_url, video_id)
            results.extend(page_info)
            # Only a paged listing that returned a full page can have more.
            if not (paged and page_count == page_size):
                break
        return results
2405
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)

        # The pattern captures the src of the second <source> in <video>.
        media_url = self._html_search_regex(
            r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            page, u'video URL', flags=re.DOTALL)

        # Title patterns are passed together: the page heading and <title>.
        title_patterns = (
            r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>',
        )
        title = self._html_search_regex(
            title_patterns, page, 'title', flags=re.DOTALL)

        description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
        }]
2434
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a game on store.steampowered.com."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence re.VERBOSE.
        matched = re.match(cls._VALID_URL, url, re.VERBOSE)
        return matched is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist of the game's videos.

        The video page of the game is downloaded (going through the age
        check URL when the birth date form is shown) and the movie entries,
        titles and thumbnails found on it are paired up in page order.
        """
        game_id = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')

        page_url = self._VIDEO_PAGE_TEMPLATE % game_id
        webpage = self._download_webpage(page_url, game_id)

        age_gate = re.search('<h2>Please enter your birth date to continue:</h2>', webpage)
        if age_gate is not None:
            page_url = self._AGECHECK_TEMPLATE % game_id
            self.report_age_confirmation()
            webpage = self._download_webpage(page_url, game_id)

        self.report_extraction(game_id)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        movie_re = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        title_re = r'<span class="title">(?P<videoName>.+?)</span>'
        thumb_re = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        matches = zip(re.finditer(movie_re, webpage),
                      re.finditer(title_re, webpage),
                      re.finditer(thumb_re, webpage))

        videos = []
        for movie, title_match, thumb_match in matches:
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title_match.group('videoName')),
                'thumbnail': thumb_match.group('thumbnail'),
            })
        return [self.playlist_result(videos, game_id, game_title)]
2489
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dictionary (not wrapped in a list) for url.

        The media URL is derived from the video id alone; title, uploader
        and thumbnail are read from the page, the latter two non-fatally.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'data-title="(?P<title>.+)"', page, u'title')
        uploader = self._html_search_regex(
            r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            page, u'uploader', fatal=False, flags=re.DOTALL)
        thumbnail = self._html_search_regex(
            r'<link rel="image_src" href="(?P<thumb>.*?)"',
            page, u'thumbnail', fatal=False)

        return {
            'id': video_id,
            'url': u'http://tcdn.ustream.tv/video/%s' % video_id,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
2521
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        page = self._download_webpage(url, video_id)

        media_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            page, u'video URL')
        # The extension is guessed from the URL text.
        extension = 'mp4' if 'mp4' in media_url else 'flv'

        title = self._html_search_regex(r"<title>(.*)</title>",
            page, u'title')

        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            page, u'thumbnail', fatal=False)

        # Without a thumbnail, the candy title (when present) replaces
        # the <title> text: sets correct title for WSHH candy video.
        if not thumbnail:
            candy = re.search(r"""candytitles.*>(.*)</span>""", page)
            if candy is not None:
                title = candy.group(1)

        return [{
            'id': video_id,
            'url' : media_url,
            'title' : title,
            'thumbnail' : thumbnail,
            'ext' : extension,
        }]
2561
class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        All metadata comes from the JSON object assigned to gon.show in
        the page. Raises ExtractorError when that JSON cannot be parsed.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        raw_show = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            page, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(raw_show)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        media_url = data['akamai_url'] + '&cbr=256'
        # The extension is taken from the path, ignoring the query string.
        media_path = compat_urllib_parse_urlparse(media_url).path
        extension = media_path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': media_url,
            'ext': extension,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
2595
2596
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for entry in formats:
            print(u'%s\t\t%s'  % (entry['ext'], entry['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for entry in formats:
            if entry["format"] == req_format:
                return entry
        return None

    def _real_extract(self, url):
        """Return the info dictionaries selected by the 'format' parameter.

        Metadata is read from the JSON passed to ``new Video(...)`` in the
        page; one entry is built per link of the download list, in page
        order. With no format or 'best' the first entry is returned, with
        'worst' the last one, with '-1' or 'all' every entry, otherwise the
        entry whose format name equals the request.

        Returns None (after printing the formats) when the 'listformats'
        parameter is set.
        Raises ExtractorError on an invalid URL, invalid or incomplete
        JSON, an empty download list, or an unavailable requested format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # json.loads reports malformed input with ValueError; anything
            # else (e.g. KeyboardInterrupt) must not be swallowed here.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # Format the exception into the message: concatenating it to a
            # string would itself raise a TypeError.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if len(links) == 0:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            # The format name joins the first two "_"-separated fields of
            # the fourth directory, e.g. "480p-370k".
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
2701
2702
2703
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The title is taken from the URL itself; the media URL and the
        upload date (non-fatal) come from the page. Raises ExtractorError
        when the URL does not match _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('videoid')
        title = match.group('title')

        page = self._download_webpage(url, video_id)

        # Media URL, percent-decoded
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        media_url = compat_urllib_parse.unquote(
            self._search_regex(VIDEO_URL_RE, page, u'video url'))

        # Upload date, normalised only when one was found
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(
            VIDEO_UPLOADED_RE, page, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{
            'id': video_id,
            'url': media_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
        }]
2738
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        The title is read from the video page; the media URL is read from
        the embed page linked there, whose numeric id becomes the video id.
        Raises ExtractorError on an invalid URL or a missing embed link.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        page = self._download_webpage(url, match.group('videoid'))

        title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            page, u'title').strip()

        # Locate the embed page and switch to the id it carries
        embed = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', page)
        if embed is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_url = embed.group(0).strip()
        video_id = embed.group('videoid')

        embed_page = self._download_webpage(embed_url, video_id)

        media_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            embed_page, u'video URL')

        return [{
            'id': video_id,
            'url': media_url,
            'title': title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_url,
        }]
2779
class EightTracksIE(InfoExtractor):
    """Information extractor for mixes on 8tracks.com."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return one info dictionary per track of the mix at url.

        Tracks are requested one after another from the play/next API,
        under a randomly chosen session number, until the API reports the
        last track. Raises ExtractorError when the URL does not match
        _VALID_URL.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = match.group('id')

        page = self._download_webpage(url, playlist_id)

        mix_json = self._search_regex(r"PAGE.mix = (.*?);\n", page, u'trax information', flags=re.DOTALL)
        data = json.loads(mix_json)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']

        tracks = []
        track_no = 0
        # The first request starts playback; later ones ask for the track
        # following the one just received.
        request_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        while True:
            track_no += 1
            api_json = self._download_webpage(request_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_no), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track = api_data[u'set']['track']
            tracks.append({
                'id': track['id'],
                'url': track['track_file_stream_url'],
                'title': track['performer'] + u' - ' + track['name'],
                'raw_title': track['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                return tracks
            request_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track['id'])
2820
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list with the info dictionary for url.

        Media and thumbnail URLs are derived from the video id; title and
        uploader (non-fatal) are read from the page.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        uploader = self._html_search_regex(
            r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': u'http://cdn.keek.com/keek/video/%s' % video_id,
            'ext': 'mp4',
            'title': title,
            'thumbnail': u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id,
            'uploader': uploader,
        }]
2848
class TEDIE(InfoExtractor):
    """Information Extractor for ted.com talks and playlists"""
    # Verbose-mode pattern: always match it with re.VERBOSE.
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        match = re.match(cls._VALID_URL, url, re.VERBOSE)
        return match is not None

    def _real_extract(self, url):
        """Dispatch to the talk or the playlist handler depending on the url."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = mobj.group('playlist_id')
        name = mobj.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        item_pattern = r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        link_pattern = r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')

        playlist_title = self._html_search_regex(
            r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
            webpage, 'playlist title')

        # Item matches are paired with link matches only to bound the number
        # of entries; the talk URL itself comes from the link match.
        paired = zip(re.finditer(item_pattern, webpage, re.VERBOSE),
                     re.finditer(link_pattern, webpage))
        playlist_entries = [
            self.url_result('http://www.ted.com%s' % link.group('talk_url'), 'TED')
            for _item, link in paired
        ]
        return self.playlist_result(playlist_entries,
                                    playlist_id=playlist_id,
                                    playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        video_name = re.match(self._VALID_URL, url, re.VERBOSE).group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)

        # If the url includes the language we get the title translated
        title = self._html_search_regex(
            r'<span id="altHeadline" >(?P<title>.*)</span>',
            webpage, 'title')

        # The page embeds the talk details as a JSON object literal.
        raw_details = self._search_regex(
            r'<script.*?>var talkDetails = ({.*?})</script>',
            webpage, 'json data')
        talk_details = json.loads(raw_details)

        description = self._html_search_regex(
            r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
            webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(
            r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
            webpage, 'thumbnail')

        return {
            'id': talk_details['id'],
            # The last entry of htmlStreams is the one used for download.
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
2923
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de"""
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError when the metadata XML lacks a download URL
        or a title element.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag, default=None):
            """Return the text of the child element tag, or default if absent."""
            element = metadata.find(tag)
            if element is None:
                return default
            return element.text

        # extract values from metadata
        video_url = optional_text('url_flv')
        if not video_url:
            # Element missing or empty: there is nothing to download.
            raise ExtractorError(u'Unable to extract download url')
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            # Fall back to the file extension when no format id is given.
            'format': optional_text('format_id', extension),
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }]
2977
class SpiegelIE(InfoExtractor):
    """Information Extractor for spiegel.de videos"""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', webpage, u'title')

        # A per-video XML document lists the available variants.
        xml_code = self._download_webpage(
            u'http://video2.spiegel.de/flash/' + video_id + u'.xml', video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        variants = xml.etree.ElementTree.fromstring(xml_code)

        # The last child element is the variant that gets downloaded.
        chosen = variants[-1]
        filename = chosen.findall('./filename')[0].text
        duration = float(chosen.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            # Extension is whatever follows the last dot of the file name.
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
3009
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com"""

    # The scheme is optional; both http and https are accepted.  (The
    # previous `http?://` only matched "htt://" or "http://".)
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError if url does not match _VALID_URL or a
        mandatory field (video URL, title) cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site prefix from the Open Graph title.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        # Description and uploader are optional metadata.
        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        return [{
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
3046
class ARDIE(InfoExtractor):
    """Information Extractor for the ARD Mediathek"""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError when the title or a usable media stream
        cannot be found in the page.
        """
        # determine video id from url; a numeric documentId takes precedence
        mobj = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else mobj.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_match.group('title')

        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # Explicit checks instead of assert: asserts vanish under -O.
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'No media streams found')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'No stream of the default media type found')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
3085
class ZDFIE(InfoExtractor):
    """Information Extractor for the ZDF Mediathek"""
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError when the URL is invalid or when no stream,
        title, stream URL or extension can be determined.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        # The list is never None; an empty list means nothing matched.
        if not streams:
            raise ExtractorError(u'No media url found.')

        def first_wstreaming(quality):
            """Return the first 'wstreaming' stream of that quality, or None."""
            for s in streams:
                if s['quality'] == quality and s['media_type'] == 'wstreaming':
                    return s
            return None

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # ('hstreaming' - rtsp is not working)
        # Prefer 'veryhigh' (dsl2000mbit), fall back to '300' (dsl1000mbit).
        stream_ = first_wstreaming('veryhigh') or first_wstreaming('300')
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Try the mms:// link first, then an rtsp:// one.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        # Extension is whatever follows the last dot of the stream URL.
        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
3143
class TumblrIE(InfoExtractor):
    """Information Extractor for tumblr.com video posts"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at url."""
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Always fetch the canonical post page, whatever form the url had.
        post_url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(post_url, video_id)

        # The player markup is embedded with quotes escaped as \x22.
        video_pattern = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video_match = re.search(video_pattern, webpage)
        if video_match is None:
            raise ExtractorError(u'Unable to extract video')

        # We pick the first poster
        thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if thumbnail:
            thumbnail = thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_match.group('video_url'),
            'title': title,
            'thumbnail': thumbnail,
            'ext': video_match.group('ext'),
        }]
3177
class BandcampIE(InfoExtractor):
    """Information Extractor for free bandcamp.com tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at url.

        Only tracks offering a free download page are supported.  Raises
        ExtractorError when any of the expected page fragments is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE|re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$',
                            download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        info = json.loads(m_items.group(1))[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url: %s' % initial_url)
        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bundle_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract final download url')
        final_url = m_final.group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
3223
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both fields are mandatory; a failed search aborts the extraction.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [result]
3251
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # All metadata is read from the player's MRSS feed, not the page.
        feed = self._download_webpage(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
        }
        return [result]
3278
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical page for the id rather than the given url.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')

        # Description and thumbnail are optional metadata.
        description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }
        return [result]
3312
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url."""
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical https page for the id.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        media_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')

        # Thumbnail and uploader are optional metadata.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [result]
3346
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url."""
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        uploader_id = url_match.group('uploader_id')

        page = self._download_webpage(
            'http://www.flickr.com/photos/%s/%s' % (uploader_id, video_id),
            video_id)

        # The secret found in the page is required by both follow-up requests.
        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        # Step 1: the first XML document yields the node id.
        first_xml = self._download_webpage(
            'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=%s&secret=%s&bitrate=700&target=_self'
            % (video_id, secret),
            video_id, 'Downloading first data webpage')
        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        # Step 2: the playlist for that node holds the stream location.
        second_xml = self._download_webpage(
            'https://secure.flickr.com/video_playlist.gne?node_id=%s&tech=flash&mode=playlist&bitrate=700&secret=%s&rd=video.yahoo.com&noad=1'
            % (node_id, secret),
            video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # The path attribute is HTML-escaped, the application prefix is not.
        media_url = stream.group(1) + unescapeHTML(stream.group(2))

        title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'uploader_id': uploader_id,
        }
        return [result]
3395
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        # The numeric id is only available inside the page markup.
        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location comes from a separate per-video XML document.
        data = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }
        return [result]
3434
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list with the info dict for the video at url.

        Raises ExtractorError when the media URL or the title cannot be
        extracted.  A missing upload date only triggers a warning.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Fetch the page by id alone, independent of the title part of url.
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            # No server given: 'file' is itself a percent-encoded URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # NOTE: no description is extracted; none was found in the site UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
3486
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at url."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = url_match.group(1)

        # Request the page with the extra query parameters and keep the
        # cookie it sets: the metadata request below sends it back.
        data = { 'ax': 1, 'ts': time.time() }
        query = compat_urllib_parse.urlencode(data)
        page_request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(
            page_request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        # The track list is embedded in the page as a JSON script block.
        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        # From here on the id reported by the site replaces the one from the url.
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        meta_request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        meta_request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(meta_request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        result = {
            'id': track_id,
            'url': song_data[u"url"],
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }
        return [result]
3536
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Extract the video information for a vbox7.com play URL.

        The first page only contains a JavaScript redirect; the target page
        provides the title.  The media and thumbnail URLs come from a POST
        to ``play/magare.do``, whose answer is parsed as two ``name=value``
        fields joined by ``&``.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match ``_VALID_URL`` or
        if the info response cannot be split into the two expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        # The redirect target is appended to the URL that was actually fetched.
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of the <title> before the first '/' is used.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST data has to be bytes: Python 3's urllib raises TypeError for
        # str.  urlencode() output is plain ASCII, so this is a no-op on
        # Python 2.
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('utf-8')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        try:
            # Split on the first '=' only, so that a value which itself
            # contains '=' (e.g. a URL with a query string) is kept whole.
            final_url, thumbnail_url = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (ValueError, IndexError):
            # Wrong number of fields, or a field without '='.
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
3572
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com"""
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Extract the video information for a gametrailers.com URL.

        The page yields an ``mgid`` identifier (found in a different
        attribute for full episodes), which is then used to query the
        ``mrss`` feed for title/description/thumbnail and the ``mediagen``
        feed for the stream URLs.

        Returns a single info dictionary (not wrapped in a list).
        Raises ExtractorError if the URL does not match ``_VALID_URL``, if
        the feed metadata cannot be parsed or if no stream URL is listed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # Compiled with re.VERBOSE, so the whitespace inside the pattern is
        # ignored, and with re.DOTALL, so '.*' also spans line breaks.
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
3623
class StatigramIE(InfoExtractor):
    """Information Extractor for statigr.am"""
    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        """Extract the video information for a statigr.am post URL.

        The video URL and thumbnail are read from the page's Open Graph
        ``<meta>`` tags, the title from ``<title>`` and the uploader id from
        the first ``@name`` token found in the title.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError if the URL does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)
        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        html_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # Drop the trailing site name.  rpartition() yields an empty head
        # when the separator is absent, so fall back to the whole <title>
        # text rather than returning an empty title.
        title = html_title.rpartition(u' | Statigram')[0] or html_title
        uploader_id = self._html_search_regex(
            r'@([^ ]+)', title, u'uploader name', fatal=False)
        ext = 'mp4'

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       ext,
            'title':     title,
            'thumbnail': thumbnail_url,
            'uploader_id' : uploader_id
        }]
3654
def gen_extractors():
    """Build one instance of each supported extractor, as a list.

    The position in the list is significant: a URL is handled by the first
    extractor that matches it.
    """
    ordered_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    # Instantiate in declaration order so that the matching priority is kept.
    return [extractor_class() for extractor_class in ordered_classes]
3724
def get_info_extractor(ie_name):
    """Look up an info extractor class by name.

    ``ie_name`` is the class name without its ``IE`` suffix; the class is
    fetched from this module's global namespace, so an unknown name raises
    KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]