Fix generic class move (add all files)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19 import hashlib
20 import binascii
21 import urllib
22
23 from .utils import *
24
25
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27
28
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; formats missing here fall back to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> 'HEIGHTxWIDTH' string, used only for display in the format list
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL, so give YoutubePlaylistIE priority.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check which subtitles are available."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the subtitles for one language."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    @staticmethod
    def _decrypt_signature(s):
        """Decrypt the key; the two subkeys must have a length of 43."""
        (a, b) = s.split('.')
        if len(a) != 43 or len(b) != 43:
            raise ExtractorError(u'Unable to decrypt signature, subkeys lengths not valid')
        # Character-shuffle scheme reverse-engineered from the player; the
        # result is the reversed join of the truncated, rearranged subkeys.
        b = ''.join([b[:8], a[0], b[9:18], b[-4], b[19:39], b[18]])[0:40]
        a = a[-40:]
        s_dec = '.'.join((a, b))[::-1]
        return s_dec

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping language code -> track name, or an
        (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages to the screen."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download a single subtitle track.

        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _request_automatic_caption(self, video_id, webpage):
        """We need the webpage for getting the captions url, pass it as an
           argument to speed up the process."""
        sub_lang = self._downloader.params.get('subtitleslang') or 'en'
        sub_format = self._downloader.params.get('subtitlesformat')
        self.to_screen(u'%s: Looking for automatic captions' % video_id)
        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
        if mobj is None:
            return [(err_msg, None, None)]
        player_config = json.loads(mobj.group(1))
        try:
            args = player_config[u'args']
            caption_url = args[u'ttsurl']
            timestamp = args[u'timestamp']
            params = compat_urllib_parse.urlencode({
                'lang': 'en',
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            subtitles_url = caption_url + '&' + params
            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
            return [(None, sub_lang, sub)]
        except KeyError:
            # No ttsurl/timestamp in the player config: no ASR captions.
            return [(err_msg, None, None)]

    def _extract_subtitle(self, video_id):
        """
        Download the subtitle track for the requested (or default) language.

        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit option > English > first available.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if sub_lang not in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of
        (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list, tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : extension [dimensions]' line per known format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, optionally log in, and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Anti-CSRF tokens the login form expects back.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being served again means the credentials failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the 11-character-style video ID from any supported URL form.

        Raises ExtractorError if the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Group 2 of _VALID_URL is the ID itself (group 1 is the URL prefix).
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the 'el' variants in turn until one of them
        # returns a response containing a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
            else:
                raise ExtractorError(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            raise ExtractorError(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            raise ExtractorError(u'Unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            raise ExtractorError(u'Unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    # We try with the automatic captions
                    video_subtitles = self._request_automatic_caption(video_id, video_webpage)
                    (sub_error_auto, sub_lang, sub) = video_subtitles[0]
                    if sub is not None:
                        pass
                    else:
                        # We report the original error
                        self._downloader.report_warning(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_warning(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        # For Vevo (and DASH-manifest) videos the stream map in the webpage's
        # player config supersedes the one from get_video_info.
        # BUG FIX: the original condition was
        #     args.get('ptk','') == 'vevo' or 'dashmpd'
        # which is always true because 'dashmpd' is a non-empty string literal;
        # the intended test is membership: 'dashmpd' in args.
        # Also guard mobj against None: re.search failing would previously
        # raise an AttributeError that the except clause did not catch.
        mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
        if mobj is not None:
            try:
                info = json.loads(mobj.group(1))
                args = info['args']
                if args.get('ptk', '') == 'vevo' or 'dashmpd' in args:
                    # Vevo videos with encrypted signatures
                    self.to_screen(u'%s: Vevo video detected.' % video_id)
                    video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
            except (ValueError, KeyError):
                pass

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_map = {}
            for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
                url_data = compat_parse_qs(url_data_str)
                if 'itag' in url_data and 'url' in url_data:
                    url = url_data['url'][0]
                    if 'sig' in url_data:
                        url += '&signature=' + url_data['sig'][0]
                    elif 's' in url_data:
                        signature = self._decrypt_signature(url_data['s'][0])
                        url += '&signature=' + signature
                    if 'ratebypass' not in url:
                        url += '&ratebypass=yes'
                    url_map[url_data['itag'][0]] = url

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
577
578
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and post the age/family-filter form so
        later requests are not blocked by the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # BUG FIX: the webpage (and everything derived from it) is already
        # decoded text, so the previous .decode('utf-8') calls triggered an
        # implicit ASCII re-encode on Python 2 and crashed with a
        # UnicodeDecodeError for any non-ASCII title/uploader (and would be
        # an AttributeError on Python 3). The values are used as-is now.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
674
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return the info dict (in a one-element list) for a Dailymotion URL."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The id is the first path component, stripped of the title suffix
        # and of any query string.
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Request the page with the family filter disabled so that all
        # videos are reachable.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best available quality, probing from highest to lowest.
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((key for key in quality_keys if key in flashvars), None)
        if max_quality is None:
            raise ExtractorError(u'Unable to extract video URL')
        self.to_screen(u'Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            raise ExtractorError(u'Unable to extract video URL')

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(m.group('title'))

        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            # The page shows DD-MM-YYYY; the info dict wants YYYYMMDD.
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
743
744
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extracion process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the Photobucket video info dict for *url*.

        Tries the JSON blob embedded in the page's javascript first and, if
        that is absent, falls back to scraping the <link rel="video_src">
        tag plus the page title.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')

        # Extension comes straight from the URL match ('flv' or 'mp4').
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id':       video_id,
                'url':      info[u'downloadUrl'],
                'uploader': info[u'username'],
                # creationDate is a unix timestamp; normalize to YYYYMMDD.
                'upload_date':  datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title':    info[u'title'],
                'ext':      video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We look in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
            webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # NOTE(review): .decode('utf-8') implies Python 2 byte strings; this
        # path would break on Python 3 str — confirm the supported runtime.
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
        }]
801
802
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract the Yahoo! Screen video info dict for *url*.

        Two backends exist: when the page exposes a YUI CONTENT_ID, the YQL
        media-streams service is queried (JSON); otherwise the legacy
        cosmos.bcst.yahoo.com REST service is scraped with regexes.
        Raises ExtractorError when the URL or any intermediate page cannot
        be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # Legacy path: no CONTENT_ID on the page.
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE | re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # Normalize MM/DD/YYYY to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # BUGFIX: the match must be checked *before* dereferencing it;
            # the original called m_rest.group(...) first, so a failed match
            # raised AttributeError instead of the intended ExtractorError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP wrapper and parse the payload.
            json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        info_dict = {
                     'id': video_id,
                     'url': video_url,
                     'play_path': video_path,
                     'title':video_title,
                     'description': video_description,
                     'thumbnail': video_thumb,
                     'upload_date': video_date,
                     'ext': 'flv',
                     }
        return info_dict
870
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _verify_video_password(self, url, video_id, webpage):
        """Submit the --password value for a password-protected video.

        Raises ExtractorError when no password was supplied or (via
        _download_webpage) when the password is wrong.
        """
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        token = re.search(r'xsrft: \'(.*?)\'', webpage).group(1)
        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            pass_url = url.replace('https', 'http')
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url + '/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        self._download_webpage(password_request, video_id,
                               u'Verifying the password',
                               u'Wrong password')

    def _real_extract(self, url, new_video=True):
        """Extract the Vimeo video info dict (in a one-element list)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            # Player/pro URLs must be canonicalized to the main site.
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # BUGFIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit; narrow it to Exception.
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                # Password accepted: retry the whole extraction.
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
995
996
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body (bytes on Python 2)."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and return the named groups as a dict.

        ``matchTuples`` is a list of (group_index, key, error_message)
        triples; each missing group raises ExtractorError with its message.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Chase the live-stream javascript to locate the rtmp URL.

        NOTE(review): the computed video_url is assigned but never returned
        or stored, so this method effectively returns None and live streams
        are not actually downloaded — presumably unfinished; confirm intent.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page to its video XML and return the info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  unified_strdate(info.get('date')),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        # The last path component doubles as the video id.
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            # Live streams: extractLiveStream yields no info dict (see note
            # above), so nothing is returned for them.
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1116
1117
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # Warn once (except in test mode) that we are guessing, then defer
        # to the normal reporting.
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Returns False when *url* is already final.  Uses HEAD requests so no
        page body is transferred while resolving the redirect chain.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-specific headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue the request as a plain GET through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        # NOTE(review): a bare OpenerDirector is used (not build_opener), so
        # only the handlers listed here are installed.
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then probe the page for
        common embedded-player patterns (JW Player, twitter cards, Open
        Graph) to locate a direct video URL."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
1257
1258
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        video_ids = []
        limit = n
        pagenum = 0

        # Fetch 50-result pages until we reach either the requested count or
        # the total reported by the API.
        while 50 * pagenum < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), 50 * pagenum + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the true total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                  for vid in video_ids[:n]]
        return self.playlist_result(videos, query)
1300
1301
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # The playlist holds url-type entries collected page by page.
        entries = []
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': entries
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum))

            entries.extend({'_type': 'url', 'url': match.group(1)}
                           for match in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage))

            # Stop once enough results were seen or no next-page link exists.
            if pagenum * 10 > n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
1332
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        for pagenum in itertools.count(0): 
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            if not results:
                # Bug fix: an empty result page used to leave the loop
                # variable `i` unbound, crashing in the stop condition below.
                break

            # `i` is also read after the loop, so keep it initialized.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                if mobj is None:
                    # Bug fix: skip entries whose markup carries no video
                    # link instead of crashing on mobj.group().
                    continue
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            # Stop when n results were seen or the API reports the last page.
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
                break

        return res
1366
1367
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matches playlist/course/channel-uploads URLs, and also bare
    # PL/EC/UU-prefixed playlist ids passed on their own.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    # GData v2 JSON endpoint; the %i slots are (page size, 1-based start index).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, so the base-class
        # suitable() (which matches without re.VERBOSE) cannot be used.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []  # collects (playlist position, watch-page url) pairs

        while True:
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                # Entries without media$player (e.g. deleted/private videos)
                # are silently skipped.
                if 'media$group' in entry and 'media$player' in entry['media$group']:
                    videos.append((index, entry['media$group']['media$player']['url']))

            # A short page means the playlist is exhausted.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Restore playlist order, then drop the position component.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1434
1435
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from a channel page, deduplicated
        in first-occurrence order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        channel_id = mobj.group(1)

        # The first page is plain HTML.
        pagenum = 1
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        video_ids = self.extract_videos_from_page(page)

        # Further pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1493
1494
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # The GData API serves at most _GDATA_PAGE_SIZE ids per request,
        # so consecutive windows are fetched until a partial page arrives.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # Collect the page's ids, deduplicated in first-occurrence order.
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one, so there is
            # no need for a further query.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1551
1552
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # The numeric users_id needed by the episode-list API is embedded
        # in the user page markup.
        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)

        # The Ajax episode list serves at most _PAGE_SIZE ids per request;
        # keep paging until a partial (non-full) page signals the end.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Collect this page's ids, deduplicated in first-occurrence order.
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))
            video_ids.extend(ids_in_page)

            # A page that is not "full" must be the last one.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        url_entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, playlist_title = username)]
1611
1612
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL and title for a depositfiles file.

        Raises ExtractorError when the page cannot be retrieved or when no
        download URL is present (e.g. a download restriction is in force).
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            # Bug fix: decode the response once here; the page bytes would
            # otherwise break the str-pattern regexes below on Python 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # Bug fix: the former str.decode('utf-8') calls on these values
        # crashed on Python 3, where str has no decode() method; the
        # values are already text at this point.
        return [{
            'id':       file_id,
            'url':      file_url,
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension,
        }]
1657
1658
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optionally log in before extraction, using --username/--password
        # or the .netrc entry for the 'facebook' machine; extraction stays
        # anonymous when no credentials are configured.
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Credential problems are non-fatal: warn and stay anonymous.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters are embedded as JSON between these two
        # javascript fragments of the watch page.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1753
1754
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    # Extracts the extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page whose fragment carries the real
            # file id; follow the redirect and re-run extraction on it.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask for the JSON metadata of the video page.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes user agent is required for the JSON API to respond.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The metadata may be wrapped in a 'Post' envelope or be the
                # top-level object itself.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
1852
1853
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt *data* (bytes) with *key* (bytes); returns a str."""
        x = 0
        box = list(range(256))
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Return the hex MD5 digest of *s* as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded site key used to derive the RC4 key below.
        GK = (
          b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
          b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
          b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy case: a plain <source> element on the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id':       video_id,
                'url':      video_url,
                'uploader': None,
                'upload_date':  None,
                'title':    video_title,
                'ext':      u'flv',
            }]

        # Hard case: the stream data is RC4-encrypted in an XML blob
        # referenced from the flashvars ("encxml").
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            # The MTV player variant serves unusable data; force the D player.
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # get enc data: hex blob after '=' in the response, RC4 key derived
        # from md5(base64^-2(GK) + md5(video_id)).
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # Bug fix: the original referenced an undefined name
            # (video_filepath) here, raising NameError for every f4m stream.
            # The stream path travels in dec_data next to the source entry;
            # presumably it is the base for the HLS playlist URL — the
            # f4m -> m3u8 substitution below matches that layout.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'path')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id':                 video_id,
            'url':                video_url,
            'tc_url':             video_url,
            'uploader':           None,
            'upload_date':        None,
            'title':              video_title,
            'ext':                u'flv',
            'play_path':          video_playpath,
            'video_file':         video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url':         video_swfobj,
        }]
2002
2003
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrate labels the site offers, highest first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container format per bitrate label.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display resolution per bitrate label (informational only).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }
2039
2040     @classmethod
2041     def suitable(cls, url):
2042         """Receives a URL and returns True if suitable for this IE."""
2043         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2044
2045     def _print_formats(self, formats):
2046         print('Available formats:')
2047         for x in formats:
2048             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2049
2050
2051     def _real_extract(self, url):
2052         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2053         if mobj is None:
2054             raise ExtractorError(u'Invalid URL: %s' % url)
2055
2056         if mobj.group('shortname'):
2057             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2058                 url = u'http://www.thedailyshow.com/full-episodes/'
2059             else:
2060                 url = u'http://www.colbertnation.com/full-episodes/'
2061             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2062             assert mobj is not None
2063
2064         if mobj.group('clip'):
2065             if mobj.group('showname') == 'thedailyshow':
2066                 epTitle = mobj.group('tdstitle')
2067             else:
2068                 epTitle = mobj.group('cntitle')
2069             dlNewest = False
2070         else:
2071             dlNewest = not mobj.group('episode')
2072             if dlNewest:
2073                 epTitle = mobj.group('showname')
2074             else:
2075                 epTitle = mobj.group('episode')
2076
2077         self.report_extraction(epTitle)
2078         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
2079         if dlNewest:
2080             url = htmlHandle.geturl()
2081             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2082             if mobj is None:
2083                 raise ExtractorError(u'Invalid redirected URL: ' + url)
2084             if mobj.group('episode') == '':
2085                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
2086             epTitle = mobj.group('episode')
2087
2088         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2089
2090         if len(mMovieParams) == 0:
2091             # The Colbert Report embeds the information in a without
2092             # a URL prefix; so extract the alternate reference
2093             # and then add the URL prefix manually.
2094
2095             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2096             if len(altMovieParams) == 0:
2097                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
2098             else:
2099                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2100
2101         uri = mMovieParams[0][1]
2102         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2103         indexXml = self._download_webpage(indexUrl, epTitle,
2104                                           u'Downloading show index',
2105                                           u'unable to download episode index')
2106
2107         results = []
2108
2109         idoc = xml.etree.ElementTree.fromstring(indexXml)
2110         itemEls = idoc.findall('.//item')
2111         for partNum,itemEl in enumerate(itemEls):
2112             mediaId = itemEl.findall('./guid')[0].text
2113             shortMediaId = mediaId.split(':')[-1]
2114             showId = mediaId.split(':')[-2].replace('.com', '')
2115             officialTitle = itemEl.findall('./title')[0].text
2116             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
2117
2118             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2119                         compat_urllib_parse.urlencode({'uri': mediaId}))
2120             configXml = self._download_webpage(configUrl, epTitle,
2121                                                u'Downloading configuration for %s' % shortMediaId)
2122
2123             cdoc = xml.etree.ElementTree.fromstring(configXml)
2124             turls = []
2125             for rendition in cdoc.findall('.//rendition'):
2126                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2127                 turls.append(finfo)
2128
2129             if len(turls) == 0:
2130                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2131                 continue
2132
2133             if self._downloader.params.get('listformats', None):
2134                 self._print_formats([i[0] for i in turls])
2135                 return
2136
2137             # For now, just pick the highest bitrate
2138             format,rtmp_video_url = turls[-1]
2139
2140             # Get the format arg from the arg stream
2141             req_format = self._downloader.params.get('format', None)
2142
2143             # Select format if we can find one
2144             for f,v in turls:
2145                 if f == req_format:
2146                     format, rtmp_video_url = f, v
2147                     break
2148
2149             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2150             if not m:
2151                 raise ExtractorError(u'Cannot transform RTMP url')
2152             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2153             video_url = base + m.group('finalid')
2154
2155             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2156             info = {
2157                 'id': shortMediaId,
2158                 'url': video_url,
2159                 'uploader': showId,
2160                 'upload_date': officialDate,
2161                 'title': effTitle,
2162                 'ext': 'mp4',
2163                 'format': format,
2164                 'thumbnail': None,
2165                 'description': officialTitle,
2166             }
2167             results.append(info)
2168
2169         return results
2170
2171
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Extract video info from an Escapist video page.

        Downloads the page, reads description/thumbnail/player URL/title
        from meta tags, then fetches the player configuration (a
        JavaScript object literal) referenced by the player URL to obtain
        the actual video URL.

        Raises ExtractorError on invalid URLs, failed downloads, or an
        unparseable configuration.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # Fixed: this field was mislabeled u'player url' (copy-paste), which
        # produced a misleading error message when the title was missing.
        # The page title looks like "Show : Episode"; keep the last segment.
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON: the single quotes must be
        # rewritten before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
2231
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract metadata and the f4f fragment URL for a video.

        Downloads the moogaloop metadata XML for the video id, then the
        Adobe HDS (f4m) manifest it references, and derives the final
        Seg1-Frag1 fragment URL from the manifest's media node.

        Raises ExtractorError on invalid URLs, failed downloads, or
        malformed XML.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Fixed: this message previously duplicated the metadata-XML
            # error above, although it concerns the manifest download.
            raise ExtractorError(u'Unable to download video manifest XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError:  # dropped unused `as err` binding
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        # The fragment URL is rebuilt on the manifest's host; the last two
        # characters of the manifest id are dropped before the node id.
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2293
2294
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from a video page."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The flv URL is stored percent-encoded in the page source.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(r'flv_url=(.+?)&', webpage, u'video URL'))

        # Strip the trailing " - XVID..." suffix from the page title.
        video_title = self._html_search_regex(
            r'<title>(.*?)\s+-\s+XVID', webpage, u'title')

        video_thumbnail = self._search_regex(
            r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2335
2336
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com tracks.

    The track URL is resolved through the public resolve.json API, after
    which the stream definitions endpoint yields a direct mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report that the track id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a one-element list with the track's info dict."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the track slug live in the URL path.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical track URL into its JSON description.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        track = json.loads(info_json)
        track_id = track['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions and pick the 128 kbit/s mp3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        stream_defs = json.loads(stream_json)

        return [{
            'id':       track['id'],
            'url':      stream_defs['http_mp3_128_url'],
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title':    track['title'],
            'ext':      u'mp3',
            'description': track['description'],
        }]
2393
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    The set URL is resolved through the public resolve.json API; each
    track in the set is then queried for its stream definitions to get a
    direct mp3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report that the set id is being resolved."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Return a list of info dicts, one per track in the set."""
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug live in the URL path.
        uploader, slug_title = match.group(1), match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the canonical set URL into its JSON description.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)
        tracks = []
        for track in info['tracks']:
            track_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(track_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, track_id, u'Downloading track info JSON')

            self.report_extraction(track_id)
            stream_defs = json.loads(stream_json)

            tracks.append({
                'id':       track_id,
                'url':      stream_defs['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return tracks
2456
2457
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP video URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media path is stored base64-encoded in a JS variable.
        match = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if match is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(match.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # The last path component carries both the id and the extension.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2500
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData maps format -> either a {bitrate: url_list} dict or a
        plain url_list; 'best' (or an unknown bitrate) selects the highest
        available bitrate when bitrate info exists.
        """
        file_url = None
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each URL with a GET; the first one that does not raise wins.
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        # Print "format<TAB>bitrate<TAB>[extension]" for each entry; falls
        # back to '??' for the bitrate when the format has no bitrate dict.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract a single cloudcast's audio URL and metadata.

        Queries the api/1/cloudcast JSON endpoint, then walks the
        'audio_formats' section to find the first reachable URL for the
        requested (or best) format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a regex group only works on
        # Python 2 (bytes input); on Python 3 this raises AttributeError.
        # Presumably related to _WORKING = False — confirm before re-enabling.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)
        bitrate = None

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try every format until one of its URLs responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
2605
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page, or the root.

        - course + video: download the video's XML descriptor and return a
          one-element list with its info dict.
        - course only: scrape the course page for VideoPage links and
          recursively extract each one.
        - neither: scrape the home page for CoursePage links and recursively
          extract each course (which in turn extracts its videos).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            # Extension is whatever follows the last dot in the video URL.
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Deduplicate the video links while preserving their page order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            # Recurse into each video page via the normal extraction entry point.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Deduplicate the course links while preserving their page order.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            # Recurse into each course page via the normal extraction entry point.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
2701
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract video info from an mtv.com video page.

        Reads the mtv_vt (song name) and mtv_an (performer) meta tags,
        downloads the mediaGen XML and picks the highest-quality rendition.

        Raises ExtractorError on invalid URLs, failed downloads, or a
        malformed rendition entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        # Fixed: the mtv_an meta tag holds the performer (artist) name. It
        # was previously extracted into video_title, leaving `performer`
        # undefined below and crashing with a NameError.
        performer = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'performer')

        # Compose "<performer> - <song>" when a song name is available.
        if song_name:
            video_title = performer + ' - ' + song_name
        else:
            video_title = performer

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
2762
2763
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id from the clock plus two random ints."""
        millis = int(time.time() * 1000)
        rand_a = random.randint(1000, 1998)
        rand_b = random.randint(1000, 9999)
        return "%d%d%d" % (millis, rand_a, rand_b)

    def _get_file_ID_mix_string(self, seed):
        """Produce the seed-dependent permutation of the id alphabet.

        Youku scrambles file ids with a linear-congruential shuffle; this
        reproduces the player's character ordering for the given seed.
        """
        alphabet = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        # Draw characters one at a time, removing each pick from the pool
        # (all characters are unique, so pop(index) == remove(char)).
        while alphabet:
            seed = (seed * 211 + 30031) % 65536
            pick = int(math.floor(seed / 65536 * len(alphabet)))
            shuffled.append(alphabet.pop(pick))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(idx)] for idx in fileId.split('*') if idx)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            supported_format = list(entry['streamfileids'].keys())

            # Map the user's request onto Youku's stream names; anything we
            # do not recognize falls back to flv.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [seg['k'] for seg in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id carry the segment number, so
        # they are re-written per segment below.
        files_info = []
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
2856
2857
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Fetch the watch page and pull out flv URL, title and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The flv URL sits percent-encoded inside the player parameters.
        video_url = compat_urllib_parse.unquote(
            self._search_regex(self.VIDEO_URL_RE, webpage, u'video URL'))

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
2896
2897
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1: the post page carries all the metadata.
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Upload date, normalised to YYYYMMDD for filename use.
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Title: only the first line of the post description is used.
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2: follow the photo page that hosts the actual video links
        # (this simulates clicking the image box to launch the player).
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Collect (resolution, url) pairs for every available size.
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Highest resolution sorts last; keep only the URL member of the pair.
        video_url = sorted(mobj)[-1][-1]

        # Undo \u0026-style escaping (the bytes round-trip is the Python 3 path).
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
2971
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The media file lives on Turner's CDN under the same path as the page.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        return [{
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }]
3005
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a bare channel (paged archive), /b/<id> (single
    # broadcast) and /c/<id> (a chapter within a broadcast).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    # Page size used when walking a channel archive through the API.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Fetch one API page and convert its clips into info dicts.

        The API answers with a JSON list of clips; any non-list response is
        an error object whose 'error' key carries the message.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            # Clips without a file URL are skipped (hence "*valid* items"
            # above); the raw response length is still reported for paging.
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time arrives as ISO YYYY-MM-DD...; strip the dashes.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Dispatch on URL shape: channel archive, chapter, or single video."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # Whole-channel archive: page through it below.
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            # Chapter URL: resolve the chapter to its parent archive first.
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # for/else: 'a' deliberately leaks out of the loop and is used
            # below; the else-branch fires only when no archive matched.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer metadata (title, preview, channel) comes from Twitch's
            # newer kraken API, keyed by 'c' + chapter id.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            # Single broadcast: one API page will do, no paging.
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Walk the API; for non-paged requests the loop exits after one pass.
        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3138
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading, falling back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }]
3167
class SteamIE(InfoExtractor):
    """Information extractor for Steam store video pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE must be passed
        # explicitly instead of relying on the base-class matcher.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # If Steam shows the age gate, re-request with a canned birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Movie entries, their display titles and their thumbnails appear in
        # the same order on the page, so the three iterators are zipped.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        videos = []
        for vid, vtitle, thumb in zip(re.finditer(urlRE, webpage),
                                      re.finditer(namesRE, webpage),
                                      re.finditer(thumbsRE, webpage)):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': thumb.group('thumbnail'),
            })
        return [self.playlist_result(videos, gameID, game_title)]
3222
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Recorded videos map directly onto the tcdn flv store by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): this extractor returns a bare info dict rather than a
        # list, unlike its siblings; preserved as-is.
        return {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
3254
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Container is inferred from the file URL itself.
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            mobj = re.search(r"""candytitles.*>(.*)</span>""", webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
3294
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JSON assignment onto window.gon.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3328
3329
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first format dict whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            # BUG FIX: was a bare `except:`, which swallowed everything
            # including KeyboardInterrupt; json.loads signals bad input
            # with ValueError.
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError as err:
            # BUG FIX: the old message concatenated a str with the exception
            # object itself (TypeError at raise time); format it instead.
            raise ExtractorError(u'Missing JSON parameter: %s' % err)

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>"; the first two
            # parts form the human-readable format label (e.g. "480p-370k").
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: previously tested the undefined name `result`, which
            # raised NameError whenever a specific format was requested;
            # the looked-up format itself must be checked.
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
3434
3435
3436
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is already encoded in the URL itself.
        video_title = mobj.group('title')

        webpage = self._download_webpage(url, video_id)

        # The (url-encoded) flv location sits in the player parameters.
        video_url = self._search_regex(
            r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",',
            webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        upload_date = self._html_search_regex(
            r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by',
            webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        return [{'id': video_id,
                 'url': video_url,
                 'uploader': None,
                 'upload_date': upload_date,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv'}]
3471
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # The actual media lives behind a separate embed page.
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # Switch to the embed page's own id from here on.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'ext': 'flv',
                 'format': 'flv',
                 'player_url': embed_page_url}]
3512
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS object literal.
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session token; the play API requires one per listening session.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The API only reveals one track at a time; keep requesting /next
        # until it reports the last track of the mix.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # /next must be told which track we just played.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3553
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # Media and thumbnail URLs can be derived directly from the video id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        page = self._download_webpage(url, video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title')
        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3581
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    Matches two URL shapes: a single talk (/talks/...) and a playlist
    (/playlists/<id>/...); _real_extract dispatches on which one matched.
    """
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be compiled with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch: single talk vs playlist, based on which alternative matched.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # One <li> per talk; the id groups are captured but only the talk URL
        # (matched separately by video_name_RE) is ultimately used.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each talk is delegated back to this extractor via url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        # Talk details are embedded as a JS object literal; parse it as JSON.
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last entry of 'htmlStreams' is used as the download URL.
        info = {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
        return info
3656
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    The video id is taken from the URL path and resolved through the site's
    XML metadata endpoint, which supplies the flv URL, title and optional
    format/description/thumbnail fields.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # Mandatory fields: download URL and title.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        # Optional fields fall back to sensible defaults.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch previously read the undefined name 'ext'
            # (NameError); fall back to the extension from the download URL.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
3710
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de video pages."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        page = self._download_webpage(url, video_id)
        title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            page, u'title')

        # Stream variants are listed in a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the document.
        variant = idoc[-1]
        filename = variant.findall('./filename')[0].text
        duration = float(variant.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': title,
            'duration': duration,
        }]
3742
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('video_id')
        page = self._download_webpage(url, video_id)

        # Direct stream URL from the embedded player configuration.
        stream_url = self._search_regex(r'file: "(.*?)",',
            page, u'video URL')

        # og:title carries a site prefix which we strip off.
        title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            page, u'title').replace('LiveLeak.com -', '').strip()

        description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            page, u'description', fatal=False)

        uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            page, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': stream_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'uploader': uploader,
        }]
3779
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (ardmediathek.de)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page heading; also used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # One mediaCollection.addMediaStream(...) JS call per available stream.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # A page with no streams is assumed to be age-restricted ("fsk").
            assert '"fsk"' in html
            raise ExtractorError(u'This video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
3818
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek.

    Scrapes the stream list from the page, picks a wstreaming variant
    (preferring 'veryhigh' over '300'), then resolves the playlist URL to
    an mms:// or rtsp:// media URL.
    """
    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # Bug fix: the list comprehension never yields None, so the old
        # `streams is None` check could never fire; test for emptiness.
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # choose first/default media type and highest quality for now
        # Bug fix: initialize so a missing match raises ExtractorError below
        # instead of UnboundLocalError.
        stream_ = None
        for s in streams:        #find 300 - dsl1000mbit
            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
                stream_ = s
                break
        for s in streams:        #find veryhigh - dsl2000mbit
            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
                stream_ = s
                break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # The playlist document references the media by mms://, or failing
        # that by rtsp://.
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            # Typo fixed in the error message ('extention' -> 'extension').
            raise ExtractorError(u'Cannot extract extension')
        ext = mobj.group('ext')

        return [{'id': video_id,
                 'url': mms_url,
                 'title': title,
                 'ext': ext
                 }]
3876
class TumblrIE(InfoExtractor):
    """Information extractor for video posts on Tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded as a JS-escaped string (\x22 == '"'),
        # hence the doubled backslashes in the pattern.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
           raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        # Unescape the JS string escapes in the thumbnail URL.
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
3910
class BandcampIE(InfoExtractor):
    """Information extractor for free Bandcamp tracks.

    Only tracks exposing a free-download page can be extracted; the final
    mp3-320 URL is obtained by rebuilding the statdownload request that the
    site's own download script performs.
    """
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin.
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        # We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title' : info[u'title'],
                      'ext' :   'mp3',
                      'url' :   final_url,
                      'thumbnail' : info[u'thumb_url'],
                      'uploader' :  info[u'artist']
                      }

        return [track_info]
3956
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = match.group('id')
        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The page exposes a direct mp4 <source> element.
        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            page, u'video URL')
        title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            page, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }]
3984         
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Metadata comes from the player's MRSS feed, not the HTML page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        feed = self._download_webpage(mrss_url, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            feed, u'video URL')
        title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            feed, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
        }]
4011
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the id.
        page = self._download_webpage('http://www.howcast.com/videos/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
        }]
4045
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')

        # Always fetch the canonical page for the id.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)
        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
4079
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos.

    Resolves the stream in three steps: the photo page yields a secret,
    the first XML endpoint yields a node id, and the playlist endpoint
    yields the APP/FULLPATH pair that forms the final URL.
    """
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Per-photo secret required by the video API endpoints below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final URL is APP + unescaped FULLPATH from the playlist XML.
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
4128
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = match.group('url_title')
        page = self._download_webpage(url, url_title)

        # The numeric id is embedded in the article markup.
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The stream URL is served from a separate XML document.
        data = self._download_webpage('http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': description,
        }]
4167
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        # Normalize to a canonical page URL built from the id only.
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        # Player config holds either a full file URL or a server/file pair.
        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        # Upload date appears in a tooltip as YYYY-MM-DD hh:mm:ss TZ;
        # recombine the date parts into YYYYMMDD.
        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
4219
class HypemIE(InfoExtractor):
    """Information Extractor for hypem (hypem.com track pages)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Fetch the track-list JSON from the page, then ask the serve
        endpoint for the final MP3 URL (the cookie from the first request
        is required by the second one)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Append the cache-busting parameters the site's own player sends.
        complete_url = url + "?" + compat_urllib_parse.urlencode(
            {'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id,
            u'Downloading webpage with the url')
        # The serve endpoint below refuses requests without this cookie.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track = json.loads(html_tracks)[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "",
            {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id':       track_id,
            'url':      song_data[u"url"],
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
4269
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7 (vbox7.com)."""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Follow the JS redirect on the play page, then POST to magare.do
        for the media and thumbnail URLs."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page immediately redirects via a window.location assignment.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';',
            redirect_page, u'redirect location')
        webpage = self._download_webpage(urlh.geturl() + new_location, video_id,
            u'Downloading redirect page')

        # The page title is "<video title>/<extra>"; keep only the first part.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The flash info endpoint answers with "url=...&thumb=..." pairs.
        info_request = compat_urllib_request.Request(
            "http://vbox7.com/play/magare.do",
            compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}))
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id,
            u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = [field.split('=')[1]
                                    for field in info_response.split('&')]

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       "flv",
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
4305
class GametrailersIE(InfoExtractor):
    """Information extractor for gametrailers.com videos, reviews and full episodes."""
    # Host dots escaped so the hostname is matched literally (was "www.game...").
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Resolve the MTV-style mgid for the page, fetch the mrss/mediagen
        feeds and pick the best-quality media URL.

        Raises ExtractorError when the mgid, the video info or the media
        URLs cannot be extracted.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # Full episodes embed the mgid in a different attribute than clips/reviews.
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # BUG FIX: this raised "ExtractError", a non-existent name, so a
        # missing media URL crashed with NameError instead of the intended
        # ExtractorError; the message also read "extrat". The dead
        # "is None" test on a list is dropped as well.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
4356
class StatigramIE(InfoExtractor):
    """Information Extractor for Statigram (statigr.am) media pages."""
    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        """Scrape the og: meta tags and page title of a Statigram page."""
        video_id = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        page_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # Drop the trailing " | Statigram" suffix from the page title.
        title = page_title.rpartition(u' | Statigram')[0]
        # The title normally begins with "@username"; reuse it as uploader id.
        uploader_id = self._html_search_regex(
            r'@([^ ]+)', title, u'uploader name', fatal=False)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     title,
            'thumbnail': thumbnail_url,
            'uploader_id' : uploader_id
        }]
4387
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # More specific extractors come first; GenericIE is the catch-all and
    # must stay last. One fresh instance of each class per call.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVIE,
        BlipTVUserIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        ZDFIE,
        TumblrIE,
        BandcampIE,
        RedTubeIE,
        InaIE,
        HowcastIE,
        VineIE,
        FlickrIE,
        TeamcocoIE,
        XHamsterIE,
        HypemIE,
        Vbox7IE,
        GametrailersIE,
        StatigramIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4457
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Look up the "<Name>IE" class in this module's namespace.
    return globals()['%sIE' % ie_name]