2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # The downloader can also be attached later via set_downloader().
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Stored for later use by to_screen() and the extraction methods.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): the return below belongs to the IE_NAME property
        # (strips the trailing "IE" from the class name); its `def`/property
        # lines are not visible in this excerpt.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None appears to fall back to the generic "Downloading webpage"
        # message, while note=False suppresses output entirely.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Sniff the charset from the Content-Type header, e.g.
        # "text/html; charset=utf-8".
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Plain URL strings have no get_full_url(); Request objects do.
            url = url_or_request.get_full_url()
            except AttributeError:
            # Base64 keeps the dump safe to print regardless of content.
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on bytes invalid in the sniffed encoding.
        return webpage_bytes.decode(encoding, 'replace')
147 def to_screen(self, msg):
148 """Print msg to screen, prefixing it with '[ie_name]'"""
149 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
151 def report_extraction(self, id_or_name):
152 """Report information extraction."""
153 self.to_screen(u'%s: Extracting information' % id_or_name)
155 def report_download_webpage(self, video_id):
156 """Report webpage download."""
157 self.to_screen(u'%s: Downloading webpage' % video_id)
159 def report_age_confirmation(self):
160 """Report attempt to confirm age."""
161 self.to_screen(u'Confirming age')
    # Helpers for issue #608: they set the correct value of the '_type' key
    # on the result dictionaries they produce.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # id/title are only attached when the caller supplied them.
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    # Forces the English interface so later page regexes match reliably.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url= redirect (age gate etc.).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
248 def suitable(cls, url):
249 """Receives a URL and returns True if suitable for this IE."""
250 if YoutubePlaylistIE.suitable(url): return False
251 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
253 def report_lang(self):
254 """Report attempt to set language."""
255 self.to_screen(u'Setting language')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
261 def report_video_webpage_download(self, video_id):
262 """Report attempt to download video webpage."""
263 self.to_screen(u'%s: Downloading video webpage' % video_id)
265 def report_video_info_webpage_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Downloading video info webpage' % video_id)
269 def report_video_subtitles_download(self, video_id):
270 """Report attempt to download video info webpage."""
271 self.to_screen(u'%s: Checking available subtitles' % video_id)
273 def report_video_subtitles_request(self, video_id, sub_lang, format):
274 """Report attempt to download video info webpage."""
275 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
277 def report_video_subtitles_available(self, video_id, sub_lang_list):
278 """Report available subtitles."""
279 sub_lang = ",".join(list(sub_lang_list.keys()))
280 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
282 def report_information_extraction(self, video_id):
283 """Report attempt to extract video information."""
284 self.to_screen(u'%s: Extracting video information' % video_id)
286 def report_unavailable_format(self, video_id, format):
287 """Report extracted video URL."""
288 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290 def report_rtmp_download(self):
291 """Indicate the download will use the RTMP protocol."""
292 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle track list for *video_id*.

        Returns a dict mapping language codes to track names; on failure
        returns an (error_message, None) tuple instead.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # The listing is XML; scrape (name, lang_code) pairs with a regex.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # An empty body means the API had no captions for this track.
        return (u'Did not fetch video subtitles', None, None)
        # Success: (None error, language code, subtitle text).
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then the
        # first available track.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """List each format's itag, container extension and dimensions."""
        print('Available formats:')
        # NOTE(review): x is presumably bound by a `for x in formats` loop
        # whose line is not visible in this excerpt.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the UI language, optionally log in, and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force English so later page-scraping regexes match reliably.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's login form requires the GALX and dsh hidden fields.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle Unicode values correctly.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the YouTube video id from *url* using _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the error call is presumably guarded by
        # `if mobj is None` in the full source.
        self._downloader.report_error(u'invalid URL: %s' % url)
        # Group 2 of _VALID_URL captures the bare video id.
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Download the watch page and video info, pick formats and build
        the result dictionaries."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize the separators, then try several formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # Fall back to the meta description tag.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

        # subtitles
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            self._downloader.report_error(sub_error)
        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)
        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Map itag -> playable URL (with the signature appended).
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Fetched once so the family-filter cookie can be disabled.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
691 def report_disclaimer(self):
692 """Report disclaimer retrieval."""
693 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Accept Metacafe's disclaimer and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Scrape the Metacafe watch page and return the result dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: parse the flashvars blob for mediaURL and key.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.report_error(u'unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        self._downloader.report_error(u'unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # (?i): Dailymotion serves many country TLDs (dailymotion.com/.fr/...).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        """Scrape the Dailymotion page and return the result dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        # Strip title slug and query string from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter to reach age-restricted content.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self.to_screen(u'Using %s' % key)
        self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.report_warning(u'unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only flv media referenced via the ?current= parameter are handled.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
    def _real_extract(self, url):
        """Scrape the Photobucket page and return the result dicts."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the page <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
924 class YahooIE(InfoExtractor):
925 """Information extractor for video.yahoo.com."""
928 # _VALID_URL matches all Yahoo! Video URLs
929 # _VPAGE_URL matches only the extractable '/watch/' URLs
930 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
931 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
932 IE_NAME = u'video.yahoo'
934 def _real_extract(self, url, new_video=True):
935 # Extract ID from URL
936 mobj = re.match(self._VALID_URL, url)
938 self._downloader.report_error(u'Invalid URL: %s' % url)
941 video_id = mobj.group(2)
942 video_extension = 'flv'
944 # Rewrite valid but non-extractable URLs as
945 # extractable English language /watch/ URLs
946 if re.match(self._VPAGE_URL, url) is None:
947 request = compat_urllib_request.Request(url)
949 webpage = compat_urllib_request.urlopen(request).read()
950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
951 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
954 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
956 self._downloader.report_error(u'Unable to extract id field')
958 yahoo_id = mobj.group(1)
960 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
962 self._downloader.report_error(u'Unable to extract vid field')
964 yahoo_vid = mobj.group(1)
966 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
967 return self._real_extract(url, new_video=False)
969 # Retrieve video webpage to extract further information
970 request = compat_urllib_request.Request(url)
972 self.report_download_webpage(video_id)
973 webpage = compat_urllib_request.urlopen(request).read()
974 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
975 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
978 # Extract uploader and title from webpage
979 self.report_extraction(video_id)
980 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
982 self._downloader.report_error(u'unable to extract video title')
984 video_title = mobj.group(1).decode('utf-8')
986 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
988 self._downloader.report_error(u'unable to extract video uploader')
990 video_uploader = mobj.group(1).decode('utf-8')
992 # Extract video thumbnail
993 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
995 self._downloader.report_error(u'unable to extract video thumbnail')
997 video_thumbnail = mobj.group(1).decode('utf-8')
999 # Extract video description
1000 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1002 self._downloader.report_error(u'unable to extract video description')
1004 video_description = mobj.group(1).decode('utf-8')
1005 if not video_description:
1006 video_description = 'No description available.'
1008 # Extract video height and width
1009 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video height')
1013 yv_video_height = mobj.group(1)
1015 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video width')
1019 yv_video_width = mobj.group(1)
1021 # Retrieve video playlist to extract media URL
1022 # I'm not completely sure what all these options are, but we
1023 # seem to need most of them, otherwise the server sends a 401.
1024 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1025 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1026 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1027 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1028 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1030 self.report_download_webpage(video_id)
1031 webpage = compat_urllib_request.urlopen(request).read()
1032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1033 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1036 # Extract media URL from playlist XML
1037 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1039 self._downloader.report_error(u'Unable to extract media URL')
1041 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1042 video_url = unescapeHTML(video_url)
1045 'id': video_id.decode('utf-8'),
1047 'uploader': video_uploader,
1048 'upload_date': None,
1049 'title': video_title,
1050 'ext': video_extension.decode('utf-8'),
1051 'thumbnail': video_thumbnail.decode('utf-8'),
1052 'description': video_description,
1056 class VimeoIE(InfoExtractor):
1057 """Information extractor for vimeo.com."""
# _VALID_URL accepts plain, www., player., group/album and
# play_redirect_hls Vimeo URLs; it captures the scheme in 'proto',
# the HLS-redirect marker in 'direct_link' and the numeric id in 'id'.
1059 # _VALID_URL matches Vimeo URLs
1060 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# Extract metadata and the final media URL for one Vimeo video.
# 'new_video' is kept for interface compatibility; it is not read below.
1063 def _real_extract(self, url, new_video=True):
1064 # Extract ID from URL
1065 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the guard testing mobj for None appears between these
# lines in the full file — only the error-reporting call is visible here.
1067 self._downloader.report_error(u'Invalid URL: %s' % url)
1070 video_id = mobj.group('id')
# Normalise the URL: default to https, and rewrite direct HLS-redirect
# links to the canonical page so the config JSON can be scraped.
1071 if not mobj.group('proto'):
1072 url = 'https://' + url
1073 if mobj.group('direct_link'):
1074 url = 'https://vimeo.com/' + video_id
1076 # Retrieve video webpage to extract further information
1077 request = compat_urllib_request.Request(url, None, std_headers)
1079 self.report_download_webpage(video_id)
1080 webpage_bytes = compat_urllib_request.urlopen(request).read()
1081 webpage = webpage_bytes.decode('utf-8')
1082 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1083 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1086 # Now we begin extracting as much information as we can from what we
1087 # retrieved. First we extract the information common to all extractors,
1088 # and latter we extract those that are Vimeo specific.
1089 self.report_extraction(video_id)
# The player configuration is embedded in the page as inline JSON;
# split on the literal markers rather than parsing the whole page.
1091 # Extract the config JSON
1093 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1094 config = json.loads(config)
# Embedding restrictions produce a distinct error page; detect it to
# give the user an actionable hint (--referer) instead of a parse error.
1096 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1097 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1099 self._downloader.report_error(u'unable to extract info section')
1103 video_title = config["video"]["title"]
1105 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1106 video_uploader = config["video"]["owner"]["name"]
1107 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1109 # Extract video thumbnail
1110 video_thumbnail = config["video"]["thumbnail"]
1112 # Extract video description
1113 video_description = get_element_by_attribute("itemprop", "description", webpage)
1114 if video_description: video_description = clean_html(video_description)
1115 else: video_description = u''
1117 # Extract upload date
# Converted from ISO "YYYY-MM-DDT..." to the YYYYMMDD form expected
# by the 'upload_date' field (see the InfoExtractor class docstring).
1118 video_upload_date = None
1119 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1120 if mobj is not None:
1121 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1123 # Vimeo specific: extract request signature and timestamp
# Both values must be echoed back in the play_redirect request below.
1124 sig = config['request']['signature']
1125 timestamp = config['request']['timestamp']
1127 # Vimeo specific: extract video codec and quality information
1128 # First consider quality, then codecs, then take everything
1129 # TODO bind to format param
# Codec preference order: h264/mp4 first, then vp8 and vp6 (flv).
1130 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1131 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality; 'other' falls back to the
# first quality string the config advertises for that codec.
1132 for codec_name, codec_extension in codecs:
1133 if codec_name in config["video"]["files"]:
1134 if 'hd' in config["video"]["files"][codec_name]:
1135 files['hd'].append((codec_name, codec_extension, 'hd'))
1136 elif 'sd' in config["video"]["files"][codec_name]:
1137 files['sd'].append((codec_name, codec_extension, 'sd'))
1139 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best bucket that is non-empty, preferring hd > sd > other.
1141 for quality in ('hd', 'sd', 'other'):
1142 if len(files[quality]) > 0:
1143 video_quality = files[quality][0][2]
1144 video_codec = files[quality][0][0]
1145 video_extension = files[quality][0][1]
1146 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1149 self._downloader.report_error(u'no known codec found')
# Build the redirecting play URL carrying the signature/timestamp.
1152 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1153 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary; fields follow the InfoExtractor contract above.
1158 'uploader': video_uploader,
1159 'uploader_id': video_uploader_id,
1160 'upload_date': video_upload_date,
1161 'title': video_title,
1162 'ext': video_extension,
1163 'thumbnail': video_thumbnail,
1164 'description': video_description,
1168 class ArteTvIE(InfoExtractor):
1169 """arte.tv information extractor."""
# Only the French/German video listings on videos.arte.tv are handled;
# _LIVE_URL distinguishes live-stream index pages from plus7 videos.
1171 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1172 _LIVE_URL = r'index-[0-9]+\.html$'
1174 IE_NAME = u'arte.tv'
# Download a page and return its raw body, reporting network errors
# through the downloader.
1176 def fetch_webpage(self, url):
1177 request = compat_urllib_request.Request(url)
1179 self.report_download_webpage(url)
1180 webpage = compat_urllib_request.urlopen(request).read()
1181 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1182 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1184 except ValueError as err:
1185 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch 'url', apply 'regex' with 'regexFlags', and map capture groups
# to dictionary keys. matchTuples is a list of (group_index, key,
# error_message); each missing group reports its own error.
1189 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1190 page = self.fetch_webpage(url)
1191 mobj = re.search(regex, page, regexFlags)
1195 self._downloader.report_error(u'Invalid URL: %s' % url)
1198 for (i, key, err) in matchTuples:
1199 if mobj.group(i) is None:
1200 self._downloader.report_error(err)
1203 info[key] = mobj.group(i)
# Resolve a live-stream page: find the videothek JS loader, then pull
# the rtmp path/player/url triple out of it and join path onto url.
# The language code is the 4th path component from the end of the URL.
1207 def extractLiveStream(self, url):
1208 video_lang = url.split('/')[-4]
1209 info = self.grep_webpage(
1211 r'src="(.*?/videothek_js.*?\.js)',
1214 (1, 'url', u'Invalid URL: %s' % url)
1217 http_host = url.split('/')[2]
1218 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1219 info = self.grep_webpage(
1221 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1222 '(http://.*?\.swf).*?' +
1226 (1, 'path', u'could not extract video path: %s' % url),
1227 (2, 'player', u'could not extract video player: %s' % url),
1228 (3, 'url', u'could not extract video url: %s' % url)
1231 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Resolve an arte+7 (catch-up) video: follow two levels of XML
# indirection (videoref file, then per-language <video> ref) before
# reading id/title/date and the hd-quality URL from the final XML.
1233 def extractPlus7Stream(self, url):
1234 video_lang = url.split('/')[-3]
1235 info = self.grep_webpage(
1237 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1240 (1, 'url', u'Invalid URL: %s' % url)
1243 next_url = compat_urllib_parse.unquote(info.get('url'))
1244 info = self.grep_webpage(
1246 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1249 (1, 'url', u'Could not find <video> tag: %s' % url)
1252 next_url = compat_urllib_parse.unquote(info.get('url'))
1254 info = self.grep_webpage(
1256 r'<video id="(.*?)".*?>.*?' +
1257 '<name>(.*?)</name>.*?' +
1258 '<dateVideo>(.*?)</dateVideo>.*?' +
1259 '<url quality="hd">(.*?)</url>',
1262 (1, 'id', u'could not extract video id: %s' % url),
1263 (2, 'title', u'could not extract video title: %s' % url),
1264 (3, 'date', u'could not extract video date: %s' % url),
1265 (4, 'url', u'could not extract video url: %s' % url)
# Result dictionary assembled from the final grep_webpage pass.
1270 'id': info.get('id'),
1271 'url': compat_urllib_parse.unquote(info.get('url')),
1272 'uploader': u'arte.tv',
1273 'upload_date': info.get('date'),
1274 'title': info.get('title').decode('utf-8'),
# Dispatch: live index pages go to extractLiveStream, everything else
# is treated as a plus7 (catch-up) video.
1280 def _real_extract(self, url):
1281 video_id = url.split('/')[-1]
1282 self.report_extraction(video_id)
1284 if re.search(self._LIVE_URL, video_id) is not None:
1285 self.extractLiveStream(url)
1288 info = self.extractPlus7Stream(url)
1293 class GenericIE(InfoExtractor):
1294 """Generic last-resort information extractor."""
1297 IE_NAME = u'generic'
# Warn that the generic fallback is being used (unless running tests),
# then delegate to the base-class progress report.
1299 def report_download_webpage(self, video_id):
1300 """Report webpage download."""
1301 if not self._downloader.params.get('test', False):
1302 self._downloader.report_warning(u'Falling back on generic information extractor.')
1303 super(GenericIE, self).report_download_webpage(video_id)
1305 def report_following_redirect(self, new_url):
1306 """Report information extraction."""
1307 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Probe the URL with a HEAD request to resolve shorteners/redirects;
# returns the final URL when it differs from the input.
1309 def _test_redirect(self, url):
1310 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass whose method is HEAD instead of GET.
1311 class HeadRequest(compat_urllib_request.Request):
1312 def get_method(self):
1315 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1317 Subclass the HTTPRedirectHandler to make it use our
1318 HeadRequest also on the redirected URL
# Re-issue the redirect target as a HEAD request, dropping the
# body-describing headers that no longer apply.
1320 def redirect_request(self, req, fp, code, msg, headers, newurl):
1321 if code in (301, 302, 303, 307):
1322 newurl = newurl.replace(' ', '%20')
1323 newheaders = dict((k,v) for k,v in req.headers.items()
1324 if k.lower() not in ("content-length", "content-type"))
1325 return HeadRequest(newurl,
1327 origin_req_host=req.get_origin_req_host(),
1330 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1332 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1334 Fallback to GET if HEAD is not allowed (405 HTTP error)
1336 def http_error_405(self, req, fp, code, msg, headers):
1340 newheaders = dict((k,v) for k,v in req.headers.items()
1341 if k.lower() not in ("content-length", "content-type"))
1342 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1344 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the
# HEAD probe (plain HTTP/HTTPS plus the two custom handlers above).
1348 opener = compat_urllib_request.OpenerDirector()
1349 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1350 HTTPMethodFallback, HEADRedirectHandler,
1351 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1352 opener.add_handler(handler())
1354 response = opener.open(HeadRequest(url))
1355 new_url = response.geturl()
1360 self.report_following_redirect(new_url)
# Last-resort extraction: follow redirects, then scrape the page for a
# direct media URL using a sequence of progressively looser regexes.
1363 def _real_extract(self, url):
1364 new_url = self._test_redirect(url)
1365 if new_url: return [self.url_result(new_url)]
1367 video_id = url.split('/')[-1]
1369 webpage = self._download_webpage(url, video_id)
1370 except ValueError as err:
1371 # since this is the last-resort InfoExtractor, if
1372 # this error is thrown, it'll be thrown here
1373 self._downloader.report_error(u'Invalid URL: %s' % url)
1376 self.report_extraction(video_id)
1377 # Start with something easy: JW Player in SWFObject
1378 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1380 # Broaden the search a little bit
1381 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1383 # Broaden the search a little bit: JWPlayer JS loader
1384 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1386 self._downloader.report_error(u'Invalid URL: %s' % url)
1389 # It's possible that one of the regexes
1390 # matched, but returned an empty group:
1391 if mobj.group(1) is None:
1392 self._downloader.report_error(u'Invalid URL: %s' % url)
1395 video_url = compat_urllib_parse.unquote(mobj.group(1))
1396 video_id = os.path.basename(video_url)
# Derive id/extension from the media file name in the matched URL.
1398 # here's a fun little line of code for you:
1399 video_extension = os.path.splitext(video_id)[1][1:]
1400 video_id = os.path.splitext(video_id)[0]
1402 # it's tempting to parse this further, but you would
1403 # have to take into account all the variations like
1404 # Video Title - Site Name
1405 # Site Name | Video Title
1406 # Video Title - Tagline | Site Name
1407 # and so on and so forth; it's just not practical
1408 mobj = re.search(r'<title>(.*)</title>', webpage)
1410 self._downloader.report_error(u'unable to extract title')
1412 video_title = mobj.group(1)
1414 # video uploader is domain name
1415 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but reports a failure
# to extract the uploader (domain name) — likely a copy/paste slip.
1417 self._downloader.report_error(u'unable to extract title')
1419 video_uploader = mobj.group(1)
# Result dictionary for the scraped direct media URL.
1424 'uploader': video_uploader,
1425 'upload_date': None,
1426 'title': video_title,
1427 'ext': video_extension,
1431 class YoutubeSearchIE(InfoExtractor):
1432 """Information Extractor for YouTube search queries."""
# Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs via
# the GData JSON API (50 results per page, capped at 1000 overall).
1433 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1434 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1435 _max_youtube_results = 1000
1436 IE_NAME = u'youtube:search'
1438 def report_download_page(self, query, pagenum):
1439 """Report attempt to download search page with given number."""
1440 query = query.decode(preferredencoding())
1441 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty / 'all' / integer N) and fetch that many
# results; integer parsing failures fall back to a single result.
1443 def _real_extract(self, query):
1444 mobj = re.match(self._VALID_URL, query)
1446 self._downloader.report_error(u'invalid search query "%s"' % query)
1449 prefix, query = query.split(':')
1451 query = query.encode('utf-8')
1453 return self._get_n_results(query, 1)
1454 elif prefix == 'all':
1455 self._get_n_results(query, self._max_youtube_results)
1460 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1462 elif n > self._max_youtube_results:
1463 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1464 n = self._max_youtube_results
1465 return self._get_n_results(query, n)
1466 except ValueError: # parsing prefix as integer fails
1467 return self._get_n_results(query, 1)
# Page through the API 50 results at a time until n ids are collected
# (or the API reports fewer total items), then wrap them as url results.
1469 def _get_n_results(self, query, n):
1470 """Get a specified number of results for a query"""
1476 while (50 * pagenum) < limit:
1477 self.report_download_page(query, pagenum+1)
1478 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1479 request = compat_urllib_request.Request(result_url)
1481 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1483 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1485 api_response = json.loads(data)['data']
1487 if not 'items' in api_response:
1488 self._downloader.report_error(u'[youtube] No video results')
1491 new_ids = list(video['id'] for video in api_response['items'])
1492 video_ids += new_ids
# Never ask for more than the API says exists.
1494 limit = min(n, api_response['totalItems'])
1497 if len(video_ids) > n:
1498 video_ids = video_ids[:n]
1499 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1503 class GoogleSearchIE(InfoExtractor):
1504 """Information Extractor for Google Video search queries."""
# Handles 'gvsearch:' pseudo-URLs by scraping the HTML result pages
# (no API): _VIDEO_INDICATOR finds docids, _MORE_PAGES_INDICATOR the
# "next page" link.
1505 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1506 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1507 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1508 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1509 _max_google_results = 1000
1510 IE_NAME = u'video.google:search'
1512 def report_download_page(self, query, pagenum):
1513 """Report attempt to download playlist page with given number."""
1514 query = query.decode(preferredencoding())
1515 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty / 'all' / integer N) and download that many
# results; mirrors YoutubeSearchIE._real_extract.
1517 def _real_extract(self, query):
1518 mobj = re.match(self._VALID_URL, query)
1520 self._downloader.report_error(u'invalid search query "%s"' % query)
1523 prefix, query = query.split(':')
1525 query = query.encode('utf-8')
1527 self._download_n_results(query, 1)
1529 elif prefix == 'all':
1530 self._download_n_results(query, self._max_google_results)
1536 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1538 elif n > self._max_google_results:
1539 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1540 n = self._max_google_results
1541 self._download_n_results(query, n)
1543 except ValueError: # parsing prefix as integer fails
1544 self._download_n_results(query, 1)
# Scrape result pages, collecting unique docids until n are found or
# no "next page" link remains, then hand each video to the downloader.
1547 def _download_n_results(self, query, n):
1548 """Downloads a specified number of results for a query"""
1554 self.report_download_page(query, pagenum)
1555 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1556 request = compat_urllib_request.Request(result_url)
1558 page = compat_urllib_request.urlopen(request).read()
1559 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1560 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1563 # Extract video identifiers
1564 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1565 video_id = mobj.group(1)
1566 if video_id not in video_ids:
1567 video_ids.append(video_id)
1568 if len(video_ids) == n:
1569 # Specified n videos reached
1570 for id in video_ids:
1571 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: flush whatever was collected.
1574 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1575 for id in video_ids:
1576 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1579 pagenum = pagenum + 1
1582 class YahooSearchIE(InfoExtractor):
1583 """Information Extractor for Yahoo! Video search queries."""
# Handles 'yvsearch:' pseudo-URLs by scraping HTML result pages;
# structure parallels GoogleSearchIE above.
1586 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1587 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1588 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1589 _MORE_PAGES_INDICATOR = r'\s*Next'
1590 _max_yahoo_results = 1000
1591 IE_NAME = u'video.yahoo:search'
1593 def report_download_page(self, query, pagenum):
1594 """Report attempt to download playlist page with given number."""
1595 query = query.decode(preferredencoding())
1596 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty / 'all' / integer N) and download that many
# results; mirrors the other search extractors.
1598 def _real_extract(self, query):
1599 mobj = re.match(self._VALID_URL, query)
1601 self._downloader.report_error(u'invalid search query "%s"' % query)
1604 prefix, query = query.split(':')
1606 query = query.encode('utf-8')
1608 self._download_n_results(query, 1)
1610 elif prefix == 'all':
1611 self._download_n_results(query, self._max_yahoo_results)
1617 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1619 elif n > self._max_yahoo_results:
1620 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1621 n = self._max_yahoo_results
1622 self._download_n_results(query, n)
1624 except ValueError: # parsing prefix as integer fails
1625 self._download_n_results(query, 1)
# Scrape result pages, de-duplicating via 'already_seen', until n ids
# are collected or no "Next" link remains; then queue each watch URL.
1628 def _download_n_results(self, query, n):
1629 """Downloads a specified number of results for a query"""
1632 already_seen = set()
1636 self.report_download_page(query, pagenum)
1637 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1638 request = compat_urllib_request.Request(result_url)
1640 page = compat_urllib_request.urlopen(request).read()
1641 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1642 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1645 # Extract video identifiers
1646 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1647 video_id = mobj.group(1)
1648 if video_id not in already_seen:
1649 video_ids.append(video_id)
1650 already_seen.add(video_id)
1651 if len(video_ids) == n:
1652 # Specified n videos reached
1653 for id in video_ids:
1654 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: flush whatever was collected.
1657 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1658 for id in video_ids:
1659 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1662 pagenum = pagenum + 1
1665 class YoutubePlaylistIE(InfoExtractor):
1666 """Information Extractor for YouTube playlists."""
# Verbose regex accepting playlist/course/artist/watch URLs with
# p=/a=/list= query parameters as well as bare PL/EC/UU playlist ids.
1668 _VALID_URL = r"""(?:
1673 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1674 \? (?:.*?&)*? (?:p|a|list)=
1677 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1680 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1682 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1684 IE_NAME = u'youtube:playlist'
# Overridden because _VALID_URL is a verbose-mode pattern and needs
# the re.VERBOSE flag that the base-class suitable() does not pass.
1687 def suitable(cls, url):
1688 """Receives a URL and returns True if suitable for this IE."""
1689 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1691 def report_download_page(self, playlist_id, pagenum):
1692 """Report attempt to download playlist page with given number."""
1693 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# Page through the GData playlist feed collecting (position, watch-url)
# pairs, sort by position, and return a single playlist result.
1695 def _real_extract(self, url):
1696 # Extract playlist id
1697 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1699 self._downloader.report_error(u'invalid url: %s' % url)
1702 # Download playlist videos from API
# The id comes from whichever alternative of the pattern matched.
1703 playlist_id = mobj.group(1) or mobj.group(2)
1708 self.report_download_page(playlist_id, page_num)
1710 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1712 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1713 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1714 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1718 response = json.loads(page)
1719 except ValueError as err:
1720 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1723 if 'feed' not in response:
1724 self._downloader.report_error(u'Got a malformed response from YouTube API')
# A feed without 'entry' means the playlist length was an exact
# multiple of _MAX_RESULTS and this page is past the end.
1726 if 'entry' not in response['feed']:
1727 # Number of videos is a multiple of self._MAX_RESULTS
1730 playlist_title = response['feed']['title']['$t']
# Keep the playlist position so the final list can be order-sorted;
# entries without 'content' (e.g. deleted videos) are skipped.
1732 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1733 for entry in response['feed']['entry']
1734 if 'content' in entry ]
1736 if len(response['feed']['entry']) < self._MAX_RESULTS:
1740 videos = [v[1] for v in sorted(videos)]
1742 url_results = [self.url_result(url, 'Youtube') for url in videos]
1743 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1746 class YoutubeChannelIE(InfoExtractor):
1747 """Information Extractor for YouTube channels."""
# First page comes from the regular channel HTML; subsequent pages use
# the channel_ajax JSON endpoint. _MORE_PAGES_INDICATOR is a CSS class
# that marks the presence of a "load more" control.
1749 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1750 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1751 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1752 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1753 IE_NAME = u'youtube:channel'
1755 def report_download_page(self, channel_id, pagenum):
1756 """Report attempt to download channel page with given number."""
1757 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Collect unique video ids from the /watch?v= links in an HTML page
# (or an HTML fragment from the ajax endpoint), preserving order.
1759 def extract_videos_from_page(self, page):
1761 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1762 if mobj.group(1) not in ids_in_page:
1763 ids_in_page.append(mobj.group(1))
# Walk the channel's video listing page by page and return one
# playlist result containing a watch URL per video found.
1766 def _real_extract(self, url):
1767 # Extract channel id
1768 mobj = re.match(self._VALID_URL, url)
1770 self._downloader.report_error(u'invalid url: %s' % url)
1773 # Download channel page
1774 channel_id = mobj.group(1)
1778 self.report_download_page(channel_id, pagenum)
1779 url = self._TEMPLATE_URL % (channel_id, pagenum)
1780 request = compat_urllib_request.Request(url)
1782 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1783 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1784 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1787 # Extract video identifiers
1788 ids_in_page = self.extract_videos_from_page(page)
1789 video_ids.extend(ids_in_page)
1791 # Download any subsequent channel pages using the json-based channel_ajax query
1792 if self._MORE_PAGES_INDICATOR in page:
1794 pagenum = pagenum + 1
1796 self.report_download_page(channel_id, pagenum)
1797 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1798 request = compat_urllib_request.Request(url)
1800 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1801 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1802 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The ajax endpoint returns JSON with the listing HTML embedded in
# 'content_html' and the pager markup in 'load_more_widget_html'.
1805 page = json.loads(page)
1807 ids_in_page = self.extract_videos_from_page(page['content_html'])
1808 video_ids.extend(ids_in_page)
1810 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1813 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1815 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1816 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1817 return [self.playlist_result(url_entries, channel_id)]
1820 class YoutubeUserIE(InfoExtractor):
1821 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs or the 'ytuser:NAME' shorthand;
# uploads are fetched through the GData API, 50 ids per request.
1823 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1824 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1825 _GDATA_PAGE_SIZE = 50
1826 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1827 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1828 IE_NAME = u'youtube:user'
1830 def report_download_page(self, username, start_index):
1831 """Report attempt to download user page."""
1832 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1833 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# Page through the user's uploads feed collecting video ids, and
# return one playlist result titled with the username.
1835 def _real_extract(self, url):
1837 mobj = re.match(self._VALID_URL, url)
1839 self._downloader.report_error(u'invalid url: %s' % url)
1842 username = mobj.group(1)
1844 # Download video ids using YouTube Data API. Result size per
1845 # query is limited (currently to 50 videos) so we need to query
1846 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1853 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1854 self.report_download_page(username, start_index)
1856 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1859 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1864 # Extract video identifiers
1867 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1868 if mobj.group(1) not in ids_in_page:
1869 ids_in_page.append(mobj.group(1))
1871 video_ids.extend(ids_in_page)
1873 # A little optimization - if current page is not
1874 # "full", ie. does not contain PAGE_SIZE video ids then
1875 # we can assume that this page is the last one - there
1876 # are no more ids on further pages - no need to query
1879 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1884 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1885 url_results = [self.url_result(url, 'Youtube') for url in urls]
1886 return [self.playlist_result(url_results, playlist_title = username)]
1889 class BlipTVUserIE(InfoExtractor):
1890 """Information Extractor for blip.tv users."""
# Accepts blip.tv/USERNAME URLs (no trailing path) or the
# 'bliptvuser:USERNAME' shorthand.
1892 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1894 IE_NAME = u'blip.tv:user'
1896 def report_download_page(self, username, pagenum):
1897 """Report attempt to download user page."""
1898 self.to_screen(u'user %s: Downloading video ids from page %d' %
1899 (username, pagenum))
# Resolve the user page to a numeric users_id, then page through the
# mobile episode-list endpoint collecting video paths.
1901 def _real_extract(self, url):
1903 mobj = re.match(self._VALID_URL, url)
1905 self._downloader.report_error(u'invalid url: %s' % url)
1908 username = mobj.group(1)
# The episode-list endpoint needs the numeric users_id, which is
# scraped from the data-users-id attribute of the profile page.
1910 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1912 request = compat_urllib_request.Request(url)
1915 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1916 mobj = re.search(r'data-users-id="([^"]+)"', page)
1917 page_base = page_base % mobj.group(1)
1918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1923 # Download video ids using BlipTV Ajax calls. Result size per
1924 # query is limited (currently to 12 videos) so we need to query
1925 # page by page until there are no video ids - it means we got
1932 self.report_download_page(username, pagenum)
1933 url = page_base + "&page=" + str(pagenum)
1934 request = compat_urllib_request.Request( url )
1936 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1937 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here where sibling extractors use
# compat_str(err) — inconsistent but visible behavior is the same
# message formatting on Python 3.
1938 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1941 # Extract video identifiers
1944 for mobj in re.finditer(r'href="/([^"]+)"', page):
1945 if mobj.group(1) not in ids_in_page:
1946 ids_in_page.append(unescapeHTML(mobj.group(1)))
1948 video_ids.extend(ids_in_page)
1950 # A little optimization - if current page is not
1951 # "full", ie. does not contain PAGE_SIZE video ids then
1952 # we can assume that this page is the last one - there
1953 # are no more ids on further pages - no need to query
1956 if len(ids_in_page) < self._PAGE_SIZE:
1961 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1962 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1963 return [self.playlist_result(url_entries, playlist_title = username)]
1966 class DepositFilesIE(InfoExtractor):
1967 """Information extractor for depositfiles.com"""
# Any locale path segment between the host and /files/ is accepted;
# the URL is rebuilt with the English locale before fetching.
1969 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
# Extract the direct download URL for a hosted file (not a video):
# POST the 'Free download' form, then scrape the fileshare form action.
1971 def _real_extract(self, url):
1972 file_id = url.split('/')[-1]
1973 # Rebuild url in english locale
1974 url = 'http://depositfiles.com/en/files/' + file_id
1976 # Retrieve file webpage with 'Free download' button pressed
# Sending this form field simulates pressing the free-download button.
1977 free_download_indication = { 'gateway_result' : '1' }
1978 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1980 self.report_download_webpage(file_id)
1981 webpage = compat_urllib_request.urlopen(request).read()
1982 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1983 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1986 # Search for the real file URL
1987 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1988 if (mobj is None) or (mobj.group(1) is None):
1989 # Try to figure out reason of the error.
# The site explains refusals (e.g. download limits) in a <strong>
# block starting with 'Attention'; surface that text to the user.
1990 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1991 if (mobj is not None) and (mobj.group(1) is not None):
1992 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1993 self._downloader.report_error(u'%s' % restriction_message)
1995 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1998 file_url = mobj.group(1)
1999 file_extension = os.path.splitext(file_url)[1][1:]
2001 # Search for file title
2002 mobj = re.search(r'<b title="(.*?)">', webpage)
2004 self._downloader.report_error(u'unable to extract title')
2006 file_title = mobj.group(1).decode('utf-8')
# Result dictionary; 'title'/'ext' fields follow the InfoExtractor
# contract even though the payload is a file rather than a video.
2009 'id': file_id.decode('utf-8'),
2010 'url': file_url.decode('utf-8'),
2012 'upload_date': None,
2013 'title': file_title,
2014 'ext': file_extension.decode('utf-8'),
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and 'if ... is None:' guard statements are elided from this excerpt.
2018 class FacebookIE(InfoExtractor):
2019 """Information Extractor for Facebook"""
2021 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2022 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name used to look up credentials in the user's ~/.netrc file.
2023 _NETRC_MACHINE = 'facebook'
2024 IE_NAME = u'facebook'
2026 def report_login(self):
2027 """Report attempt to log in."""
2028 self.to_screen(u'Logging in')
# Optional login: attempts authentication before extraction; failures only warn.
2030 def _real_initialize(self):
2031 if self._downloader is None:
2036 downloader_params = self._downloader.params
2038 # Attempt to use provided username and password or .netrc data
2039 if downloader_params.get('username', None) is not None:
2040 useremail = downloader_params['username']
2041 password = downloader_params['password']
2042 elif downloader_params.get('usenetrc', False):
2044 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2045 if info is not None:
2049 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2050 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and continue unauthenticated.
2051 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2054 if useremail is None:
2063 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2066 login_results = compat_urllib_request.urlopen(request).read()
# The login <form> reappearing in the response means authentication failed.
2067 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2068 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2070 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2071 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2074 def _real_extract(self, url):
2075 mobj = re.match(self._VALID_URL, url)
2077 self._downloader.report_error(u'invalid URL: %s' % url)
2079 video_id = mobj.group('ID')
# Canonicalize to the desktop video URL before downloading the page.
2081 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2082 webpage = self._download_webpage(url, video_id)
2084 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2085 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
# The JSON payload sits between two literal JS fragments in the page source;
# re.escape makes the anchors match verbatim.
2086 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2088 raise ExtractorError(u'Cannot parse data')
2089 data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON inside the outer JSON object.
2090 params_raw = compat_urllib_parse.unquote(data['params'])
2091 params = json.loads(params_raw)
2092 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD (fallback line elided between 2093/2095).
2093 video_url = video_data.get('hd_src')
2095 video_url = video_data['sd_src']
2097 raise ExtractorError(u'Cannot find video URL')
2098 video_duration = int(video_data['video_duration'])
2099 thumbnail = video_data['thumbnail_src']
2101 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2103 raise ExtractorError(u'Cannot find title in webpage')
2104 video_title = unescapeHTML(m.group(1))
2108 'title': video_title,
2111 'duration': video_duration,
2112 'thumbnail': thumbnail,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2117 class BlipTVIE(InfoExtractor):
2118 """Information extractor for blip.tv"""
2120 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to derive the file extension from the final media URL.
2121 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2122 IE_NAME = u'blip.tv'
2124 def report_direct_download(self, title):
2125 """Report information extraction."""
2126 self.to_screen(u'%s: Direct download detected' % title)
2128 def _real_extract(self, url):
2129 mobj = re.match(self._VALID_URL, url)
2131 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# rewrite to the canonical /a/a-<id> URL and recurse once.
2134 urlp = compat_urllib_parse_urlparse(url)
2135 if urlp.path.startswith('/play/'):
2136 request = compat_urllib_request.Request(url)
2137 response = compat_urllib_request.urlopen(request)
2138 redirecturl = response.geturl()
2139 rurlp = compat_urllib_parse_urlparse(redirecturl)
2140 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2141 url = 'http://blip.tv/a/a-' + file_id
2142 return self._real_extract(url)
# Ask the site for JSON metadata instead of scraping HTML.
2149 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2150 request = compat_urllib_request.Request(json_url)
# The iTunes User-Agent is required to get the expected response; it is also
# forwarded to the downloader below.
2151 request.add_header('User-Agent', 'iTunes/10.6.1')
2152 self.report_extraction(mobj.group(1))
2155 urlh = compat_urllib_request.urlopen(request)
2156 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The "JSON" URL served the media itself: derive title/ext from the URL basename.
2157 basename = url.split('/')[-1]
2158 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') on a str is Python 2 idiom; would fail on py3.
2159 title = title.decode('UTF-8')
2160 ext = ext.replace('.', '')
2161 self.report_direct_download(title)
2166 'upload_date': None,
2171 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2172 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2173 if info is None: # Regular URL
2175 json_code_bytes = urlh.read()
2176 json_code = json_code_bytes.decode('utf-8')
2177 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2178 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2182 json_data = json.loads(json_code)
# Responses sometimes nest the payload under a 'Post' key.
2183 if 'Post' in json_data:
2184 data = json_data['Post']
# Site datestamp format is m-d-yy H:M(am|pm); normalized to YYYYMMDD.
2188 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2189 video_url = data['media']['url']
2190 umobj = re.match(self._URL_EXT, video_url)
2192 raise ValueError('Can not determine filename extension')
2193 ext = umobj.group(1)
2196 'id': data['item_id'],
2198 'uploader': data['display_name'],
2199 'upload_date': upload_date,
2200 'title': data['title'],
2202 'format': data['media']['mimeType'],
2203 'thumbnail': data['thumbnailUrl'],
2204 'description': data['description'],
2205 'player_url': data['embedUrl'],
2206 'user_agent': 'iTunes/10.6.1',
# Missing/odd JSON fields surface here as ValueError/KeyError.
2208 except (ValueError,KeyError) as err:
2209 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# NOTE(review): embedded source-line numbers are non-contiguous — guard and
# 'return' statements are elided from this excerpt.
2215 class MyVideoIE(InfoExtractor):
2216 """Information Extractor for myvideo.de."""
2218 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2219 IE_NAME = u'myvideo'
2221 def _real_extract(self,url):
2222 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'self._download' looks like a typo for 'self._downloader'
# (every sibling extractor uses self._downloader); this line would raise
# AttributeError when an invalid URL is passed — confirm and fix upstream.
2224 self._download.report_error(u'invalid URL: %s' % url)
2227 video_id = mobj.group(1)
2230 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2231 webpage = self._download_webpage(webpage_url, video_id)
2233 self.report_extraction(video_id)
# The thumbnail link's base path doubles as the media URL prefix.
2234 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2237 self._downloader.report_error(u'unable to extract media URL')
# Media URL = thumbnail base path + /<video_id>.flv
2239 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2241 mobj = re.search('<title>([^<]+)</title>', webpage)
2243 self._downloader.report_error(u'unable to extract title')
2246 video_title = mobj.group(1)
2252 'upload_date': None,
2253 'title': video_title,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:',
# 'return', 'else:' and guard statements are elided from this excerpt.
2257 class ComedyCentralIE(InfoExtractor):
2258 """Information extractor for The Daily Show and Colbert Report """
2260 # urls can be abbreviations like :thedailyshow or :colbert
2261 # urls for episodes like:
2262 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2263 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2264 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex — must always be matched with re.VERBOSE (see suitable()).
2265 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2266 |(https?://)?(www\.)?
2267 (?P<showname>thedailyshow|colbertnation)\.com/
2268 (full-episodes/(?P<episode>.*)|
2270 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2271 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, ascending; the last entry is picked as "best" below.
2274 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2276 _video_extensions = {
2284 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2294 def suitable(cls, url):
2295 """Receives a URL and returns True if suitable for this IE."""
2296 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2298 def report_config_download(self, episode_id, media_id):
2299 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2301 def report_index_download(self, episode_id):
2302 self.to_screen(u'%s: Downloading show index' % episode_id)
# Prints format id, container extension and dimensions for --list-formats.
2304 def _print_formats(self, formats):
2305 print('Available formats:')
2307 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2310 def _real_extract(self, url):
2311 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2313 self._downloader.report_error(u'invalid URL: %s' % url)
# Shortname forms (:tds, :colbert, ...) expand to the show's full-episodes URL
# and the regex is re-applied to the expanded URL.
2316 if mobj.group('shortname'):
2317 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2318 url = u'http://www.thedailyshow.com/full-episodes/'
2320 url = u'http://www.colbertnation.com/full-episodes/'
2321 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2322 assert mobj is not None
# Clip URLs carry their title in different groups per show.
2324 if mobj.group('clip'):
2325 if mobj.group('showname') == 'thedailyshow':
2326 epTitle = mobj.group('tdstitle')
2328 epTitle = mobj.group('cntitle')
# No explicit episode means "download the newest" via the redirect below.
2331 dlNewest = not mobj.group('episode')
2333 epTitle = mobj.group('showname')
2335 epTitle = mobj.group('episode')
2337 req = compat_urllib_request.Request(url)
2338 self.report_extraction(epTitle)
2340 htmlHandle = compat_urllib_request.urlopen(req)
2341 html = htmlHandle.read()
2342 webpage = html.decode('utf-8')
2343 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2344 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Follow the server redirect to discover the concrete episode URL.
2347 url = htmlHandle.geturl()
2348 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2350 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2352 if mobj.group('episode') == '':
2353 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2355 epTitle = mobj.group('episode')
# Primary way to find the mtvnservices media URI embedded in the page.
2357 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2359 if len(mMovieParams) == 0:
2360 # The Colbert Report embeds the information in a without
2361 # a URL prefix; so extract the alternate reference
2362 # and then add the URL prefix manually.
2364 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2365 if len(altMovieParams) == 0:
2366 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2369 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2371 uri = mMovieParams[0][1]
# The MRSS index lists the episode's parts (one <item> per part).
2372 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2373 self.report_index_download(epTitle)
2375 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2376 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2377 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2382 idoc = xml.etree.ElementTree.fromstring(indexXml)
2383 itemEls = idoc.findall('.//item')
# Each <item> is one part of the episode; extract and download each in turn.
2384 for partNum,itemEl in enumerate(itemEls):
2385 mediaId = itemEl.findall('./guid')[0].text
2386 shortMediaId = mediaId.split(':')[-1]
2387 showId = mediaId.split(':')[-2].replace('.com', '')
2388 officialTitle = itemEl.findall('./title')[0].text
2389 officialDate = itemEl.findall('./pubDate')[0].text
2391 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2392 compat_urllib_parse.urlencode({'uri': mediaId}))
2393 configReq = compat_urllib_request.Request(configUrl)
2394 self.report_config_download(epTitle, shortMediaId)
2396 configXml = compat_urllib_request.urlopen(configReq).read()
2397 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2398 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the per-part config XML.
2401 cdoc = xml.etree.ElementTree.fromstring(configXml)
2403 for rendition in cdoc.findall('.//rendition'):
2404 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2408 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2411 if self._downloader.params.get('listformats', None):
2412 self._print_formats([i[0] for i in turls])
2415 # For now, just pick the highest bitrate
2416 format,rtmp_video_url = turls[-1]
2418 # Get the format arg from the arg stream
2419 req_format = self._downloader.params.get('format', None)
2421 # Select format if we can find one
2424 format, rtmp_video_url = f, v
# The rtmp URL is rewritten to a plain HTTP URL on the CDN (rtmpdump not needed).
2427 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2429 raise ExtractorError(u'Cannot transform RTMP url')
2430 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2431 video_url = base + m.group('finalid')
2433 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2438 'upload_date': officialDate,
2443 'description': officialTitle,
2445 results.append(info)
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2450 class EscapistIE(InfoExtractor):
2451 """Information extractor for The Escapist """
2453 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2454 IE_NAME = u'escapist'
2456 def report_config_download(self, showName):
2457 self.to_screen(u'%s: Downloading configuration' % showName)
2459 def _real_extract(self, url):
2460 mobj = re.match(self._VALID_URL, url)
2462 self._downloader.report_error(u'invalid URL: %s' % url)
2464 showName = mobj.group('showname')
2465 videoId = mobj.group('episode')
2467 self.report_extraction(showName)
2469 webPage = compat_urllib_request.urlopen(url)
2470 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type; default to utf-8.
2471 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2472 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2474 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): the four re.search results below are used unchecked — if any
# meta tag is missing, .group(1) raises AttributeError rather than a clean error.
2477 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2478 description = unescapeHTML(descMatch.group(1))
2479 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2480 imgUrl = unescapeHTML(imgMatch.group(1))
2481 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2482 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a 'config=' query parameter pointing at the real config.
2483 configUrlMatch = re.search('config=(.*)$', playerUrl)
2484 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2486 self.report_config_download(showName)
2488 configJSON = compat_urllib_request.urlopen(configUrl)
2489 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2490 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2495 # Technically, it's JavaScript, not JSON
# Crude JS -> JSON conversion: swap single quotes for double quotes before parsing.
2496 configJSON = configJSON.replace("'", '"')
2499 config = json.loads(configJSON)
2500 except (ValueError,) as err:
2501 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2504 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry.
2505 videoUrl = playlist[1]['url']
2510 'uploader': showName,
2511 'upload_date': None,
2514 'thumbnail': imgUrl,
2515 'description': description,
2516 'player_url': playerUrl,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2521 class CollegeHumorIE(InfoExtractor):
2522 """Information extractor for collegehumor.com"""
2525 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2526 IE_NAME = u'collegehumor'
2528 def report_manifest(self, video_id):
2529 """Report information extraction."""
2530 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2532 def _real_extract(self, url):
2533 mobj = re.match(self._VALID_URL, url)
2535 self._downloader.report_error(u'invalid URL: %s' % url)
2537 video_id = mobj.group('videoid')
2542 'upload_date': None,
2545 self.report_extraction(video_id)
# Step 1: fetch the per-video metadata XML (title, thumbnail, manifest URL).
2546 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2548 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2549 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2550 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2553 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2555 videoNode = mdoc.findall('./video')[0]
2556 info['description'] = videoNode.findall('./description')[0].text
2557 info['title'] = videoNode.findall('./caption')[0].text
2558 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2559 manifest_url = videoNode.findall('./file')[0].text
2561 self._downloader.report_error(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore is required by the server.
2564 manifest_url += '?hdcore=2.10.3'
2565 self.report_manifest(video_id)
2567 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2568 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2569 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2572 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# The f4m namespace must be spelled out for ElementTree lookups.
2574 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2575 node_id = media_node.attrib['url']
2576 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2577 except IndexError as err:
2578 self._downloader.report_error(u'Invalid manifest file')
# Step 3: build the direct segment URL from the manifest's host and node id.
2581 url_pr = compat_urllib_parse_urlparse(manifest_url)
2582 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): embedded source-line numbers are non-contiguous — guard and
# 'return' statements are elided from this excerpt.
2589 class XVideosIE(InfoExtractor):
2590 """Information extractor for xvideos.com"""
2592 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2593 IE_NAME = u'xvideos'
2595 def _real_extract(self, url):
2596 mobj = re.match(self._VALID_URL, url)
2598 self._downloader.report_error(u'invalid URL: %s' % url)
2600 video_id = mobj.group(1)
2602 webpage = self._download_webpage(url, video_id)
2604 self.report_extraction(video_id)
# The media URL is embedded URL-encoded in a 'flv_url=' page parameter.
2608 mobj = re.search(r'flv_url=(.+?)&', webpage)
2610 self._downloader.report_error(u'unable to extract video url')
2612 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title comes from the <title> tag, minus the site-name suffix.
2616 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2618 self._downloader.report_error(u'unable to extract video title')
2620 video_title = mobj.group(1)
2623 # Extract video thumbnail
2624 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2626 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2628 video_thumbnail = mobj.group(0)
2634 'upload_date': None,
2635 'title': video_title,
2637 'thumbnail': video_thumbnail,
2638 'description': None,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2644 class SoundcloudIE(InfoExtractor):
2645 """Information extractor for soundcloud.com
2646 To access the media, the uid of the song and a stream token
2647 must be extracted from the page source and the script must make
2648 a request to media.soundcloud.com/crossdomain.xml. Then
2649 the media can be grabbed by requesting from an url composed
2650 of the stream token and uid
2653 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2654 IE_NAME = u'soundcloud'
2656 def report_resolve(self, video_id):
2657 """Report information extraction."""
2658 self.to_screen(u'%s: Resolving id' % video_id)
2660 def _real_extract(self, url):
2661 mobj = re.match(self._VALID_URL, url)
2663 self._downloader.report_error(u'invalid URL: %s' % url)
2666 # extract uploader (which is in the url)
2667 uploader = mobj.group(1)
2668 # extract simple title (uploader + slug of song title)
2669 slug_title = mobj.group(2)
2670 simple_title = uploader + u'-' + slug_title
2672 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the human-readable URL to track metadata via the public API.
# NOTE(review): client_id is hard-coded; it appears twice in this class and
# twice more in SoundcloudSetIE — a shared constant would keep them in sync.
2674 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2675 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2676 request = compat_urllib_request.Request(resolv_url)
2678 info_json_bytes = compat_urllib_request.urlopen(request).read()
2679 info_json = info_json_bytes.decode('utf-8')
2680 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2681 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2684 info = json.loads(info_json)
2685 video_id = info['id']
2686 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second API call: per-track stream URLs keyed by format name.
2688 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2689 request = compat_urllib_request.Request(streams_url)
2691 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2692 stream_json = stream_json_bytes.decode('utf-8')
2693 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2694 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2697 streams = json.loads(stream_json)
2698 mediaURL = streams['http_mp3_128_url']
2703 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through as-is; other extractors normalize
# upload_date to YYYYMMDD — confirm downstream tolerance.
2704 'upload_date': info['created_at'],
2705 'title': info['title'],
2707 'description': info['description'],
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt. The per-track logic below
# closely mirrors SoundcloudIE._real_extract, applied to each track of a set.
2710 class SoundcloudSetIE(InfoExtractor):
2711 """Information extractor for soundcloud.com sets
2712 To access the media, the uid of the song and a stream token
2713 must be extracted from the page source and the script must make
2714 a request to media.soundcloud.com/crossdomain.xml. Then
2715 the media can be grabbed by requesting from an url composed
2716 of the stream token and uid
2719 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2720 IE_NAME = u'soundcloud:set'
2722 def report_resolve(self, video_id):
2723 """Report information extraction."""
2724 self.to_screen(u'%s: Resolving id' % video_id)
2726 def _real_extract(self, url):
2727 mobj = re.match(self._VALID_URL, url)
2729 self._downloader.report_error(u'invalid URL: %s' % url)
2732 # extract uploader (which is in the url)
2733 uploader = mobj.group(1)
2734 # extract simple title (uploader + slug of song title)
2735 slug_title = mobj.group(2)
2736 simple_title = uploader + u'-' + slug_title
2738 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# Resolve the set URL to its metadata (including the track list) via the API.
2740 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2741 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2742 request = compat_urllib_request.Request(resolv_url)
2744 info_json_bytes = compat_urllib_request.urlopen(request).read()
2745 info_json = info_json_bytes.decode('utf-8')
2746 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2747 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
# API-level errors come back inside the JSON payload rather than as HTTP errors.
2751 info = json.loads(info_json)
2752 if 'errors' in info:
2753 for err in info['errors']:
2754 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams lookup per track in the set.
2757 for track in info['tracks']:
2758 video_id = track['id']
2759 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2761 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2762 request = compat_urllib_request.Request(streams_url)
2764 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2765 stream_json = stream_json_bytes.decode('utf-8')
2766 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2767 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2770 streams = json.loads(stream_json)
2771 mediaURL = streams['http_mp3_128_url']
2776 'uploader': track['user']['username'],
2777 'upload_date': track['created_at'],
2778 'title': track['title'],
2780 'description': track['description'],
# NOTE(review): embedded source-line numbers are non-contiguous — guard and
# 'return' statements are elided from this excerpt.
2785 class InfoQIE(InfoExtractor):
2786 """Information extractor for infoq.com"""
2787 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2789 def _real_extract(self, url):
2790 mobj = re.match(self._VALID_URL, url)
2792 self._downloader.report_error(u'invalid URL: %s' % url)
# No numeric id in the URL scheme, so the URL itself serves as the id here.
2795 webpage = self._download_webpage(url, video_id=url)
2796 self.report_extraction(url)
# The real media path is base64-encoded in a JS variable on the page.
2799 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2801 self._downloader.report_error(u'unable to extract video url')
2803 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2804 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2807 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2809 self._downloader.report_error(u'unable to extract video title')
2811 video_title = mobj.group(1)
2813 # Extract description
# Description is optional; keep the placeholder when the meta tag is absent.
2814 video_description = u'No description available.'
2815 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2816 if mobj is not None:
2817 video_description = mobj.group(1)
# The final id/extension come from the media filename inside the decoded path.
2819 video_filename = video_url.split('/')[-1]
2820 video_id, extension = video_filename.split('.')
2826 'upload_date': None,
2827 'title': video_title,
2828 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2830 'description': video_description,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2835 class MixcloudIE(InfoExtractor):
2836 """Information extractor for www.mixcloud.com"""
# Marked broken: skipped by tests and flagged to users (see class docs in HEAD).
2838 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2839 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2840 IE_NAME = u'mixcloud'
2842 def report_download_json(self, file_id):
2843 """Report JSON download."""
2844 self.to_screen(u'Downloading json')
2846 def get_urls(self, jsonData, fmt, bitrate='best'):
2847 """Get urls from 'audio_formats' section in json"""
2850 bitrate_list = jsonData[fmt]
2851 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2852 bitrate = max(bitrate_list) # select highest
2854 url_list = jsonData[fmt][bitrate]
# Some entries are plain lists (no per-bitrate dict); indexing by bitrate
# then raises TypeError, handled here as "no bitrate info".
2855 except TypeError: # we have no bitrate info.
2856 url_list = jsonData[fmt]
2859 def check_urls(self, url_list):
2860 """Returns 1st active url from list"""
# Probes each candidate with a request; network failures skip to the next URL.
2861 for url in url_list:
2863 compat_urllib_request.urlopen(url)
2865 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Prints format/bitrate/extension triples for --list-formats.
2870 def _print_formats(self, formats):
2871 print('Available formats:')
2872 for fmt in formats.keys():
2873 for b in formats[fmt]:
2875 ext = formats[fmt][b][0]
2876 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2877 except TypeError: # we have no bitrate info
2878 ext = formats[fmt][0]
2879 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2882 def _real_extract(self, url):
2883 mobj = re.match(self._VALID_URL, url)
2885 self._downloader.report_error(u'invalid URL: %s' % url)
2887 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex-group str values is Python 2 idiom;
# would raise AttributeError on Python 3 (consistent with _WORKING = False).
2888 uploader = mobj.group(1).decode('utf-8')
2889 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2891 # construct API request
2892 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2893 # retrieve .json file with links to files
2894 request = compat_urllib_request.Request(file_url)
2896 self.report_download_json(file_url)
2897 jsonData = compat_urllib_request.urlopen(request).read()
2898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2899 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2903 json_data = json.loads(jsonData)
2904 player_url = json_data['player_swf_url']
2905 formats = dict(json_data['audio_formats'])
2907 req_format = self._downloader.params.get('format', None)
2910 if self._downloader.params.get('listformats', None):
2911 self._print_formats(formats)
# 'best': take the first format whose candidate URL list yields a live URL.
2914 if req_format is None or req_format == 'best':
2915 for format_param in formats.keys():
2916 url_list = self.get_urls(formats, format_param)
2918 file_url = self.check_urls(url_list)
2919 if file_url is not None:
2922 if req_format not in formats:
2923 self._downloader.report_error(u'format is not available')
2926 url_list = self.get_urls(formats, req_format)
2927 file_url = self.check_urls(url_list)
2928 format_param = req_format
2931 'id': file_id.decode('utf-8'),
2932 'url': file_url.decode('utf-8'),
2933 'uploader': uploader.decode('utf-8'),
2934 'upload_date': None,
2935 'title': json_data['name'],
2936 'ext': file_url.split('.')[-1].decode('utf-8'),
2937 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2938 'thumbnail': json_data['thumbnail_url'],
2939 'description': json_data['description'],
2940 'player_url': player_url.decode('utf-8'),
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return',
# 'else:' and guard statements are elided from this excerpt.
2943 class StanfordOpenClassroomIE(InfoExtractor):
2944 """Information extractor for Stanford's Open ClassRoom"""
# Matches three URL shapes: a specific video, a course page, or the site root;
# the optional 'course'/'video' groups drive the three branches below.
2946 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2947 IE_NAME = u'stanfordoc'
2949 def _real_extract(self, url):
2950 mobj = re.match(self._VALID_URL, url)
2952 raise ExtractorError(u'Invalid URL: %s' % url)
2954 if mobj.group('course') and mobj.group('video'): # A specific video
2955 course = mobj.group('course')
2956 video = mobj.group('video')
2958 'id': course + '_' + video,
2960 'upload_date': None,
2963 self.report_extraction(info['id'])
# Per-video metadata lives in <video>.xml next to the course's videos folder.
2964 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2965 xmlUrl = baseUrl + video + '.xml'
2967 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2968 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2969 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2971 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2973 info['title'] = mdoc.findall('./title')[0].text
2974 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2976 self._downloader.report_error(u'Invalid metadata XML file')
2978 info['ext'] = info['url'].rpartition('.')[2]
2980 elif mobj.group('course'): # A course page
2981 course = mobj.group('course')
2986 'upload_date': None,
2989 coursepage = self._download_webpage(url, info['id'],
2990 note='Downloading course info page',
2991 errnote='Unable to download course info page')
2993 m = re.search('<h1>([^<]+)</h1>', coursepage)
2995 info['title'] = unescapeHTML(m.group(1))
2997 info['title'] = info['id']
2999 m = re.search('<description>([^<]+)</description>', coursepage)
3001 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links and recurse into each via self.extract().
3003 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3006 'type': 'reference',
3007 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3011 for entry in info['list']:
3012 assert entry['type'] == 'reference'
3013 results += self.extract(entry['url'])
# Root page branch: enumerate all courses and recurse into each CoursePage.
3017 'id': 'Stanford OpenClassroom',
3020 'upload_date': None,
3023 self.report_download_webpage(info['id'])
3024 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3026 rootpage = compat_urllib_request.urlopen(rootURL).read()
3027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3028 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3031 info['title'] = info['id']
3033 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3036 'type': 'reference',
3037 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3042 for entry in info['list']:
3043 assert entry['type'] == 'reference'
3044 results += self.extract(entry['url'])
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3047 class MTVIE(InfoExtractor):
3048     """Information extractor for MTV.com"""
3050 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3053 def _real_extract(self, url):
3054 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3056 self._downloader.report_error(u'invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http:// when absent.
3058 if not mobj.group('proto'):
3059 url = 'http://' + url
3060 video_id = mobj.group('videoid')
3062 webpage = self._download_webpage(url, video_id)
# Song name, performer and playlist URI are read from <meta> tags on the page.
3064 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3066 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode() on a str here looks Python-2-only — confirm target runtime.
3068 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3069 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3071 self._downloader.report_error(u'unable to extract performer')
3073 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3074 video_title = performer + ' - ' + song_name
3076 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3078 self._downloader.report_error(u'unable to mtvn_uri')
3080 mtvn_uri = mobj.group(1)
3082 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3084 self._downloader.report_error(u'unable to extract content id')
3086 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing available renditions.
3088 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3089 self.report_extraction(video_id)
3090 request = compat_urllib_request.Request(videogen_url)
3092 metadataXml = compat_urllib_request.urlopen(request).read()
3093 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3094 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3097 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3098 renditions = mdoc.findall('.//rendition')
3100 # For now, always pick the highest quality.
3101 rendition = renditions[-1]
# Format id is derived from MIME subtype + WxH + bitrate of the rendition.
3104 _,_,ext = rendition.attrib['type'].partition('/')
3105 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3106 video_url = rendition.find('./src').text
3108 self._downloader.report_error('Invalid rendition field.')
3114 'uploader': performer,
3115 'upload_date': None,
3116 'title': video_title,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3124 class YoukuIE(InfoExtractor):
3125 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Builds a session id string from current ms timestamp + two random numbers.
# NOTE(review): the `def _gen_sid(self):` header appears elided from this listing.
3128 nowTime = int(time.time() * 1000)
3129 random1 = random.randint(1000,1998)
3130 random2 = random.randint(1000,9999)
3132 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic Fisher-Yates-like shuffle of the alphabet, seeded by `seed`;
# used to decode Youku's obfuscated file ids.
3134 def _get_file_ID_mix_string(self, seed):
3136 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3138 for i in range(len(source)):
# Linear-congruential step; the statement order here is load-bearing.
3139 seed = (seed * 211 + 30031 ) % 65536
3140 index = math.floor(seed / 65536 * len(source) )
3141 mixed.append(source[int(index)])
3142 source.remove(source[int(index)])
3143 #return ''.join(mixed)
# Maps each '*'-separated numeric token of fileId through the mixed alphabet.
3146 def _get_file_id(self, fileId, seed):
3147 mixed = self._get_file_ID_mix_string(seed)
3148 ids = fileId.split('*')
3152 realId.append(mixed[int(ch)])
3153 return ''.join(realId)
3155 def _real_extract(self, url):
3156 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3158 self._downloader.report_error(u'invalid URL: %s' % url)
3160 video_id = mobj.group('ID')
# getPlayList returns JSON with title, seed, stream file ids and segment keys.
3162 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3164 request = compat_urllib_request.Request(info_url, None, std_headers)
3166 self.report_download_webpage(video_id)
3167 jsondata = compat_urllib_request.urlopen(request).read()
3168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3169 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3172 self.report_extraction(video_id)
3174 jsonstr = jsondata.decode('utf-8')
3175 config = json.loads(jsonstr)
3177 video_title = config['data'][0]['title']
3178 seed = config['data'][0]['seed']
# Resolve the requested --format against what the stream listing offers.
3180 format = self._downloader.params.get('format', None)
3181 supported_format = list(config['data'][0]['streamfileids'].keys())
3183 if format is None or format == 'best':
3184 if 'hd2' in supported_format:
3189 elif format == 'worst':
3197 fileid = config['data'][0]['streamfileids'][format]
3198 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3199 except (UnicodeDecodeError, ValueError, KeyError):
3200 self._downloader.report_error(u'unable to extract info section')
3204 sid = self._gen_sid()
3205 fileid = self._get_file_id(fileid, seed)
3207 #column 8,9 of fileid represent the segment number
3208 #fileid[7:9] should be changed
# One downloadable info dict per segment; segment index is hex-encoded into
# both the fileid and the getFlvPath URL.
3209 for index, key in enumerate(keys):
3211 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3212 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3215 'id': '%s_part%02d' % (video_id, index),
3216 'url': download_url,
3218 'upload_date': None,
3219 'title': video_title,
3222 files_info.append(info)
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3227 class XNXXIE(InfoExtractor):
3228     """Information extractor for xnxx.com"""
3230 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# All three fields are scraped with simple regexes over the raw page source.
3232 VIDEO_URL_RE = r'flv_url=(.*?)&'
3233 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3234 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3236 def _real_extract(self, url):
3237 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3239 self._downloader.report_error(u'invalid URL: %s' % url)
3241 video_id = mobj.group(1)
3243 self.report_download_webpage(video_id)
3245 # Get webpage content
3247 webpage_bytes = compat_urllib_request.urlopen(url).read()
3248 webpage = webpage_bytes.decode('utf-8')
3249 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3250 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page; unquote it to get the real URL.
3253 result = re.search(self.VIDEO_URL_RE, webpage)
3255 self._downloader.report_error(u'unable to extract video url')
3257 video_url = compat_urllib_parse.unquote(result.group(1))
3259 result = re.search(self.VIDEO_TITLE_RE, webpage)
3261 self._downloader.report_error(u'unable to extract video title')
3263 video_title = result.group(1)
3265 result = re.search(self.VIDEO_THUMB_RE, webpage)
3267 self._downloader.report_error(u'unable to extract video thumbnail')
3269 video_thumbnail = result.group(1)
3275 'upload_date': None,
3276 'title': video_title,
3278 'thumbnail': video_thumbnail,
3279 'description': None,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3283 class GooglePlusIE(InfoExtractor):
3284     """Information extractor for plus.google.com."""
3286 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3287 IE_NAME = u'plus.google'
# The report_* helpers below only emit progress messages to the screen.
3289 def report_extract_entry(self, url):
3290     """Report downloading extry"""
3291 self.to_screen(u'Downloading entry: %s' % url)
3293 def report_date(self, upload_date):
3294     """Report downloading extry"""
3295 self.to_screen(u'Entry date: %s' % upload_date)
3297 def report_uploader(self, uploader):
3298     """Report downloading extry"""
3299 self.to_screen(u'Uploader: %s' % uploader)
3301 def report_title(self, video_title):
3302     """Report downloading extry"""
3303 self.to_screen(u'Title: %s' % video_title)
3305 def report_extract_vid_page(self, video_page):
3306     """Report information extraction."""
3307 self.to_screen(u'Extracting video page: %s' % video_page)
3309 def _real_extract(self, url):
3310 # Extract id from URL
3311 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3313 self._downloader.report_error(u'Invalid URL: %s' % url)
3316 post_url = mobj.group(0)
3317 video_id = mobj.group(1)
3319 video_extension = 'flv'
3321 # Step 1, Retrieve post webpage to extract further information
3322 self.report_extract_entry(post_url)
3323 request = compat_urllib_request.Request(post_url)
3325 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3327 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3330 # Extract update date
3332 pattern = 'title="Timestamp">(.*?)</a>'
3333 mobj = re.search(pattern, webpage)
3335 upload_date = mobj.group(1)
3336 # Convert timestring to a format suitable for filename
3337 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3338 upload_date = upload_date.strftime('%Y%m%d')
3339 self.report_date(upload_date)
3343 pattern = r'rel\="author".*?>(.*?)</a>'
3344 mobj = re.search(pattern, webpage)
3346 uploader = mobj.group(1)
3347 self.report_uploader(uploader)
3350 # Get the first line for title
3352 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3353 mobj = re.search(pattern, webpage)
3355 video_title = mobj.group(1)
3356 self.report_title(video_title)
3358 # Step 2, Stimulate clicking the image box to launch video
3359 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3360 mobj = re.search(pattern, webpage)
3362 self._downloader.report_error(u'unable to extract video page URL')
3364 video_page = mobj.group(1)
3365 request = compat_urllib_request.Request(video_page)
3367 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3368 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3369 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3371 self.report_extract_vid_page(video_page)
3374 # Extract video links on video page
3375     """Extract video links of all sizes"""
# findall yields (resolution, url) tuples; sorting puts highest resolution last.
3376 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3377 mobj = re.findall(pattern, webpage)
3379 self._downloader.report_error(u'unable to extract video links')
3381 # Sort in resolution
3382 links = sorted(mobj)
3384 # Choose the lowest of the sort, i.e. highest resolution
3385 video_url = links[-1]
3386 # Only get the url. The resolution part in the tuple has no use anymore
3387 video_url = video_url[-1]
3388 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Py3 the AttributeError path re-encodes then decodes.
3390 video_url = video_url.decode("unicode_escape")
3391 except AttributeError: # Python 3
3392 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3398 'uploader': uploader,
3399 'upload_date': upload_date,
3400 'title': video_title,
3401 'ext': video_extension,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3404 class NBAIE(InfoExtractor):
3405 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3408 def _real_extract(self, url):
3409 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3411 self._downloader.report_error(u'invalid URL: %s' % url)
3414 video_id = mobj.group(1)
# Normalize directory-style URLs: drop a trailing /index.html from the id.
3415 if video_id.endswith('/index.html'):
3416 video_id = video_id[:-len('/index.html')]
3418 webpage = self._download_webpage(url, video_id)
# Direct CDN URL is derived from the path id; no URL appears in the page itself.
3420 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small scraping helper: first regex group, HTML-unescaped, or `default`.
3421 def _findProp(rexp, default=None):
3422 m = re.search(rexp, webpage)
3424 return unescapeHTML(m.group(1))
3428 shortened_video_id = video_id.rpartition('/')[2]
3429 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3431 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm.
3435 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3436 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3440 class JustinTVIE(InfoExtractor):
3441     """Information extractor for justin.tv and twitch.tv"""
3442 # TODO: One broadcast may be split into multiple videos. The key
3443 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3444 # starts at 1 and increases. Can we treat all parts as one video?
3446 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3447 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# Page size used when paging through a channel's archive via the API.
3448 _JUSTIN_PAGE_LIMIT = 100
3449 IE_NAME = u'justin.tv'
3451 def report_download_page(self, channel, offset):
3452     """Report attempt to download a single page of videos."""
3453 self.to_screen(u'%s: Downloading video information from %d to %d' %
3454 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3456 # Return count of items, list of *valid* items
3457 def _parse_page(self, url):
3459 urlh = compat_urllib_request.urlopen(url)
3460 webpage_bytes = urlh.read()
3461 webpage = webpage_bytes.decode('utf-8', 'ignore')
3462 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3463 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3466 response = json.loads(webpage)
# API errors come back as a dict with an 'error' key instead of a list.
3467 if type(response) != list:
3468 error_text = response.get('error', 'unknown error')
3469 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3472 for clip in response:
3473 video_url = clip['video_file_url']
3475 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip the dashes from YYYY-MM-DD to get YYYYMMDD.
3476 video_date = re.sub('-', '', clip['start_time'][:10])
3477 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3478 video_id = clip['id']
3479 video_title = clip.get('title', video_id)
3483 'title': video_title,
3484 'uploader': clip.get('channel_name', video_uploader_id),
3485 'uploader_id': video_uploader_id,
3486 'upload_date': video_date,
3487 'ext': video_extension,
# Returns total item count plus the parsed info dicts for this page.
3489 return (len(response), info)
3491 def _real_extract(self, url):
3492 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3494 self._downloader.report_error(u'invalid URL: %s' % url)
3497 api = 'http://api.justin.tv'
3498 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means only the channel group matched (no /b/<id> part).
3500 if mobj.lastindex == 1:
3502 api += '/channel/archives/%s.json'
3504 api += '/broadcast/by_archive/%s.json'
3505 api = api % (video_id,)
3507 self.report_extraction(video_id)
3511 limit = self._JUSTIN_PAGE_LIMIT
3514 self.report_download_page(video_id, offset)
3515 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3516 page_count, page_info = self._parse_page(page_url)
3517 info.extend(page_info)
# A short page signals the end of the archive when paging.
3518 if not paged or page_count != limit:
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3523 class FunnyOrDieIE(InfoExtractor):
3524 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3526 def _real_extract(self, url):
3527 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3529 self._downloader.report_error(u'invalid URL: %s' % url)
3532 video_id = mobj.group('id')
3533 webpage = self._download_webpage(url, video_id)
# Video URL is the second <source> inside the <video> element.
3535 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3537 self._downloader.report_error(u'unable to find video information')
3538 video_url = unescapeHTML(m.group('url'))
# Title: prefer the player h1, fall back to the page <title>.
3540 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3542 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3544 self._downloader.report_error(u'Cannot find video title')
3545 title = clean_html(m.group('title'))
3547 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3549 desc = unescapeHTML(m.group('desc'))
3558 'description': desc,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3562 class SteamIE(InfoExtractor):
3563 _VALID_URL = r"""http://store.steampowered.com/
3564 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3566 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3570 def suitable(cls, url):
3571     """Receives a URL and returns True if suitable for this IE."""
3572 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3574 def _real_extract(self, url):
3575 m = re.match(self._VALID_URL, url, re.VERBOSE)
3576 gameID = m.group('gameID')
# The agecheck URL with a fixed 1970 birthdate bypasses Steam's age gate.
3577 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3578 self.report_age_confirmation()
3579 webpage = self._download_webpage(videourl, gameID)
3580 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
# Three parallel scans: movie entries, their titles, and their thumbnails,
# zipped together positionally below.
3582 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3583 mweb = re.finditer(urlRE, webpage)
3584 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3585 titles = re.finditer(namesRE, webpage)
3586 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3587 thumbs = re.finditer(thumbsRE, webpage)
3589 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3590 video_id = vid.group('videoID')
3591 title = vtitle.group('videoName')
3592 video_url = vid.group('videoURL')
3593 video_thumb = thumb.group('thumbnail')
3595 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3600 'title': unescapeHTML(title),
3601 'thumbnail': video_thumb
# All per-movie entries are returned as a single playlist for the game.
3604 return [self.playlist_result(videos, gameID, game_title)]
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3606 class UstreamIE(InfoExtractor):
3607 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3608 IE_NAME = u'ustream'
3610 def _real_extract(self, url):
3611 m = re.match(self._VALID_URL, url)
3612 video_id = m.group('videoID')
# Direct CDN URL is built from the id; the page is only scraped for metadata.
3613 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3614 webpage = self._download_webpage(url, video_id)
3615 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3616 title = m.group('title')
3617 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3618 uploader = m.group('uploader')
3624 'uploader': uploader
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3628 class WorldStarHipHopIE(InfoExtractor):
3629 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3630 IE_NAME = u'WorldStarHipHop'
3632 def _real_extract(self, url):
# Matches a direct hw-videos CDN link ending in mp4 or flv within the page.
3633 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3635 webpage_src = compat_urllib_request.urlopen(url).read()
3636 webpage_src = webpage_src.decode('utf-8')
3638 mobj = re.search(_src_url, webpage_src)
3640 m = re.match(self._VALID_URL, url)
3641 video_id = m.group('id')
3643 if mobj is not None:
3644 video_url = mobj.group()
# Extension branch: mp4 vs flv (the assignments themselves are elided here).
3645 if 'mp4' in video_url:
3650 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3653 _title = r"""<title>(.*)</title>"""
3655 mobj = re.search(_title, webpage_src)
3657 if mobj is not None:
3658 title = mobj.group(1)
# Fallback title when the page has no usable <title>.
3660 title = 'World Start Hip Hop - %s' % time.ctime()
3662 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3663 mobj = re.search(_thumbnail, webpage_src)
3665 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3666 if mobj is not None:
3667 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3669 _title = r"""candytitles.*>(.*)</span>"""
3670 mobj = re.search(_title, webpage_src)
3671 if mobj is not None:
3672 title = mobj.group(1)
3679 'thumbnail' : thumbnail,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3684 class RBMARadioIE(InfoExtractor):
3685 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3687 def _real_extract(self, url):
3688 m = re.match(self._VALID_URL, url)
3689 video_id = m.group('videoID')
3691 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as a JSON blob in an inline <script> tag.
3692 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3694 raise ExtractorError(u'Cannot find metadata')
3695 json_data = m.group(1)
3698 data = json.loads(json_data)
3699 except ValueError as e:
3700 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force a 256 kbps constant-bitrate variant of the Akamai stream URL.
3702 video_url = data['akamai_url'] + '&cbr=256'
3703 url_parts = compat_urllib_parse_urlparse(video_url)
3704 video_ext = url_parts.path.rpartition('.')[2]
# Remaining fields are optional in the JSON; .get() tolerates their absence.
3709 'title': data['title'],
3710 'description': data.get('teaser_text'),
3711 'location': data.get('country_of_origin'),
3712 'uploader': data.get('host', {}).get('name'),
3713 'uploader_id': data.get('host', {}).get('slug'),
3714 'thumbnail': data.get('image', {}).get('large_url_2x'),
3715 'duration': data.get('duration'),
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3720 class YouPornIE(InfoExtractor):
3721     """Information extractor for youporn.com."""
3722 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Used by --list-formats: prints ext/format for every discovered variant.
3724 def _print_formats(self, formats):
3725     """Print all available formats"""
3726 print(u'Available formats:')
3727 print(u'ext\t\tformat')
3728 print(u'---------------------------------')
3729 for format in formats:
3730 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Selects the single entry whose 'format' equals the requested one.
3732 def _specific(self, req_format, formats):
3734 if(x["format"]==req_format):
3738 def _real_extract(self, url):
3739 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3741 self._downloader.report_error(u'invalid URL: %s' % url)
3744 video_id = mobj.group('videoid')
# age_verified cookie bypasses the site's age gate.
3746 req = compat_urllib_request.Request(url)
3747 req.add_header('Cookie', 'age_verified=1')
3748 webpage = self._download_webpage(req, video_id)
3750 # Get the video title
3751 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3753 raise ExtractorError(u'Unable to extract video title')
3754 video_title = result.group('title').strip()
3756 # Get the video date
3757 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3759 self._downloader.report_warning(u'unable to extract video date')
3762 upload_date = result.group('date').strip()
3764 # Get the video uploader
3765 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3767 self._downloader.report_warning(u'unable to extract uploader')
3768 video_uploader = None
3770 video_uploader = result.group('uploader').strip()
3771 video_uploader = clean_html( video_uploader )
3773 # Get all of the formats available
3774 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3775 result = re.search(DOWNLOAD_LIST_RE, webpage)
3777 raise ExtractorError(u'Unable to extract download list')
3778 download_list_html = result.group('download_list').strip()
3780 # Get all of the links from the page
3781 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3782 links = re.findall(LINK_RE, download_list_html)
3783 if(len(links) == 0):
3784 raise ExtractorError(u'ERROR: no known formats available for video')
3786 self.to_screen(u'Links found: %d' % len(links))
3791 # A link looks like this:
3792 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3793 # A path looks like this:
3794 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id (e.g. "480p-370k") is parsed out of the URL path's 5th segment.
3795 video_url = unescapeHTML( link )
3796 path = compat_urllib_parse_urlparse( video_url ).path
3797 extension = os.path.splitext( path )[1][1:]
3798 format = path.split('/')[4].split('_')[:2]
3801 format = "-".join( format )
3802 title = u'%s-%s-%s' % (video_title, size, bitrate)
3807 'uploader': video_uploader,
3808 'upload_date': upload_date,
3813 'description': None,
# Format selection: list / best / worst / all / a specific format string.
3817 if self._downloader.params.get('listformats', None):
3818 self._print_formats(formats)
3821 req_format = self._downloader.params.get('format', None)
3822 self.to_screen(u'Format: %s' % req_format)
3824 if req_format is None or req_format == 'best':
3826 elif req_format == 'worst':
3827 return [formats[-1]]
3828 elif req_format in ('-1', 'all'):
3831 format = self._specific( req_format, formats )
3833 self._downloader.report_error(u'requested format not available')
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3839 class PornotubeIE(InfoExtractor):
3840     """Information extractor for pornotube.com."""
3841 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3843 def _real_extract(self, url):
3844 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3846 self._downloader.report_error(u'invalid URL: %s' % url)
# Both id and title come straight from the URL itself.
3849 video_id = mobj.group('videoid')
3850 video_title = mobj.group('title')
3852 # Get webpage content
3853 webpage = self._download_webpage(url, video_id)
# The flv URL is embedded (percent-encoded) in the player's JS config.
3856 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3857 result = re.search(VIDEO_URL_RE, webpage)
3859 self._downloader.report_error(u'unable to extract video url')
3861 video_url = compat_urllib_parse.unquote(result.group('url'))
3863 #Get the uploaded date
3864 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3865 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message says "title" but this path concerns the date.
3867 self._downloader.report_error(u'unable to extract video title')
3869 upload_date = result.group('date')
3871 info = {'id': video_id,
3874 'upload_date': upload_date,
3875 'title': video_title,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3881 class YouJizzIE(InfoExtractor):
3882     """Information extractor for youjizz.com."""
3883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3885 def _real_extract(self, url):
3886 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3888 self._downloader.report_error(u'invalid URL: %s' % url)
3891 video_id = mobj.group('videoid')
3893 # Get webpage content
3894 webpage = self._download_webpage(url, video_id)
3896 # Get the video title
3897 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3899 raise ExtractorError(u'ERROR: unable to extract video title')
3900 video_title = result.group('title').strip()
3902 # Get the embed page
# The real stream URL only appears on the embed page, not the watch page;
# note the numeric embed id replaces the slug-style watch-page id.
3903 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3905 raise ExtractorError(u'ERROR: unable to extract embed page')
3907 embed_page_url = result.group(0).strip()
3908 video_id = result.group('videoid')
3910 webpage = self._download_webpage(embed_page_url, video_id)
# Stream URL is passed to the flash player via so.addVariable("file", ...).
3913 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3915 raise ExtractorError(u'ERROR: unable to extract video url')
3916 video_url = result.group('source')
3918 info = {'id': video_id,
3920 'title': video_title,
3923 'player_url': embed_page_url}
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3927 class EightTracksIE(InfoExtractor):
3929 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3931 def _real_extract(self, url):
3932 mobj = re.match(self._VALID_URL, url)
3934 raise ExtractorError(u'Invalid URL: %s' % url)
3935 playlist_id = mobj.group('id')
3937 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as JSON assigned to PAGE.mix in the page's JS.
3939 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3941 raise ExtractorError(u'Cannot find trax information')
3942 json_like = m.group(1)
3943 data = json.loads(json_like)
# Random session token required by the 8tracks play API.
3945 session = str(random.randint(0, 1000000000))
3947 track_count = data['tracks_count']
# NOTE(review): `mix_id` is used here but its assignment is elided from this
# listing — presumably taken from `data`; confirm against the full file.
3948 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3949 next_url = first_url
# The API only serves one track at a time; loop until at_last_track is set.
3951 for i in itertools.count():
3952 api_json = self._download_webpage(next_url, playlist_id,
3953 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3954 errnote=u'Failed to download song information')
3955 api_data = json.loads(api_json)
3956 track_data = api_data[u'set']['track']
3958 'id': track_data['id'],
3959 'url': track_data['track_file_stream_url'],
3960 'title': track_data['performer'] + u' - ' + track_data['name'],
3961 'raw_title': track_data['name'],
3962 'uploader_id': data['user']['login'],
3966 if api_data['set']['at_last_track']:
3968 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3971 class KeekIE(InfoExtractor):
3972 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3975 def _real_extract(self, url):
3976 m = re.match(self._VALID_URL, url)
3977 video_id = m.group('videoID')
# Both video and thumbnail URLs are derived directly from the id on the CDN.
3978 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3979 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3980 webpage = self._download_webpage(url, video_id)
3981 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3982 title = unescapeHTML(m.group('title'))
3983 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3984 uploader = clean_html(m.group('uploader'))
3990 'thumbnail': thumbnail,
3991 'uploader': uploader
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3995 class TEDIE(InfoExtractor):
3996 _VALID_URL=r'''http://www.ted.com/
3998 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4000 ((?P<type_talk>talks)) # We have a simple talk
4002 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
4006 def suitable(cls, url):
4007     """Receives a URL and returns True if suitable for this IE."""
4008 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Dispatch: single talk vs playlist, decided by which named group matched.
4010 def _real_extract(self, url):
4011 m=re.match(self._VALID_URL, url, re.VERBOSE)
4012 if m.group('type_talk'):
4013 return [self._talk_info(url)]
4015 playlist_id=m.group('playlist_id')
4016 name=m.group('name')
4017 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4018 return [self._playlist_videos_info(url,name,playlist_id)]
4020 def _talk_video_link(self,mediaSlug):
4021     '''Returns the video link for that mediaSlug'''
4022 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4024 def _playlist_videos_info(self,url,name,playlist_id=0):
4025     '''Returns the videos of the playlist'''
# Multi-line verbose regex capturing each talk's id and media slug.
4027 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4028 ([.\s]*?)data-playlist_item_id="(\d+)"
4029 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4031 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4032 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4033 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4034 m_names=re.finditer(video_name_RE,webpage)
4036 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4037 m_playlist = re.search(playlist_RE, webpage)
4038 playlist_title = m_playlist.group('playlist_title')
# Each entry is deferred to TEDIE itself via url_result (re-extracted later).
4040 playlist_entries = []
4041 for m_video, m_name in zip(m_videos,m_names):
4042 video_id=m_video.group('video_id')
4043 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4044 playlist_entries.append(self.url_result(talk_url, 'TED'))
4045 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4047 def _talk_info(self, url, video_id=0):
4048     """Return the video for the talk in the url"""
4049 m=re.match(self._VALID_URL, url,re.VERBOSE)
4050 videoName=m.group('name')
4051 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4052 # If the url includes the language we get the title translated
4053 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4054 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric video id and the media slug.
4055 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4056 "id":(?P<videoID>[\d]+).*?
4057 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4058 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4059 thumb_match=re.search(thumb_RE,webpage)
4060 info_match=re.search(info_RE,webpage,re.VERBOSE)
4061 video_id=info_match.group('videoID')
4062 mediaSlug=info_match.group('mediaSlug')
4063 video_url=self._talk_video_link(mediaSlug)
4069 'thumbnail': thumb_match.group('thumbnail')
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
4073 class MySpassIE(InfoExtractor):
4074 _VALID_URL = r'http://www.myspass.de/.*'
4076 def _real_extract(self, url):
4077 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4079 # video id is the last path element of the URL
4080 # usually there is a trailing slash, so also try the second but last
4081 url_path = compat_urllib_parse_urlparse(url).path
4082 url_parent_path, video_id = os.path.split(url_path)
# Fallback: with a trailing slash, os.path.split yields an empty last element.
4084 _, video_id = os.path.split(url_parent_path)
# All metadata comes from a dedicated XML endpoint keyed by the video id.
4087 metadata_url = META_DATA_URL_TEMPLATE % video_id
4088 metadata_text = self._download_webpage(metadata_url, video_id)
4089 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4091 # extract values from metadata
4092 url_flv_el = metadata.find('url_flv')
4093 if url_flv_el is None:
4094 self._downloader.report_error(u'unable to extract download url')
4096 video_url = url_flv_el.text
4097 extension = os.path.splitext(video_url)[1][1:]
4098 title_el = metadata.find('title')
4099 if title_el is None:
4100 self._downloader.report_error(u'unable to extract title')
4102 title = title_el.text
# format_id, description and imagePreview are optional elements in the XML.
4103 format_id_el = metadata.find('format_id')
4104 if format_id_el is None:
4107 format = format_id_el.text
4108 description_el = metadata.find('description')
4109 if description_el is not None:
4110 description = description_el.text
4113 imagePreview_el = metadata.find('imagePreview')
4114 if imagePreview_el is not None:
4115 thumbnail = imagePreview_el.text
4124 'thumbnail': thumbnail,
4125 'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Scrapes the title from the HTML page, then reads filename and
    duration from a per-video XML document on video2.spiegel.de.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        # The numeric id is captured by the URL pattern itself.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # NOTE(review): a guard line (likely `if m is None:`) appears to be
        # elided from this excerpt; the raise is presumably conditional.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML document listing the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Uses the document's last child — presumably the preferred
        # format entry; verify against the actual XML schema.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # NOTE(review): the surrounding return/info-dict lines are elided
        # from this excerpt; these entries belong to the returned info dict.
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages.

    Pulls the media URL from an embedded `file: "..."` player setting and
    title/description/uploader from og: meta tags and page markup.
    """

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): guard lines (e.g. `if mobj is None:` / `if m is
        # None:`) appear elided from this excerpt before each
        # report_error call below; they are presumably conditional.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        self._downloader.report_error(u'unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        self._downloader.report_error(u'Cannot find video title')
        # Strip the site's "LiveLeak.com -" prefix from the og:title value.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1))
        # NOTE(review): the surrounding return/info-dict lines are elided
        # from this excerpt; these entries belong to the returned info dict.
        'description': desc,
        'uploader': uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / DasErste Mediathek.

    Parses `mediaCollection.addMediaStream(...)` calls out of the page to
    collect available streams, then picks the highest-quality one of
    media type 0; the result is either an RTMP stream or an HTTP .mp4.
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline; used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # One addMediaStream(...) call: media type, quality, rtmp url, video url.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # NOTE(review): branch lines appear elided from this excerpt; the
        # two video_id assignments below are presumably alternatives
        # depending on whether a documentId= query parameter is present.
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1)
        video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # NOTE(review): a guard (likely `if not streams:`) appears elided
        # here; age-restricted ("fsk") pages expose no streams.
        assert '"fsk"' in html
        self._downloader.report_error(u'this video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        # NOTE(review): an `else:` line appears elided here; the two lines
        # below presumably form the plain-HTTP branch.
        assert stream["video_url"].endswith('.mp4')
        info["url"] = stream["video_url"]
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr-hosted videos.

    Normalizes the URL to the canonical post URL, then scrapes the
    JS-escaped player markup for the video file URL, extension,
    thumbnail and title.
    """
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonical post URL; works for both /post/ and /video/ links.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped, hence the \x22 (= '"') markers.
        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): a guard (likely `if video is None:`) appears elided
        # from this excerpt before this message.
        self.to_screen("No video founded")
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        # NOTE(review): the remaining info-dict entries are elided from
        # this excerpt.
        return [{'id': video_id,
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
    # NOTE(review): most of the extractor list, the docstring close and
    # the `return [` opener are elided from this excerpt; the three
    # entries above are part of the returned ordered list.
def get_info_extractor(ie_name):
    """Look up an info extractor class by its bare name.

    The conventional 'IE' suffix is appended and the resulting identifier
    is resolved in this module's namespace; a KeyError propagates when no
    such extractor class exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]