967f6a1003ab9ca858d90d12ea762b0a13183d57
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Base class for all information extractors.

    An information extractor (IE) takes a URL and produces metadata about
    the video(s) it refers to: the real media URL, the title, the uploader
    and so on.  Each result is a dictionary which is handed to the
    FileDownloader; the FileDownloader processes this information, possibly
    downloading the video to the file system, among other outcomes.

    Mandatory keys in every result dictionary:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Optional keys:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    All fields should be Unicode strings.

    Subclasses should define a _VALID_URL regexp and redefine the
    _real_initialize() and _real_extract() methods; _real_extract() must
    return a *list* of information dictionaries as described above.
    Probably, they should also be added to the list of extractors.

    Broken IEs should set the _WORKING attribute to False in order to warn
    the users and skip the tests.
    """

    # Class-level defaults; _ready flips to True after initialize() runs.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Create the extractor, optionally attaching a downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Return True if this IE can handle the given URL."""
        return bool(re.match(cls._VALID_URL, url))

    @classmethod
    def working(cls):
        """Return whether this IE is known to be functional."""
        return cls._WORKING

    def initialize(self):
        """Run one-time setup (authentication, etc.) on first use."""
        if self._ready:
            return
        self._real_initialize()
        self._ready = True

    def extract(self, url):
        """Extract URL information and return it as a list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Attach the downloader this IE reports progress to."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the display name from the class name, dropping the 'IE' suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Open the URL and return the response handle.

        note=None prints the default download message, note=False stays
        silent, and any other note is printed as-is.
        """
        if note is not False:
            if note is None:
                self.report_download_webpage(video_id)
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download the page and return its body decoded to a string."""
        handle = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = handle.headers.get('Content-Type', '')
        charset_match = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # Fall back to UTF-8 when the server does not declare a charset.
        encoding = charset_match.group(1) if charset_match else 'utf-8'
        raw_bytes = handle.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                # A plain string URL has no get_full_url()
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(raw_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return raw_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Helpers for #608: they tag results with the proper '_type' value.
    def video_result(self, video_info):
        """Tag video_info as a plain video result and return it."""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Return a result pointing at another page to be processed."""
        #TODO: ie should be the class used for getting the info
        return {'_type': 'url',
                'url': url,
                'ie_key': ie}

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Return a playlist result wrapping the given entries."""
        result = {'_type': 'playlist',
                  'entries': entries}
        if playlist_id:
            result['id'] = playlist_id
        if playlist_title:
            result['title'] = playlist_title
        return result
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag (format code) -> file extension; anything missing defaults to 'flv'.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # Maps itag -> 'heightxwidth' string used in --list-formats output.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Step aside for URLs claimed by the playlist extractor, so that
        # playlist links are not treated as single videos.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report the check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report a subtitle download attempt for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return a dict mapping lang_code -> language name for the video's
        subtitles, or an (error_message, None) tuple on failure.

        Callers must check isinstance(result, tuple) to detect the error case.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages for video_id."""
        # NOTE(review): if _get_available_subtitles returned an error tuple,
        # report_video_subtitles_available will fail on .keys() — confirm
        # callers only hit this path for videos with subtitles.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        Language preference: explicit --sub-lang, then 'en', then the first
        language the video offers.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of
        (error_message, sub_lang, sub) tuples, one per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print each itag with its extension and dimensions (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language and, when credentials are available,
        log in to YouTube and confirm the age gate."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # GALX and dsh are anti-forgery tokens scraped from the login form
        # and echoed back in the login POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, the credentials
            # were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Extract the 11-character-style video ID from a YouTube URL.

        Returns None (after reporting an error) when the URL does not match
        _VALID_URL; group(2) is the ID capture group of that pattern.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Unescape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try the el= variants in turn until one response
        # contains a 'token', which marks a usable info page.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            # Normalize separators to spaces before trying the date formats.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except:
                    # NOTE(review): bare except swallows every error here, not
                    # just ValueError from an unmatched format; it also keeps
                    # looping after a successful parse (later formats then
                    # fail harmlessly on the already-normalized value).
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes every stream entry carries a 'sig'
            # parameter; entries filtered above are only checked for
            # 'itag'/'url', so a missing 'sig' raises KeyError — confirm
            # against the current get_video_info response shape.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age so later requests work."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age: POST the family-filter form with filtering disabled
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader from a metacafe watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob embedded in the page
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # _download_webpage returns a unicode string, so regex groups are
        # already text: the old .decode('utf-8') calls raised AttributeError
        # on Python 3 and implicitly ASCII-encoded (crashing on non-ASCII
        # titles) on Python 2.
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
786
787
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        """Extract video metadata from a Dailymotion watch page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the path component up to the first '_' or '?'
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the best quality key present in the flashvars blob
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # official-user markup; missing uploader is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date comes as DD-MM-YYYY; rearrange to YYYYMMDD
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
868
869
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the direct .flv URL, title and uploader from a photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # _download_webpage (as used by the sibling extractors) returns a
        # unicode string; the previous raw urlopen().read() returned bytes,
        # which broke the str-pattern regexes and the .decode('utf-8') calls
        # on Python 3.
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
922
923
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract the playlist media URL and metadata for a Yahoo! video.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form and
        re-extracted with new_video=False.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # Group 1 of this regex is the (people|profile) alternation; the
        # uploader's display name is captured by group 2.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (required by the playlist request)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1054
1055
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract metadata and a play_redirect download URL for a Vimeo clip."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https and strip direct-link redirects
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.  Only the failures
        # this parsing can actually produce are caught: IndexError when the
        # ' = {config:' marker is absent and ValueError when the JSON is
        # malformed.  (The previous bare `except:` also swallowed
        # KeyboardInterrupt/SystemExit.)
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1166
1167
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<digits>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body, or None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # urlopen raises ValueError for malformed URL strings
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and collect the groups named in *matchTuples*.

        matchTuples is a list of (group_index, key, error_message) triples;
        each matched group is stored under *key* in the returned dict.
        Returns None when the page does not match or a group is missing.
        """
        page = self.fetch_webpage(url)
        # NOTE(review): if fetch_webpage failed, page is None here and
        # re.search raises TypeError rather than reporting a clean error.
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP parameters of a live stream.

        NOTE(review): the final video_url is computed but never returned or
        stored, so this method has no observable result — live streams are
        effectively unsupported by this extractor.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of Arte+7 redirect pages down to the video data.

        The page points at a videoref XML, which points at a per-language
        XML, which finally contains the id/title/date and the HD media URL.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): the .decode('utf-8') below assumes the fetched pages
        # are byte strings (Python 2); on Python 3 this would fail — confirm.
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Plus7 extraction path."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages take the (non-functional, see above) live path and
        # deliberately return no results.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1291
1292
1293 class GenericIE(InfoExtractor):
1294     """Generic last-resort information extractor."""
1295
1296     _VALID_URL = r'.*'
1297     IE_NAME = u'generic'
1298
1299     def report_download_webpage(self, video_id):
1300         """Report webpage download."""
1301         if not self._downloader.params.get('test', False):
1302             self._downloader.report_warning(u'Falling back on generic information extractor.')
1303         super(GenericIE, self).report_download_webpage(video_id)
1304
1305     def report_following_redirect(self, new_url):
1306         """Report information extraction."""
1307         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1308
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request and follows redirects; returns the final URL if
        it differs from *url*, otherwise False.
        """
        # Request subclass that performs HEAD instead of GET
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        # NOTE: handler order matters — the 405 fallback must come before the
        # redirect handler so a rejected HEAD is retried as GET first.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # No redirect happened: the caller should proceed with the original URL
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1362
1363     def _real_extract(self, url):
1364         new_url = self._test_redirect(url)
1365         if new_url: return [self.url_result(new_url)]
1366
1367         video_id = url.split('/')[-1]
1368         try:
1369             webpage = self._download_webpage(url, video_id)
1370         except ValueError as err:
1371             # since this is the last-resort InfoExtractor, if
1372             # this error is thrown, it'll be thrown here
1373             self._downloader.report_error(u'Invalid URL: %s' % url)
1374             return
1375
1376         self.report_extraction(video_id)
1377         # Start with something easy: JW Player in SWFObject
1378         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1379         if mobj is None:
1380             # Broaden the search a little bit
1381             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1382         if mobj is None:
1383             # Broaden the search a little bit: JWPlayer JS loader
1384             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1385         if mobj is None:
1386             self._downloader.report_error(u'Invalid URL: %s' % url)
1387             return
1388
1389         # It's possible that one of the regexes
1390         # matched, but returned an empty group:
1391         if mobj.group(1) is None:
1392             self._downloader.report_error(u'Invalid URL: %s' % url)
1393             return
1394
1395         video_url = compat_urllib_parse.unquote(mobj.group(1))
1396         video_id = os.path.basename(video_url)
1397
1398         # here's a fun little line of code for you:
1399         video_extension = os.path.splitext(video_id)[1][1:]
1400         video_id = os.path.splitext(video_id)[0]
1401
1402         # it's tempting to parse this further, but you would
1403         # have to take into account all the variations like
1404         #   Video Title - Site Name
1405         #   Site Name | Video Title
1406         #   Video Title - Tagline | Site Name
1407         # and so on and so forth; it's just not practical
1408         mobj = re.search(r'<title>(.*)</title>', webpage)
1409         if mobj is None:
1410             self._downloader.report_error(u'unable to extract title')
1411             return
1412         video_title = mobj.group(1)
1413
1414         # video uploader is domain name
1415         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1416         if mobj is None:
1417             self._downloader.report_error(u'unable to extract title')
1418             return
1419         video_uploader = mobj.group(1)
1420
1421         return [{
1422             'id':       video_id,
1423             'url':      video_url,
1424             'uploader': video_uploader,
1425             'upload_date':  None,
1426             'title':    video_title,
1427             'ext':      video_extension,
1428         }]
1429
1430
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles pseudo-URLs of the form ytsearch:Q, ytsearchN:Q and
    ytsearchall:Q, resolving them to up to _max_youtube_results videos
    via the GData API.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and return the requested results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result list was previously discarded here
            # (missing return), so ytsearchall: queries yielded nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            video_ids.extend(video['id'] for video in api_response['items'])

            # The API reports the true total; never page past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        return [self.url_result('http://www.youtube.com/watch?v=%s' % vid, 'Youtube')
                for vid in video_ids]
1501
1502
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self.to_screen(u'query "%s": Downloading page %s'
                       % (query.decode(preferredencoding()), pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and queue the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Empty prefix: a single result; 'all': the maximum allowed.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        collected = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect distinct video ids until we have the n requested.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in collected:
                    continue
                collected.append(candidate)
                if len(collected) == n:
                    for video_id in collected:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                    return

            # No "next page" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in collected:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum += 1
1580
1581
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        self.to_screen(u'query "%s": Downloading page %s'
                       % (query.decode(preferredencoding()), pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and queue the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Empty prefix: a single result; 'all': the maximum allowed.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        try:
            n = int(prefix)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return
        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        if n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            try:
                page = compat_urllib_request.urlopen(compat_urllib_request.Request(result_url)).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect distinct video ids until we have the n requested.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in already_seen:
                    continue
                video_ids.append(candidate)
                already_seen.add(candidate)
                if len(video_ids) == n:
                    for video_id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                    return

            # No "Next" link: queue whatever was found and stop.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum += 1
1663
1664
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # The pattern is written in verbose mode, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData API and return one playlist result."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The id is captured by one of two alternations in _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            feed = response['feed']
            playlist_title = feed['title']['$t']
            if 'entry' not in feed:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            entries = feed['entry']
            # Keep (position, url) pairs so the final sort restores
            # playlist order regardless of API page boundaries.
            videos.extend((entry['yt$position']['$t'], entry['content']['src'])
                          for entry in entries if 'content' in entry)

            if len(entries) < self._MAX_RESULTS:
                break
            page_num += 1

        ordered = [pair[1] for pair in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in ordered]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1743
1744
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the distinct watch-page video ids found in page, in order."""
        found = []
        for match in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = match.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect every video id of the channel and return a playlist."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        pagenum = 1

        # The first page is plain HTML.
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        try:
            page = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids = []
        video_ids.extend(self.extract_videos_from_page(page))

        # Any further pages come from the JSON-based channel_ajax endpoint.
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                try:
                    page = compat_urllib_request.urlopen(compat_urllib_request.Request(url)).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # Stop once the widget no longer offers a "load more" link.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, channel_id)]
1817
1818
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Return a playlist of every upload of the given user.

        The GData API caps each response at _GDATA_PAGE_SIZE ids, so we
        keep paging until a short (non-full) page marks the end.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect the distinct video ids found on this page.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A non-full page must be the last one - no need to query again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        url_results = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                       for video_id in video_ids]
        return [self.playlist_result(url_results, playlist_title = username)]
1886
1887
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Return a playlist of every video posted by the given user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # BUG FIX: a page without "data-users-id" used to raise an uncaught
        # AttributeError (None.group) - the except above only catches
        # network errors. Report a proper error instead.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file
                # (this used plain str before)
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1963
1964
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Resolve a depositfiles link to the real file URL and metadata."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        match = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if match is None or match.group(1) is None:
            # Try to figure out reason of the error.
            match = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if match is not None and match.group(1) is not None:
                restriction_message = re.sub(r'\s+', ' ', match.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = match.group(1)
        # Extension is whatever follows the last dot of the URL.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        match = re.search(r'<b title="(.*?)">', webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = match.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
2015
2016
2017 class FacebookIE(InfoExtractor):
2018     """Information Extractor for Facebook"""
2019
2020     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2021     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2022     _NETRC_MACHINE = 'facebook'
2023     IE_NAME = u'facebook'
2024
2025     def report_login(self):
2026         """Report attempt to log in."""
2027         self.to_screen(u'Logging in')
2028
2029     def _real_initialize(self):
2030         if self._downloader is None:
2031             return
2032
2033         useremail = None
2034         password = None
2035         downloader_params = self._downloader.params
2036
2037         # Attempt to use provided username and password or .netrc data
2038         if downloader_params.get('username', None) is not None:
2039             useremail = downloader_params['username']
2040             password = downloader_params['password']
2041         elif downloader_params.get('usenetrc', False):
2042             try:
2043                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2044                 if info is not None:
2045                     useremail = info[0]
2046                     password = info[2]
2047                 else:
2048                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2049             except (IOError, netrc.NetrcParseError) as err:
2050                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2051                 return
2052
2053         if useremail is None:
2054             return
2055
2056         # Log in
2057         login_form = {
2058             'email': useremail,
2059             'pass': password,
2060             'login': 'Log+In'
2061             }
2062         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2063         try:
2064             self.report_login()
2065             login_results = compat_urllib_request.urlopen(request).read()
2066             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2067                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2068                 return
2069         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2070             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2071             return
2072
2073     def _real_extract(self, url):
2074         mobj = re.match(self._VALID_URL, url)
2075         if mobj is None:
2076             self._downloader.report_error(u'invalid URL: %s' % url)
2077             return
2078         video_id = mobj.group('ID')
2079
2080         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2081         webpage = self._download_webpage(url, video_id)
2082
2083         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2084         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2085         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2086         if not m:
2087             raise ExtractorError(u'Cannot parse data')
2088         data = dict(json.loads(m.group(1)))
2089         params_raw = compat_urllib_parse.unquote(data['params'])
2090         params = json.loads(params_raw)
2091         video_data = params['video_data'][0]
2092         video_url = video_data.get('hd_src')
2093         if not video_url:
2094             video_url = video_data['sd_src']
2095         if not video_url:
2096             raise ExtractorError(u'Cannot find video URL')
2097         video_duration = int(video_data['video_duration'])
2098         thumbnail = video_data['thumbnail_src']
2099
2100         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2101         if not m:
2102             raise ExtractorError(u'Cannot find title in webpage')
2103         video_title = unescapeHTML(m.group(1))
2104
2105         info = {
2106             'id': video_id,
2107             'title': video_title,
2108             'url': video_url,
2109             'ext': 'mp4',
2110             'duration': video_duration,
2111             'thumbnail': thumbnail,
2112         }
2113         return [info]
2114
2115
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension out of a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points directly at a media file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Handles three cases: /play/ embed URLs (resolved via their redirect
        and re-extracted once), direct media URLs, and regular episode pages
        whose metadata is fetched through the site's JSON API.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose URL fragment carries the
        # file id; build the canonical URL from it and recurse once.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        # Ask the site for JSON metadata by appending skin=json.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # NOTE(review): the iTunes User-Agent appears to be required by the
        # JSON API — confirm before removing.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                # os.path.splitext returns str on Python 3; only decode when
                # we actually got bytes (bugfix: the former unconditional
                # .decode() raised AttributeError on Python 3).
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' object or bare.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2212
2213
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bugfix: this used to call self._download (nonexistent
            # attribute), raising AttributeError instead of reporting.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link reveals the media server path; the video itself
        # lives next to it as <video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2255
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrate identifiers, highest quality first.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Bitrate identifier -> container extension.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Bitrate identifier -> frame dimensions (for --list-formats output).
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose-mode syntax, so it must be
        # matched explicitly with re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Report download of the media configuration XML."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the episode index feed."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print one '<bitrate> : <ext> [<dimensions>]' line per format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Return one info dict per part of the requested episode or clip."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortcut names like :tds expand to the show's full-episodes page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "newest episode"; the site
            # redirects to it (handled after the download below).
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the URL we were redirected to for the episode name.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        # Find the mgid: URI of the Flash player embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The index feed lists every part (item) of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Each part has its own configuration XML listing the available
            # renditions (bitrate/URL pairs).
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # The feed only carries rtmp(e):// URLs; rewrite them to the
            # equivalent progressive-download HTTP location.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2447
2448
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video information from an escapistmagazine.com page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header,
            # defaulting to UTF-8 when none is declared.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # NOTE(review): the matches below are used unchecked — a page
        # missing any of these meta tags raises AttributeError on .group(1).
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config file location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        # presumably playlist[0] is a preroll/ad entry and the actual video
        # is at index 1 — TODO confirm against a live config file.
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2519
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # NOTE: flagged as not working.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video information via the metadata XML and f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        # First request: metadata XML (title, description, thumbnail and
        # the location of the HDS manifest).
        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # Second request: the f4m manifest, which names the media node and id.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Compose the direct fragment URL from the manifest host and the ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2586
2587
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The media URL is URL-encoded in the page's flv_url parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # The title comes from the <title> tag, minus the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The thumbnail URL is matched wholesale (group 0) from the page.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': thumb_match.group(0),
            'description': None,
        }]
2641
2642
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract track information from a soundcloud.com permalink."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        # (currently unused within this method)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink to the track's JSON metadata via the API.
        # NOTE(review): the client_id is hard-coded and may be revoked.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream definitions and pick the 128 kbps MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2708
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Extract one info dict per track of a soundcloud.com set."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        # (currently unused within this method)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set permalink to its JSON metadata via the API.
        # NOTE(review): the client_id is hard-coded and may be revoked.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        # Fetch the stream definitions for every track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # NOTE(review): a failure here aborts the whole set and
                # returns None, discarding tracks already collected —
                # 'continue' may be the intended behavior.
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2782
2783
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP video URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded inside the page's javascript.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title comes from an inline javascript assignment.
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # Derive id and extension from the final path component of the URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2833
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        When the format entry maps bitrates to url lists, pick the requested
        bitrate (or the highest for 'best'/unknown); when there is no bitrate
        level at all, the entry itself is the url list.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: re.match on a text URL already yields text groups; the old
        # .decode('utf-8') calls here raised AttributeError on Python 3.
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decode the raw bytes explicitly for Python 3 < 3.6)
        json_data = json.loads(jsonData.decode('utf-8'))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe formats until one of them has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2941
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on the URL shape: a single video page (course+video),
        a course page (course only), or the site root (neither).

        Course and root pages are treated as playlists: every linked
        page is fed back through self.extract(), so the return value is
        the concatenated extraction results of all referenced pages.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                # Either <title> or <videoFile> is missing from the XML.
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            # Course title; fall back to the course id when absent.
            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Collect each linked video page (deduplicated, order kept)
            # and re-dispatch it through the extractor machinery.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Every linked course page is extracted recursively, so the
            # root page expands to the whole site's videos.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3045
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # NOTE: _download_webpage returns decoded text already; the old
        # .decode('iso-8859-1') calls on the matched groups raised
        # AttributeError on Python 3 and were dropped.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed message: previously read 'unable to mtvn_uri'.
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3121
3122
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com"""

    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-random session id: millisecond timestamp + two random numbers."""
        now_ms = int(time.time() * 1000)
        return "%d%d%d" % (now_ms, random.randint(1000, 1998), random.randint(1000, 9999))

    def _get_file_ID_mix_string(self, seed):
        """Return the seed-shuffled alphabet used to decode file ids."""
        chars = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        shuffled = []
        # Linear-congruential shuffle: draw one character per round until
        # the pool is exhausted (index depends on the remaining pool size).
        while chars:
            seed = (seed * 211 + 30031) % 65536
            idx = int(math.floor(seed / 65536 * len(chars)))
            shuffled.append(chars.pop(idx))
        return shuffled

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(tok)] for tok in fileId.split('*') if tok)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata.decode('utf-8'))
            entry = config['data'][0]

            video_title = entry['title']
            seed = entry['seed']

            requested = self._downloader.params.get('format', None)
            available = list(entry['streamfileids'].keys())

            # Map the requested format onto Youku's stream names.
            if requested is None or requested == 'best':
                format = 'hd2' if 'hd2' in available else 'flv'
                ext = u'flv'
            elif requested == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = entry['streamfileids'][format]
            keys = [s['k'] for s in entry['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8-9 of the decoded file id carry the segment number
        # and are substituted per segment below.
        files_info = []
        for index, key in enumerate(keys):
            seg_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, seg_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
3224
3225
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Fetch and decode the page by hand (no _download_webpage helper here).
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The percent-encoded flv URL is embedded in the player parameters.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3280
3281
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Fixed: bail out instead of falling through to mobj.group(1)
            # on None, which raised AttributeError.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Fixed: bail out instead of indexing an empty list below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3402
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The media file lives on Turner's CDN under the same path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed: key was misspelled 'uploader_date'; the documented
            # result field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3438
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        """Download one API page and return (item_count, [info dicts]).

        Always returns a 2-tuple. On error it reports and returns (0, []):
        previously it returned None, which crashed the tuple unpacking in
        _real_extract with a TypeError.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            # The API signals failure with a JSON object instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # A channel URL (one group matched) is paged through the archives
        # API; a single broadcast URL is fetched in one request.
        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3521
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raise instead of only reporting: execution previously fell
            # through to m.group() and crashed with AttributeError.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the document title if the player heading is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # Same fix as above: abort instead of crashing on m.group().
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3560
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com."""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Request the age-gate variant with a fixed birth date so the movie
        # definitions are present in the returned HTML.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # Movie definitions, display titles and thumbnails are assumed to
        # appear in the same order, so the three iterators are zipped.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Report and skip; do not emit an entry with an empty URL.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3604
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The flv file lives at a predictable CDN path keyed by the video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3626
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct media URLs are hosted on hw-videos.* and end in mp4/flv.
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fall back to a timestamped placeholder title.
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3682
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON blob assigned to gon.show.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3717
3718
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry in formats whose 'format' matches, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes resolution and bitrate,
            # e.g. "480p_370k_8004515" -> size "480p", bitrate "370k".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Test the lookup result itself; the stale 'result' variable from
            # the regex searches above let unknown formats return [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3835
3836
3837
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Report the field that actually failed (upload date, not title).
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3879
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        webpage = self._download_webpage(url, video_id)

        # The title comes from the watch page itself.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual player lives on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The file URL is passed to the flash player via addVariable.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3925
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON assignment to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # Each playback session uses a random identifier; the API hands out
        # one track at a time until it flags the last one.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        res = []
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3969
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN layout keyed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3993
class TEDIE(InfoExtractor):
    """Information extractor for TED talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # A single-talk page.
            return [self._talk_info(url)]
        # Otherwise the URL names a playlist of talks.
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _talk_video_link(self, mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self, url, name, playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        playlist_title = re.search(playlist_RE, webpage).group('playlist_title')

        # Each listed talk is delegated back to this extractor via url_result.
        playlist_entries = [
            self.url_result('http://www.ted.com%s' % m_name.group('talk_url'), 'TED')
            for m_video, m_name in zip(m_videos, m_names)
        ]
        return self.playlist_result(playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        videoName = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title = re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match = re.search(thumb_RE, webpage)
        info_match = re.search(info_RE, webpage, re.VERBOSE)
        video_id = info_match.group('videoID')
        mediaSlug = info_match.group('mediaSlug')
        return {
            'id': video_id,
            'url': self._talk_video_link(mediaSlug),
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail'),
        }
4071
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Fall back to the file extension; the undefined name 'ext' used
            # here before raised NameError whenever format_id was missing.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4127
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML document lists the available renditions.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last listed entry is the one selected for download.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4160
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Raise instead of only reporting: execution previously fell
            # through and crashed with AttributeError on m.group() below.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4209
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek / mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer the numeric documentId query parameter when the URL has one;
        # otherwise fall back to the last path component.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = re.match(self._VALID_URL, url).group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # Default media type (0) at the highest advertised quality.
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            # RTMP: endpoint URL plus the mp4 play path.
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            # Plain HTTP download of the mp4 file.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4249
class TumblrIE(InfoExtractor):
    """Extractor for videos embedded in Tumblr posts."""
    # Dots in the host name are escaped so e.g. "fooXtumblrXcom" no longer
    # matches; real Tumblr URLs always use literal dots, so this only narrows
    # the pattern to correct inputs.
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading the page.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The video URL and extension are embedded in an escaped (\x22-quoted)
        # HTML snippet inside the page's JavaScript.
        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # Fixed message typo ("founded" -> "found").
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster image as the thumbnail; missing posters no
        # longer crash the extractor with AttributeError.
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        m_thumb = re.search(re_thumb, webpage)
        thumb = m_thumb.group('thumb').replace('\\', '') if m_thumb else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos.
        # Fall back to the video id instead of crashing when it is absent.
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        m_title = re.search(re_title, webpage)
        title = unescapeHTML(m_title.group('title')) if m_title else video_id

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4283
4284
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Keep the classes in matching-priority order and instantiate them all at
    # the end; specific extractors come first, GenericIE is the catch-all.
    extractor_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    ]
    return [klass() for klass in extractor_classes]
4341
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention, so the class
    # can be looked up directly in this module's namespace.
    return globals()['%sIE' % ie_name]