SteamIE: accept urls with agecheck
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # True once _real_initialize() has completed for this instance
    _ready = False
    # The FileDownloader this IE reports to; set through set_downloader()
    _downloader = None
    # Subclasses set this to False to mark a broken IE
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # _real_initialize() runs at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name with its trailing "IE" stripped.
        # Subclasses may shadow this property with a plain class attribute
        # (e.g. YoutubeIE sets IE_NAME = u'youtube').
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None -> default message; note=False -> silent; else custom note.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): 'sys' is not imported at the top of this file; this
            # line relies on it leaking in via 'from .utils import *' -- verify.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour a charset declared in the Content-Type header; default UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        # With --dump-intermediate-pages, print the raw body as base64 so the
        # dump survives terminals regardless of the page's encoding.
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose regexp (must be matched with re.VERBOSE): group 1 captures the
    # URL prefix, group 2 the video ID -- _extract_id() reads group(2).
    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing defaults to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> video dimensions (as 'HEIGHTxWIDTH', used for --list-formats)
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs can also match _VALID_URL; let YoutubePlaylistIE win.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
252
    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being fetched."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report an attempt to download one subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
293
294     def _get_available_subtitles(self, video_id):
295         self.report_video_subtitles_download(video_id)
296         request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
297         try:
298             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
299         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
300             return (u'unable to download video subtitles: %s' % compat_str(err), None)
301         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
302         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
303         if not sub_lang_list:
304             return (u'video doesn\'t have subtitles', None)
305         return sub_lang_list
306
307     def _list_available_subtitles(self, video_id):
308         sub_lang_list = self._get_available_subtitles(video_id)
309         self.report_video_subtitles_available(video_id, sub_lang_list)
310
311     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
312         """
313         Return tuple:
314         (error_message, sub_lang, sub)
315         """
316         self.report_video_subtitles_request(video_id, sub_lang, format)
317         params = compat_urllib_parse.urlencode({
318             'lang': sub_lang,
319             'name': sub_name,
320             'v': video_id,
321             'fmt': format,
322         })
323         url = 'http://www.youtube.com/api/timedtext?' + params
324         try:
325             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
326         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
327             return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
328         if not sub:
329             return (u'Did not fetch video subtitles', None, None)
330         return (None, sub_lang, sub)
331
332     def _extract_subtitle(self, video_id):
333         """
334         Return a list with a tuple:
335         [(error_message, sub_lang, sub)]
336         """
337         sub_lang_list = self._get_available_subtitles(video_id)
338         sub_format = self._downloader.params.get('subtitlesformat')
339         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
340             return [(sub_lang_list[0], None, None)]
341         if self._downloader.params.get('subtitleslang', False):
342             sub_lang = self._downloader.params.get('subtitleslang')
343         elif 'en' in sub_lang_list:
344             sub_lang = 'en'
345         else:
346             sub_lang = list(sub_lang_list.keys())[0]
347         if not sub_lang in sub_lang_list:
348             return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
349
350         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
351         return [subtitle]
352
353     def _extract_all_subtitles(self, video_id):
354         sub_lang_list = self._get_available_subtitles(video_id)
355         sub_format = self._downloader.params.get('subtitlesformat')
356         if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357             return [(sub_lang_list[0], None, None)]
358         subtitles = []
359         for sub_lang in sub_lang_list:
360             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
361             subtitles.append(subtitle)
362         return subtitles
363
364     def _print_formats(self, formats):
365         print('Available formats:')
366         for x in formats:
367             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
368
    def _real_initialize(self):
        """Prepare the session: set language, optionally log in, confirm age.

        Credentials come from the 'username'/'password' downloader params or,
        with 'usenetrc', from the ~/.netrc entry for machine 'youtube'.
        Failures are reported as warnings/errors and abort quietly (return).
        """
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX and dsh tokens from the login form; they must
        # be echoed back in the login POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the response still contains the login form, authentication failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return
472
473     def _extract_id(self, url):
474         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
475         if mobj is None:
476             self._downloader.report_error(u'invalid URL: %s' % url)
477             return
478         video_id = mobj.group(2)
479         return video_id
480
481     def _real_extract(self, url):
482         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
483         mobj = re.search(self._NEXT_URL_RE, url)
484         if mobj:
485             url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
486         video_id = self._extract_id(url)
487
488         # Get video webpage
489         self.report_video_webpage_download(video_id)
490         url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
491         request = compat_urllib_request.Request(url)
492         try:
493             video_webpage_bytes = compat_urllib_request.urlopen(request).read()
494         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
495             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
496             return
497
498         video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
499
500         # Attempt to extract SWF player URL
501         mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
502         if mobj is not None:
503             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
504         else:
505             player_url = None
506
507         # Get video info
508         self.report_video_info_webpage_download(video_id)
509         for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
510             video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
511                     % (video_id, el_type))
512             video_info_webpage = self._download_webpage(video_info_url, video_id,
513                                     note=False,
514                                     errnote='unable to download video info webpage')
515             video_info = compat_parse_qs(video_info_webpage)
516             if 'token' in video_info:
517                 break
518         if 'token' not in video_info:
519             if 'reason' in video_info:
520                 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
521             else:
522                 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
523             return
524
525         # Check for "rental" videos
526         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
527             self._downloader.report_error(u'"rental" videos not supported')
528             return
529
530         # Start extracting information
531         self.report_information_extraction(video_id)
532
533         # uploader
534         if 'author' not in video_info:
535             self._downloader.report_error(u'unable to extract uploader name')
536             return
537         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
538
539         # uploader_id
540         video_uploader_id = None
541         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
542         if mobj is not None:
543             video_uploader_id = mobj.group(1)
544         else:
545             self._downloader.report_warning(u'unable to extract uploader nickname')
546
547         # title
548         if 'title' not in video_info:
549             self._downloader.report_error(u'unable to extract video title')
550             return
551         video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
552
553         # thumbnail image
554         if 'thumbnail_url' not in video_info:
555             self._downloader.report_warning(u'unable to extract video thumbnail')
556             video_thumbnail = ''
557         else:   # don't panic if we can't find it
558             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
559
560         # upload date
561         upload_date = None
562         mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
563         if mobj is not None:
564             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
565             format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
566             for expression in format_expressions:
567                 try:
568                     upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
569                 except:
570                     pass
571
572         # description
573         video_description = get_element_by_id("eow-description", video_webpage)
574         if video_description:
575             video_description = clean_html(video_description)
576         else:
577             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
578             if fd_mobj:
579                 video_description = unescapeHTML(fd_mobj.group(1))
580             else:
581                 video_description = u''
582
583         # subtitles
584         video_subtitles = None
585
586         if self._downloader.params.get('writesubtitles', False):
587             video_subtitles = self._extract_subtitle(video_id)
588             if video_subtitles:
589                 (sub_error, sub_lang, sub) = video_subtitles[0]
590                 if sub_error:
591                     self._downloader.report_error(sub_error)
592
593         if self._downloader.params.get('allsubtitles', False):
594             video_subtitles = self._extract_all_subtitles(video_id)
595             for video_subtitle in video_subtitles:
596                 (sub_error, sub_lang, sub) = video_subtitle
597                 if sub_error:
598                     self._downloader.report_error(sub_error)
599
600         if self._downloader.params.get('listsubtitles', False):
601             sub_lang_list = self._list_available_subtitles(video_id)
602             return
603
604         if 'length_seconds' not in video_info:
605             self._downloader.report_warning(u'unable to extract video duration')
606             video_duration = ''
607         else:
608             video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
609
610         # token
611         video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
612
613         # Decide which formats to download
614         req_format = self._downloader.params.get('format', None)
615
616         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
617             self.report_rtmp_download()
618             video_url_list = [(None, video_info['conn'][0])]
619         elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
620             url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
621             url_data = [compat_parse_qs(uds) for uds in url_data_strs]
622             url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
623             url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
624
625             format_limit = self._downloader.params.get('format_limit', None)
626             available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
627             if format_limit is not None and format_limit in available_formats:
628                 format_list = available_formats[available_formats.index(format_limit):]
629             else:
630                 format_list = available_formats
631             existing_formats = [x for x in format_list if x in url_map]
632             if len(existing_formats) == 0:
633                 raise ExtractorError(u'no known formats available for video')
634             if self._downloader.params.get('listformats', None):
635                 self._print_formats(existing_formats)
636                 return
637             if req_format is None or req_format == 'best':
638                 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
639             elif req_format == 'worst':
640                 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
641             elif req_format in ('-1', 'all'):
642                 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
643             else:
644                 # Specific formats. We pick the first in a slash-delimeted sequence.
645                 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
646                 req_formats = req_format.split('/')
647                 video_url_list = None
648                 for rf in req_formats:
649                     if rf in url_map:
650                         video_url_list = [(rf, url_map[rf])]
651                         break
652                 if video_url_list is None:
653                     raise ExtractorError(u'requested format not available')
654         else:
655             raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
656
657         results = []
658         for format_param, video_real_url in video_url_list:
659             # Extension
660             video_extension = self._video_extensions.get(format_param, 'flv')
661
662             video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
663                                               self._video_dimensions.get(format_param, '???'))
664
665             results.append({
666                 'id':       video_id,
667                 'url':      video_real_url,
668                 'uploader': video_uploader,
669                 'uploader_id': video_uploader_id,
670                 'upload_date':  upload_date,
671                 'title':    video_title,
672                 'ext':      video_extension,
673                 'format':   video_format,
674                 'thumbnail':    video_thumbnail,
675                 'description':  video_description,
676                 'player_url':   player_url,
677                 'subtitles':    video_subtitles,
678                 'duration':     video_duration
679             })
680         return results
681
682
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age so that filtered
        videos are served on subsequent requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract video url/title/uploader from a metacafe watch page.

        Returns a one-element list with the info dictionary (or a url_result
        redirect for 'yt-' ids), or None after reporting an error.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information.
        # _download_webpage returns the page already decoded to unicode,
        # so no .decode('utf-8') calls are needed below: calling decode on
        # an already-decoded string raises AttributeError on Python 3 and
        # breaks on non-ASCII text under Python 2 (this was a bug).
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fallback: the media URL and key live inside the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            # JSON-escaped slashes need unescaping.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
786
787
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    _WORKING = False

    def _real_extract(self, url):
        """Extract the best-quality video URL plus metadata from a
        Dailymotion video page."""
        # Pull the video id out of the URL; anything after '_' or '?' is
        # just a readability slug / query string, not part of the id.
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled so that
        # age-restricted videos are still served.
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # The player configuration lives in a javascript 'flashvars'
        # assignment embedded in the page.
        self.report_extraction(video_id)
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Probe the qualities from best to worst and keep the first one
        # present in the flashvars blob.
        max_quality = None
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        # JSON-escaped slashes need unescaping.
        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular "owner" markup first, then the official
        # user ("rel=author") markup; a missing uploader is only a warning.
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            m_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if m_official is not None:
                video_uploader = m_official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # Upload date appears as DD-MM-YYYY; convert to YYYYMMDD.
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
868
869
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the direct flv URL, title and uploader from a
        photobucket page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Use the shared helper, as the other extractors in this file do,
        # instead of calling urlopen() by hand: it reports progress, handles
        # errors, and returns the page already decoded to unicode — which
        # also makes the old .decode('utf-8') calls unnecessary (they would
        # raise on Python 3 and break on non-ASCII text under Python 2).
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        # Title and uploader share one <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
922
923
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a video.yahoo.com URL.

        Non-'/watch/' URLs are first rewritten to the canonical English
        '/watch/' form and re-extracted (new_video=False marks the single
        level of recursion).  Returns a one-element list with the info
        dictionary, or None after reporting an error.

        NOTE(review): the pages are read as raw bytes from urlopen(), so
        the .decode('utf-8') calls below assume Python 2 string semantics.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # BUG FIX: group(1) of the regex above is the '(people|profile)'
        # path component; the uploader's display name is captured by
        # group(2).  The old code returned 'people'/'profile' as uploader.
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request).
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1054
1055
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from a vimeo.com URL.

        Parses the player config JSON embedded in the page, picks the best
        available quality/codec and builds the play_redirect URL.  Returns
        a one-element list with the info dictionary, or None after
        reporting an error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https and resolve player redirect links
        # to the canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # BUG FIX: this used a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; `except Exception` keeps the same
        # best-effort fallback without hiding those.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (YYYY-MM-DD -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket in preference order.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1166
1167
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body, or None after
        reporting an error via the downloader."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # Request()/urlopen() raise ValueError for malformed URLs.
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map capture groups to dict keys.

        *matchTuples* is a list of (group_index, key, error_message)
        triples.  Returns {key: group_value} on success, or None after
        reporting an error when the regex or a required group fails.

        NOTE(review): if fetch_webpage() failed, *page* is None here and
        re.search() raises TypeError instead of reporting cleanly —
        confirm whether callers depend on the downloader aborting first.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player javascript to locate the rtmp URL.

        NOTE(review): video_url is computed but never returned or stored,
        and _real_extract() discards this method's result — live-stream
        extraction appears to be unfinished.
        """
        # The language ('fr'/'de') is encoded in the URL path.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 catch-up page to its video info dict.

        Follows two levels of indirection: the page's flash params point
        to a videoref file, which points to a per-language <video>
        document carrying id/title/date and the HD url.
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            # NOTE(review): assumes the matched title is a bytestring
            # (Python 2 semantics); .decode would fail on an
            # already-decoded page under Python 3.
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch between the live-stream and arte+7 extraction paths."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages ('index-NN.html') take the live path — which currently
        # yields nothing (see extractLiveStream); everything else is
        # treated as an arte+7 catch-up video.
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1291
1292
1293 class GenericIE(InfoExtractor):
1294     """Generic last-resort information extractor."""
1295
1296     _VALID_URL = r'.*'
1297     IE_NAME = u'generic'
1298
1299     def report_download_webpage(self, video_id):
1300         """Report webpage download."""
1301         if not self._downloader.params.get('test', False):
1302             self._downloader.report_warning(u'Falling back on generic information extractor.')
1303         super(GenericIE, self).report_download_webpage(video_id)
1304
1305     def report_following_redirect(self, new_url):
1306         """Report information extraction."""
1307         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1308
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # A Request that issues HEAD instead of GET, so redirects can
            # be followed without downloading any response body.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    # Any other status is treated as a hard error.
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Consume and close the failed response before retrying.
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                # Re-issue as a plain (GET) request through the same opener.
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Getting the same URL back means no redirect happened.
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1362
1363     def _real_extract(self, url):
1364         new_url = self._test_redirect(url)
1365         if new_url: return [self.url_result(new_url)]
1366
1367         video_id = url.split('/')[-1]
1368         try:
1369             webpage = self._download_webpage(url, video_id)
1370         except ValueError as err:
1371             # since this is the last-resort InfoExtractor, if
1372             # this error is thrown, it'll be thrown here
1373             self._downloader.report_error(u'Invalid URL: %s' % url)
1374             return
1375
1376         self.report_extraction(video_id)
1377         # Start with something easy: JW Player in SWFObject
1378         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1379         if mobj is None:
1380             # Broaden the search a little bit
1381             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1382         if mobj is None:
1383             # Broaden the search a little bit: JWPlayer JS loader
1384             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1385         if mobj is None:
1386             self._downloader.report_error(u'Invalid URL: %s' % url)
1387             return
1388
1389         # It's possible that one of the regexes
1390         # matched, but returned an empty group:
1391         if mobj.group(1) is None:
1392             self._downloader.report_error(u'Invalid URL: %s' % url)
1393             return
1394
1395         video_url = compat_urllib_parse.unquote(mobj.group(1))
1396         video_id = os.path.basename(video_url)
1397
1398         # here's a fun little line of code for you:
1399         video_extension = os.path.splitext(video_id)[1][1:]
1400         video_id = os.path.splitext(video_id)[0]
1401
1402         # it's tempting to parse this further, but you would
1403         # have to take into account all the variations like
1404         #   Video Title - Site Name
1405         #   Site Name | Video Title
1406         #   Video Title - Tagline | Site Name
1407         # and so on and so forth; it's just not practical
1408         mobj = re.search(r'<title>(.*)</title>', webpage)
1409         if mobj is None:
1410             self._downloader.report_error(u'unable to extract title')
1411             return
1412         video_title = mobj.group(1)
1413
1414         # video uploader is domain name
1415         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1416         if mobj is None:
1417             self._downloader.report_error(u'unable to extract title')
1418             return
1419         video_uploader = mobj.group(1)
1420
1421         return [{
1422             'id':       video_id,
1423             'url':      video_url,
1424             'uploader': video_uploader,
1425             'upload_date':  None,
1426             'title':    video_title,
1427             'ext':      video_extension,
1428         }]
1429
1430
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Supported query forms: "ytsearch:Q" (first result),
    "ytsearchall:Q" (up to _max_youtube_results results) and
    "ytsearchN:Q" (first N results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and delegate to _get_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: the result list used to be discarded here (missing
            # return), so "ytsearchall:" queries silently yielded nothing
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query via the GData API."""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # the API may report fewer total items than requested
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1501
1502
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and queue the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Empty prefix means one result, 'all' means the maximum,
        # anything else is interpreted as a result count.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _queue_downloads(self, video_ids):
        """Hand every collected video id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id in video_ids:
                    continue
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages; queue what we found
                self._queue_downloads(video_ids)
                return

            pagenum += 1
1580
1581
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and queue the requested downloads."""
        if re.match(self._VALID_URL, query) is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')

        # Empty prefix means one result, 'all' means the maximum,
        # anything else is interpreted as a result count.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
            return
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

    def _queue_downloads(self, video_ids):
        """Hand every collected video id to the downloader."""
        for video_id in video_ids:
            self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = match.group(1)
                if video_id in already_seen:
                    continue
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    self._queue_downloads(video_ids)
                    return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages; queue what we found
                self._queue_downloads(video_ids)
                return

            pagenum += 1
1663
1664
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so pass the flag here too
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # The id may come from either alternative of _VALID_URL
        playlist_id = mobj.group(1) or mobj.group(2)
        videos = []

        # Download playlist videos from API, one page at a time
        for page_num in itertools.count(1):
            self.report_download_page(playlist_id, page_num)

            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            api_url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            try:
                page = compat_urllib_request.urlopen(api_url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break
            entries = response['feed']['entry']

            # Keep (position, url) pairs so the playlist order can be restored
            videos.extend((entry['yt$position']['$t'], entry['content']['src'])
                          for entry in entries
                          if 'content' in entry)

            if len(entries) < self._MAX_RESULTS:
                break

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1743
1744
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the video ids linked from page, in order, without duplicates."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in ids_in_page:
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect every video id of a channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        # Download the first (plain HTML) channel page
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum += 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                payload = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(payload['content_html']))

                # The widget html tells us whether another page exists
                if self._MORE_PAGES_INDICATOR not in payload['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        video_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_entries, channel_id)]
1817
1818
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract the video identifiers of this page, skipping duplicates
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = mobj.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        video_urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(video_url, 'Youtube') for video_url in video_urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1886
1887
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Collect every video of a blip.tv user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id needed for the AJAX episode list is embedded in
        # the user page.
        # BUG FIX: mobj.group(1) used to be called without a None check inside
        # a try whose except clause does not catch AttributeError, so an
        # unexpected page layout crashed instead of reporting an error.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract blip.tv user id from webpage')
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # use compat_str for consistency with the rest of the file
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1963
1964
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Simulate the 'Free download' flow and extract the direct file URL."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        post_data = compat_urllib_parse.urlencode({'gateway_result': '1'})
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if mobj is None or mobj.group(1) is None:
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if mobj is not None and mobj.group(1) is not None:
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        # The extension is whatever follows the last dot of the URL
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2015
2016
2017 class FacebookIE(InfoExtractor):
2018     """Information Extractor for Facebook"""
2019
2020     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2021     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2022     _NETRC_MACHINE = 'facebook'
2023     IE_NAME = u'facebook'
2024
2025     def report_login(self):
2026         """Report attempt to log in."""
2027         self.to_screen(u'Logging in')
2028
2029     def _real_initialize(self):
2030         if self._downloader is None:
2031             return
2032
2033         useremail = None
2034         password = None
2035         downloader_params = self._downloader.params
2036
2037         # Attempt to use provided username and password or .netrc data
2038         if downloader_params.get('username', None) is not None:
2039             useremail = downloader_params['username']
2040             password = downloader_params['password']
2041         elif downloader_params.get('usenetrc', False):
2042             try:
2043                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2044                 if info is not None:
2045                     useremail = info[0]
2046                     password = info[2]
2047                 else:
2048                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2049             except (IOError, netrc.NetrcParseError) as err:
2050                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2051                 return
2052
2053         if useremail is None:
2054             return
2055
2056         # Log in
2057         login_form = {
2058             'email': useremail,
2059             'pass': password,
2060             'login': 'Log+In'
2061             }
2062         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2063         try:
2064             self.report_login()
2065             login_results = compat_urllib_request.urlopen(request).read()
2066             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2067                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2068                 return
2069         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2070             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2071             return
2072
    def _real_extract(self, url):
        """Extract the video info dict from a Facebook video URL.

        The watch page embeds the SWF setup as JavaScript; a JSON array of
        [name, value] pairs sits between two known code fragments, and its
        'params' entry (URL-encoded JSON) carries the actual video sources.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Normalize to the canonical video page before downloading.
        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON blob we want is bracketed by these two literal fragments
        # of the page's SWF bootstrap code.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON; video_data[0] describes the sources.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        # Human-readable title from the page header.
        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2114
2115
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    /play/ URLs are resolved by following the redirect and recursing on the
    canonical http://blip.tv/a/a-<id> URL.  Other URLs are re-requested with
    skin=json (masquerading as iTunes) to obtain the metadata, unless the
    server directly serves a video file (Content-Type video/*).
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report that the URL points straight at a media file."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # Player URLs redirect to a page whose fragment carries the real
            # file reference; extract the id and recurse on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves JSON metadata when the client claims to be iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # BUG FIX: on Python 3 splitext() yields str, which has no
                # decode(); only decode the Python 2 byte string.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # datestamp looks like '12-31-12 11:47PM'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2212
2213
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        """Extract the flv URL and title from a myvideo.de watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was self._download.report_error, which raised
            # AttributeError instead of reporting the invalid URL; the
            # attribute is named _downloader everywhere else in this file.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link leaks the media server base URL; the flv lives
        # next to the thumbnails under the same path.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2255
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Bitrates offered by the mediaGen feed, lowest-index = highest quality
    # is NOT implied; selection below simply takes the last entry.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Container extension per bitrate (all mp4 here).
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # Display dimensions per bitrate, used only by --list-formats output.
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose regex; the default
        # suitable() would not compile it with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        """Announce the download of the mediaGen configuration XML."""
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Announce the download of the episode's MRSS index."""
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrate/extension/dimension table (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract one info dict per video part of an episode or clip.

        Flow: resolve shortname/newest-episode redirects, find the mtvn
        media URI in the page, download the MRSS index listing the parts,
        then for each part fetch the mediaGen config, pick a bitrate, and
        rewrite the rtmp URL into a plain http one.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            # Abbreviations like :tds map to the show's full-episodes page.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode group means "newest episode": the server will
            # redirect us to a specific one below.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        # Each <item> in the MRSS feed corresponds to one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect (bitrate, rtmp-url) pairs from the config's renditions.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp URL into a direct http URL on the llnwd CDN;
            # only the path after gsp.comedystor/ is reused.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2447
2448
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Announce the download of the player configuration."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract video info from an escapistmagazine.com view page.

        The page's og:video meta tag points at the player, whose 'config='
        query parameter references a JSON-ish configuration holding the
        playlist with the actual media URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honor the charset declared in the Content-Type header,
            # defaulting to utf-8 when none is given.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Scrape description, thumbnail and player URL from the meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config URL is percent-encoded in the player URL's query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The second playlist entry holds the actual media URL.
        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
2519
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Extractor currently disabled (site changed); kept for reference.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract video info via the moogaloop metadata XML and f4m manifest.

        The metadata XML supplies title/description/thumbnail plus a
        manifest URL; the Adobe f4m manifest then yields the fragment URL
        components from which the final f4f URL is assembled.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore is required for the manifest request to succeed.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe f4m XML namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Assemble the fragment URL from the manifest host plus ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2586
2587
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from a watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media URL is percent-encoded in a flv_url query parameter.
        url_match = re.search(r'flv_url=(.+?)&', webpage)
        if url_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_match.group(1))

        # Title comes from the page <title>, trimmed before the site suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # The entire matched URL (group 0) is the thumbnail address.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
2641
2642
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com.

    The permalink is first resolved to a numeric track id through
    api.soundcloud.com/resolve.json; the stream table for that id is then
    fetched and the 128 kbit/s MP3 stream URL is returned.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Announce that the permalink is being resolved to a track id."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Uploader name and track slug are both encoded in the URL path.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)
        full_slug = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the permalink into track metadata (including its id).
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        resolve_request = compat_urllib_request.Request(resolv_url)
        try:
            info_json = compat_urllib_request.urlopen(resolve_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the stream table and pick the plain 128k MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_request = compat_urllib_request.Request(streams_url)
        try:
            stream_json = compat_urllib_request.urlopen(stream_request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date':  info['created_at'],
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2708
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        """Resolve a set permalink and return one info dict per track.

        Mirrors SoundcloudIE but resolves a /sets/ URL and then fetches the
        stream table for every track in the returned set.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title =  mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set permalink into its track list via the API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        # The API reports problems in an 'errors' list rather than via HTTP
        # status; surface each message and bail out.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Fetch each track's stream table and take the 128k MP3 stream.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  track['created_at'],
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2782
2783
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the RTMP stream URL, title and description of an InfoQ talk."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds a URL-quoted, base64-encoded
        # stream id in the 'jsclassref' javascript variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description (optional; keep a placeholder when absent)
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # BUG FIX: split on the *last* dot only. str.split('.') raised
        # ValueError ("too many values to unpack") whenever the filename
        # contained more than one dot.
        video_id, _, extension = video_filename.rpartition('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
2833
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        Picks the requested bitrate (or the highest available one) for
        the given format; formats without bitrate sub-dicts are returned
        as-is.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none respond."""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                pass # dead link; try the next candidate

        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate pair for --list-formats."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # BUG FIX: the matched groups are already text strings; the former
        # .decode('utf-8') calls raised AttributeError on Python 3
        # (str has no decode method).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Walk the formats until we find one with a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': (format_param is None and u'NA' or format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2941
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches three URL shapes: the site root, a course page
    # (?course=...), and a single video page (?course=...&video=...).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Extract a single video, or recursively expand a course page
        or the root page into the videos they reference."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each linked VideoPage URL is re-dispatched through
            # self.extract, which lands in the single-video branch above.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Recurse into every course page; each recursion in turn
            # expands into the individual videos of that course.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3045
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Extract song/performer metadata and the mediaGen rendition URL."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # BUG FIX: webpage (and hence the matched groups) is already a
        # text string; the former .decode('iso-8859-1') calls raised
        # AttributeError on Python 3 (str has no decode method).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # BUG FIX: message previously read 'unable to mtvn_uri'
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        # mediaGen returns an XML document listing the available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            # rendition 'type' is a MIME type like video/mp4; keep the subtype.
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
3121
3122
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Videos are split into numbered segments; the per-segment download
    URLs are derived from an obfuscated file id that must first be
    de-scrambled with a server-provided seed (see _get_file_id).
    """
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two
        # random numbers, concatenated as a decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet, drawing each
        # next character with a linear-congruential generator seeded by
        # the server-provided value. The result is the lookup table used
        # by _get_file_id; exact arithmetic/order must not be changed.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # De-scramble the file id: fileId is a '*'-separated list of
        # indices into the seed-shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format to Youku's internal format
            # names ('hd2' when available, otherwise 'flv'; 'mp4' for worst).
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            # One obfuscated file id plus one key per segment.
            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            # Splice the hex segment number into positions 8-9 of the id.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3224
3225
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xnxx video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Fetch and decode the page content
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The download URL is URL-quoted inside the player parameters.
        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]
3280
3281
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date (optional)
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (optional)
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # BUG FIX: was falling through and crashing on mobj.group(1)
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links of all sizes from the video page
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # BUG FIX: was falling through and crashing on links[-1]
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3402
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com / www.nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # First (HTML-unescaped) group of rexp in the page, or default.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: this key was misspelled 'uploader_date'; the metadata
            # field consumed downstream is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3438
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page; return (item_count, list_of_valid_infos).

        BUG FIX: error paths used to return None, which made the tuple
        unpacking in _real_extract raise TypeError; they now return
        (0, []) so pagination stops cleanly after reporting the error.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        response = json.loads(webpage)
        if not isinstance(response, list):
            # The API signals errors with a dict instead of a list.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: the archive listing is paginated.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short (or error) page means we reached the end.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3521
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # BUG FIX: previously only report_error was called and execution
            # fell through to m.group(), crashing with AttributeError.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> when the player headline is absent.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUG FIX: same fall-through crash as above.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3560
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailers (store.steampowered.com)."""
    _VALID_URL = r"""http://store.steampowered.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so we must match with
        # re.VERBOSE instead of relying on the default implementation.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Always go through the agecheck URL with a fixed birth date so that
        # age-gated game pages still return the full video list.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators are assumed to run in the same page order,
        # so zip pairs each movie entry with its title and thumbnail.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # BUG FIX: previously execution fell through and appended an
                # entry with an empty URL; skip the broken entry instead.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
3605
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # The FLV download lives on a predictable CDN path derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3627
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com videos."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        # CONSISTENCY FIX: use the common _download_webpage helper (which
        # handles errors and decoding) instead of a raw urlopen call.
        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)

        if mobj is not None:
            video_url = mobj.group()
            # Pick the extension from the matched media URL.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3683
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as JSON in an inline <script> tag.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(m.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream variant from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        image = data.get('image', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': image.get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3718
3719
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry in formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check; pre-set the cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 4th path component encodes resolution and bitrate,
            # e.g. "480p_370k_8004515".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # BUG FIX: this used to test the stale `result` variable (always
            # non-None at this point), so an unknown requested format slipped
            # through and the method returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3836
3837
3838
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken directly from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the error message previously said "video title",
            # which misreported what actually failed to parse.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3880
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Fetch the watch page to recover the title and the embed-page URL.
        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = embed_match.group(0).strip()
        # The numeric id on the embed page supersedes the slug from the URL.
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The embed page hands the stream URL to the flash player.
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3926
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes; returns one entry per track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a JSON blob assigned to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(m.group(1))

        # The play API serves one track per request for a given session id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        res = []
        next_url = first_url
        for i in itertools.count():
            api_json = self._download_webpage(
                next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track; stop paging there.
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
3970
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media and thumbnail URLs follow a fixed CDN layout keyed by id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title = unescapeHTML(
            re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage).group('title'))
        uploader = clean_html(
            re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage).group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3994
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so the match must be done
        # with re.VERBOSE here rather than via the default implementation.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch to single-talk or playlist extraction based on URL type.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Matches each talk's <li> entry carrying its data-id and media slug.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        # Matches each talk's title link (relative /talks/... URL).
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # Pair each metadata match with its title/url match (assumed to
        # appear in the same page order) and re-dispatch each talk URL
        # through this extractor via url_result.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Pull the numeric id and media slug out of the inline talkDetails JS.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # The direct mp4 download URL is derived from the media slug.
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4072
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata via the site's XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # BUG FIX: this branch referenced the undefined name `ext`,
            # raising NameError; fall back to the file extension instead.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4128
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # A per-video XML manifest lists the available encodings.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last <type> entry of the manifest.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4161
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUG FIX: previously there was no return here and execution
            # fell through to m.group(), crashing with AttributeError.
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4210
class ARDIE(InfoExtractor):
    """Information extractor for ARD Mediathek / mediathek.daserste.de videos."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # JS calls that register the available media streams on the page.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # Prefer the numeric documentId query parameter when present.
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # No streams registered usually means an FSK (age-rating) block.
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            # rtmpdump needs the mp4: play path separately from the server URL.
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4250
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in Tumblr posts."""
    # dots in the hostname are escaped so only real *.tumblr.com URLs match
    # (previously '.' matched any character)
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # canonicalize to the post URL (also normalizes /video/ links)
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # the video URL and container type appear as \x22-escaped HTML
        # attributes inside an inline script on the post page
        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster as thumbnail; a missing poster list is
        # tolerated (previously an unguarded .group() raised AttributeError)
        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'
        thumb_match = re.search(re_thumb, webpage)
        thumb = thumb_match.group('thumb').replace('\\', '') if thumb_match else None

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title_match = re.search(re_title, webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract title')
            return []
        title = unescapeHTML(title_match.group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext,
                 }]
4284
4285
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # Extractor classes in matching priority order; instantiated below.
    ie_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    )
    return [klass() for klass in ie_classes]
4342
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live
    # at module level, so a globals() lookup resolves them by name.
    class_name = '%sIE' % ie_name
    return globals()[class_name]