SoundcloudSetIE: Use upload_date in the unified format (fixes #812)
[youtube-dl] / youtube_dl / InfoExtractors.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import absolute_import
5
6 import base64
7 import datetime
8 import itertools
9 import netrc
10 import os
11 import re
12 import socket
13 import time
14 import email.utils
15 import xml.etree.ElementTree
16 import random
17 import math
18 import operator
19
20 from .utils import *
21
22
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Whether _real_initialize() has already run for this instance.
    _ready = False
    # FileDownloader instance; supplies params and screen/error output.
    _downloader = None
    # Broken IEs set this to False so users are warned and tests skipped.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Runs _real_initialize() at most once per instance.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: the class name minus its trailing "IE" suffix.
        # Subclasses may shadow this with a plain class attribute.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None prints the default "Downloading webpage" message;
        # note=False suppresses output entirely; any other value is printed.
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # NOTE(review): `sys` is not imported at the top of this file;
            # this presumably relies on it being re-exported by
            # `from .utils import *` — confirm.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Sniff the charset from the Content-Type header; fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # url_or_request may be a Request object or a plain URL string.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            # Base64 so binary/odd content survives the terminal.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' keeps extraction going even on mis-declared charsets.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        # id/title are only attached when provided (truthy).
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
185
186
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com.

    Matches watch pages, youtu.be short links, embed/v/e URLs and naked
    video IDs (see _VALID_URL).  _real_initialize() optionally logs in,
    forces the interface language to English and confirms age;
    _real_extract() pulls the video info via get_video_info and picks
    the requested format(s).
    """

    _VALID_URL = r"""^
                     (
                         (?:https?://)?                                       # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?                                                   # optional -> youtube.com/xxxx is OK
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                         # here is it! the YouTube video ID
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Service endpoints used during initialization (language, login, age gate).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of an age-verification/redirect URL's next_url param.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything missing defaults to 'flv' below.
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> dimensions string (apparently height x width) for --list-formats.
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs would also match _VALID_URL; let the playlist IE win.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download subtitles for one language/format."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name}, or a (error_message, None) tuple
        when the listing cannot be fetched or the video has no subtitles."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # The listing is XML; scrape name/lang_code attribute pairs directly.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages (for --list-subs)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]

        Language preference: --sub-lang option, then 'en', then the first
        available language.
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Fetch every available subtitle language; return a list of
        (error_message, sub_lang, sub) tuples (see _request_subtitle)."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print one 'itag : ext [dimensions]' line per format (for --list-formats)."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # GALX and dsh are hidden anti-forgery tokens scraped from the login
        # form; they must be echoed back in the POST below.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
          dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'_utf8': u'霱',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is served again, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url':     '/',
                'action_confirm':   'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID (group 2 of _VALID_URL), or None after
        reporting an error when the URL does not match."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            # Un-escape the JS-escaped slashes in the URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: retry with different 'el' parameters until a
        # response containing a 'token' is obtained.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the watch page, separators normalized to
        # spaces, then converted to YYYYMMDD by unified_strdate().
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description: prefer the page element, fall back to the meta tag.
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            if fd_mobj:
                video_description = unescapeHTML(fd_mobj.group(1))
            else:
                video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): this indexes ud['sig'] unconditionally — streams
            # without a plain 'sig' parameter would raise KeyError; confirm
            # all filtered entries carry one.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            # Optionally cap quality at format_limit, then keep only formats
            # actually present in the stream map.
            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        else:
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dict per selected format.
        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id':       video_id,
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
            })
        return results
676
677
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page and confirm age so that family-filtered
        videos are accessible to the subsequent extraction requests."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age by POSTing the family-filter form
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        YouTube-hosted videos (id prefixed with 'yt-') are delegated to the
        YouTube extractor via url_result().
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey (access token appended to the media URL) if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob, which carries the media URL
            # and key as JSON-ish text inside a query string
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # _download_webpage already returns a unicode string, so no extra
        # .decode('utf-8') is needed here (it crashed on Python 3 and on
        # non-ASCII titles on Python 2).
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
781
782
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Extract the best available media URL plus title, uploader and
        upload date from a Dailymotion video page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the URL slug up to the first '_' or '?'
        video_id = m.group(1).split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Fetch the page with the family filter disabled
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)

        # The media URLs live in a URL-encoded JS "flashvars" assignment
        m = re.search(r'\s*var flashvars = (.*)', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(m.group(1))

        # Pick the first (i.e. highest) quality key present
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        max_quality = next((k for k in quality_keys if k in flashvars), None)
        if max_quality is None:
            self._downloader.report_error(u'unable to extract video URL')
            return
        self.to_screen(u'Using %s' % max_quality)

        m = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if m is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(m.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        m = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(m.group('title'))

        # Uploader: try the regular owner markup first, then the
        # official-user markup; only warn if neither matches
        video_uploader = None
        m = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if m is not None:
            video_uploader = m.group(1)
        else:
            official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if official is not None:
                video_uploader = official.group(1)
            else:
                self._downloader.report_warning(u'unable to extract uploader nickname')

        # The page shows the date as DD-MM-YYYY; convert to YYYYMMDD
        video_upload_date = None
        m = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if m is not None:
            video_upload_date = m.group(3) + m.group(2) + m.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
862
863
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Extract the direct flv URL, title and uploader from a
        photobucket media page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information.
        # Use the shared helper so the page arrives decoded to unicode,
        # consistent with the other extractors in this file (the previous
        # raw urlopen().read() returned bytes, which breaks the str-pattern
        # regexes below on Python 3).
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        # Matches are already unicode; the old .decode('utf-8') calls were
        # redundant on Python 2 and crash on Python 3
        video_title = mobj.group(1)

        video_uploader = mobj.group(2)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
916
917
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # Marked broken upstream; kept for reference until rewritten
    _WORKING = False
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def _real_extract(self, url, new_video=True):
        """Extract video information.

        Non-/watch/ URLs are first rewritten to their canonical /watch/
        form and re-extracted once (with new_video=False).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            try:
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
                return

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract id field')
                return
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            if mobj is None:
                self._downloader.report_error(u'Unable to extract vid field')
                return
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video uploader')
            return
        # group(1) is only the literal 'people'/'profile' path component;
        # the uploader's display name is captured by group(2)
        video_uploader = mobj.group(2).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video description')
            return
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width (needed for the playlist request below)
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video height')
            return
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video width')
            return
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract media URL')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        return [{
            'id':       video_id.decode('utf-8'),
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
        }]
1048
1049
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def _real_extract(self, url, new_video=True):
        """Extract video information from the page's embedded config JSON,
        picking the best of the advertised codec/quality combinations."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force https and resolve direct player links
        # to the canonical video page
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's javascript
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except Exception:
            # Was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit; only parsing/lookup failures belong here.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            else:
                self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date (ISO date in a meta tag -> YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty quality bucket, preferring hd > sd > other
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
1160
1161
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download *url* and return the raw response body.

        On network errors or invalid URLs the error is reported through the
        downloader and None is returned (implicitly).
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, match *regex* (with *regexFlags*) against the page and
        collect the groups described by *matchTuples* -- a list of
        (group_index, key, error_message) -- into a dict keyed by *key*.

        Returns None (after reporting the appropriate error) when the page
        does not match or a required group is missing.
        NOTE(review): callers below use the result with .get() without a
        None check, so a failed match raises AttributeError -- confirm
        whether that is the intended failure mode.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the videothek JS to locate the live rtmp stream.

        NOTE(review): the final video_url is computed but never returned,
        and _real_extract discards this call's result, so the live-stream
        path currently yields no downloadable info -- confirm before
        relying on it.
        """
        # The language code is a fixed component of the URL path
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an arte+7 page to its video info dict by following the
        chain: page -> videoref XML -> per-language XML with the hd URL."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        # NOTE(review): 'date' is passed through in whatever format the XML
        # uses, not necessarily the unified YYYYMMDD upload_date format.
        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live URLs are handled separately (and currently return nothing --
        # see the note on extractLiveStream above)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
1285
1286
1287 class GenericIE(InfoExtractor):
1288     """Generic last-resort information extractor."""
1289
1290     _VALID_URL = r'.*'
1291     IE_NAME = u'generic'
1292
1293     def report_download_webpage(self, video_id):
1294         """Report webpage download."""
1295         if not self._downloader.params.get('test', False):
1296             self._downloader.report_warning(u'Falling back on generic information extractor.')
1297         super(GenericIE, self).report_download_webpage(video_id)
1298
1299     def report_following_redirect(self, new_url):
1300         """Report information extraction."""
1301         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1302
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Probes *url* with HEAD requests (falling back to GET when HEAD is
        rejected with 405) and returns the final URL if it differs from the
        input, or False when there is no redirect.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched while
            # probing for redirects
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    # Some servers emit unescaped spaces in Location headers
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # Drain and close the failed response before retrying
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # Same URL back means no redirect happened
        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url
1356
1357     def _real_extract(self, url):
1358         new_url = self._test_redirect(url)
1359         if new_url: return [self.url_result(new_url)]
1360
1361         video_id = url.split('/')[-1]
1362         try:
1363             webpage = self._download_webpage(url, video_id)
1364         except ValueError as err:
1365             # since this is the last-resort InfoExtractor, if
1366             # this error is thrown, it'll be thrown here
1367             self._downloader.report_error(u'Invalid URL: %s' % url)
1368             return
1369
1370         self.report_extraction(video_id)
1371         # Start with something easy: JW Player in SWFObject
1372         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1373         if mobj is None:
1374             # Broaden the search a little bit
1375             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1376         if mobj is None:
1377             # Broaden the search a little bit: JWPlayer JS loader
1378             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1379         if mobj is None:
1380             self._downloader.report_error(u'Invalid URL: %s' % url)
1381             return
1382
1383         # It's possible that one of the regexes
1384         # matched, but returned an empty group:
1385         if mobj.group(1) is None:
1386             self._downloader.report_error(u'Invalid URL: %s' % url)
1387             return
1388
1389         video_url = compat_urllib_parse.unquote(mobj.group(1))
1390         video_id = os.path.basename(video_url)
1391
1392         # here's a fun little line of code for you:
1393         video_extension = os.path.splitext(video_id)[1][1:]
1394         video_id = os.path.splitext(video_id)[0]
1395
1396         # it's tempting to parse this further, but you would
1397         # have to take into account all the variations like
1398         #   Video Title - Site Name
1399         #   Site Name | Video Title
1400         #   Video Title - Tagline | Site Name
1401         # and so on and so forth; it's just not practical
1402         mobj = re.search(r'<title>(.*)</title>', webpage)
1403         if mobj is None:
1404             self._downloader.report_error(u'unable to extract title')
1405             return
1406         video_title = mobj.group(1)
1407
1408         # video uploader is domain name
1409         mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1410         if mobj is None:
1411             self._downloader.report_error(u'unable to extract title')
1412             return
1413         video_uploader = mobj.group(1)
1414
1415         return [{
1416             'id':       video_id,
1417             'url':      video_url,
1418             'uploader': video_uploader,
1419             'upload_date':  None,
1420             'title':    video_title,
1421             'ext':      video_extension,
1422         }]
1423
1424
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Supports 'ytsearch:Q' (first result), 'ytsearchN:Q' (first N results)
    and 'ytsearchall:Q' (up to _max_youtube_results results).
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Bug fix: split only on the first ':' so that search terms which
        # themselves contain a colon no longer raise a ValueError here.
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # Bug fix: the result of this call used to be discarded
            # (missing return), so 'ytsearchall:' queries produced nothing.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Page through the GData API, 50 results per request.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never request beyond it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
1495
1496
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearch' prefix and kick off the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> site maximum.
        if prefix == '':
            self._download_n_results(query, 1)
            return
        if prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        try:
            n = int(prefix)
            if n <= 0:
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                return
            if n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query."""
        found_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Collect previously unseen video identifiers from this page.
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate in found_ids:
                    continue
                found_ids.append(candidate)
                if len(found_ids) == n:
                    # Requested number of videos reached -- hand them off.
                    for vid in found_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                    return

            # No "next page" link means the results are exhausted.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in found_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum += 1
1574
1575
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # Flagged as broken; kept for reference but skipped by default.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearch' prefix (count or 'all') and download."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Strip the 8-character 'yvsearch' prefix; what remains of the
        # first part is the optional result count ('' / 'all' / digits).
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        # Walk result pages until n ids are collected or no 'Next' link.
        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            # No further pages: download whatever was collected.
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
1657
1658
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex: matches playlist/course/artist pages with a
    # p=/a=/list= query parameter, '/p/<id>' paths, or a bare PL/EC/UU id.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is written verbosely.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Return a playlist result with entries sorted by playlist position."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        # group(1): id captured from a page URL; group(2): bare PL/EC/UU id.
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            # GData paging uses 1-based start-index.
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep (position, url) pairs so the final list can be sorted
            # into playlist order below; entries without 'content'
            # (e.g. deleted videos) are skipped.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # A short page is the last page.
            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # Sort by position, then drop the position component.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
1737
1738
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    The first page is fetched as plain HTML; subsequent pages come from
    the JSON-based channel_ajax endpoint.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the unique video ids found in an HTML fragment, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Return a playlist of every video id found on the channel pages."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        # ('yt-uix-load-more' on the first page signals that more pages exist).
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                # Ajax responses are JSON; the video list lives in
                # 'content_html' and the paging widget in 'load_more_widget_html'.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop when the widget no longer offers a "load more" control.
                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
1811
1812
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Return a playlist of all uploads for the user named in *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # The GData API caps results per request at _GDATA_PAGE_SIZE, so we
        # page through (1-based start-index) until a response comes back
        # less than full -- that page is necessarily the last one.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Pull the video identifiers out of this page, de-duplicated
            # within the page, preserving first-seen order.
            page_ids = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                vid = match.group(1)
                if vid not in page_ids:
                    page_ids.append(vid)
            video_ids.extend(page_ids)

            # A partially filled page means nothing further to fetch.
            if len(page_ids) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
1880
1881
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Return a playlist of all videos belonging to the given user."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric user id is embedded in the profile page; without it
        # the episode-list URL cannot be built.  Bug fix: a page lacking
        # the attribute previously raised an uncaught AttributeError
        # (mobj.group on None) because only network errors were caught.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract user id from webpage: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers (href paths), HTML-unescaped.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
1957
1958
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Extract the direct download URL for a depositfiles.com file.

        NOTE(review): this code applies regexes and .decode('utf-8') to the
        raw urlopen().read() payload -- presumably Python 2 semantics
        (str == bytes); verify before touching under Python 3.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        # (POSTing gateway_result=1 simulates the button press).
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction text (whitespace-collapsed).
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id':       file_id.decode('utf-8'),
            'url':      file_url.decode('utf-8'),
            'uploader': None,
            'upload_date':  None,
            'title':    file_title,
            'ext':      file_extension.decode('utf-8'),
        }]
2009
2010
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in to Facebook (best-effort) using --username/--password
        or .netrc credentials; extraction proceeds anonymously if neither
        is available or the login fails."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # Credentials are optional; warn and continue anonymously.
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form reappearing in the response means the login
            # was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Scrape the video page's SWF setup code for the video data blob
        and return title, URL (HD preferred), duration and thumbnail."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON parameter blob sits between these two literal snippets
        # of the page's player-setup JavaScript.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-quoted JSON nested inside the outer JSON.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
2108
2109
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # Matches any path on (a subdomain of) blip.tv; group 1 is the path.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the file extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        """Extract video information from a blip.tv URL.

        Returns a one-element list with the info dictionary, or None
        after reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a player page whose URL fragment carries
        # the real file id; build the canonical /a/a-<id> URL and retry.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)


        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        # Request JSON metadata instead of the HTML page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (simpler) responses to iTunes.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    # Reuse the already-open handle so the file is not fetched twice.
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                # urlh was opened in the try block above; read the JSON body now.
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                # Convert blip.tv's timestamp format to the unified YYYYMMDD.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
2206
2207
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self,url):
        """Extract the flv URL and title from a myvideo.de watch page.

        Returns a one-element list with the info dictionary, or None
        after reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Bug fix: this previously called self._download.report_error,
            # which raised AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media-server directory; the flv
        # lives in that same directory, named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2249
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    # Known bitrates, in ascending order of quality.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution (used by --list-formats output)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        # Progress message for the per-clip mediagen config download.
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        # Progress message for the episode's MRSS index download.
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        # Pretty-print the available bitrates for --list-formats.
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of a Daily Show / Colbert Report episode or clip.

        Returns a list of info dictionaries, one per part, or None after
        reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Expand :tds / :colbert style abbreviations to the newest-episode URL.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # A bare full-episodes URL means "download the newest episode";
            # the site redirects it to the concrete episode page.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Re-parse the redirected URL to learn which episode we got.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in an element without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # The MRSS index lists one <item> per part of the episode.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The <guid> looks like ...:<show>.com:...:<mediaId>.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            # The mediagen config lists one <rendition> per bitrate.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp:// URL into the equivalent progressive
            # HTTP URL on the llnwd.net CDN.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
2441
2442
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        """Announce that the player configuration is being fetched."""
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the video URL and metadata for an Escapist episode.

        Returns a one-element list with the info dictionary, or None
        after reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            page_bytes = page_handle.read()
            # Decode using the charset advertised in the Content-Type header,
            # defaulting to UTF-8.
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', page_handle.headers['Content-Type'])
            webPage = page_bytes.decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        # Metadata lives in <meta> tags; the player config URL is embedded
        # in the og:video URL's "config" query parameter.
        description = unescapeHTML(
            re.search('<meta name="description" content="([^"]*)"', webPage).group(1))
        imgUrl = unescapeHTML(
            re.search('<meta property="og:image" content="([^"]*)"', webPage).group(1))
        playerUrl = unescapeHTML(
            re.search('<meta property="og:video" content="([^"]*)"', webPage).group(1))
        configUrl = compat_urllib_parse.unquote(
            re.search('config=(.*)$', playerUrl).group(1))

        self.report_config_download(showName)
        try:
            config_handle = compat_urllib_request.urlopen(configUrl)
            charset_m = re.match(r'text/html; charset="?([^"]+)"?', config_handle.headers['Content-Type'])
            configJSON = config_handle.read().decode(charset_m.group(1) if charset_m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # The config is JavaScript, not strict JSON: single quotes must be
        # rewritten before json.loads will accept it.
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The actual media URL is the second playlist entry.
        videoUrl = config['playlist'][1]['url']

        return [{
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }]
2513
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # Marked broken; kept for reference until the site extraction is fixed.
    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        """Extract an f4f fragment URL from the site's f4m manifest.

        Returns a one-element list with the info dictionary, or None
        after reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        # Filled in incrementally as metadata becomes available below.
        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        # hdcore parameter is required to get the HDS manifest served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        # The f4m manifest (Adobe namespace) supplies the media node id and
        # the canonical video id used to build the fragment URL.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # NOTE: rebinds video_id to the manifest's id from here on.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
2580
2581
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Extract flv URL, title and thumbnail from an xvideos page.

        Returns a one-element list with the info dictionary, or None
        after reporting an error via the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # The flash player receives the (percent-encoded) media URL via
        # the flv_url query parameter.
        url_mobj = re.search(r'flv_url=(.+?)&', webpage)
        if url_mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(url_mobj.group(1))

        # Title comes from the page <title>, minus the site suffix.
        title_mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return

        # Thumbnail is referenced by a predictable image-server URL.
        thumb_mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': title_mobj.group(1),
            'ext': 'flv',
            'thumbnail': thumb_mobj.group(0),
            'description': None,
        }]
2635
2636
class SoundcloudIE(InfoExtractor):
    """Information extractor for single soundcloud.com tracks.

    Resolves the page URL to a track id through the public API, then
    queries the stream-definition endpoint for the 128 kbps MP3 URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The URL path is <uploader>/<slug of the track title>.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the canonical page URL into API track metadata.
        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_req = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream definitions to get the MP3 URL.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            streams_req = compat_urllib_request.Request(streams_url)
            stream_json = compat_urllib_request.urlopen(streams_req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)

        return [{
            'id':       info['id'],
            'url':      streams['http_mp3_128_url'],
            'uploader': info['user']['username'],
            'upload_date': unified_strdate(info['created_at']),
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
2703
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets (playlists).

    Resolves the set URL through the public API, then fetches the
    stream definitions for each track in the set.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The URL path is <uploader>/sets/<slug of the set title>.
        uploader = mobj.group(1)
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set page URL into API metadata (includes all tracks).
        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        try:
            resolve_req = compat_urllib_request.Request(resolv_url)
            info_json = compat_urllib_request.urlopen(resolve_req).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        videos = []
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            # Fetch the per-track stream definitions to get the MP3 URL.
            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            try:
                streams_req = compat_urllib_request.Request(streams_url)
                stream_json = compat_urllib_request.urlopen(streams_req).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            videos.append({
                'id':       video_id,
                'url':      streams['http_mp3_128_url'],
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
2777
2778
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Extract the rtmpe stream URL and metadata from an InfoQ page."""
        if re.match(self._VALID_URL, url) is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64-encoded inside an inline script.
        m = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(m.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title lives in a JS assignment on the page.
        m = re.search(r'contentTitle = "(.*?)";', webpage)
        if m is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = m.group(1)

        # Description is optional; fall back to a placeholder.
        m = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        video_description = m.group(1) if m is not None else u'No description available.'

        # The id and extension both come from the last path component.
        video_id, extension = video_url.split('/')[-1].split('.')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }]
2828
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link; try the next candidate.
                pass

        return None

    def _print_formats(self, formats):
        """List every available format/bitrate combination on screen."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # NOTE: match groups are already text; the previous .decode('utf-8')
        # calls raised AttributeError on Python 3 (str has no decode()).
        uploader = mobj.group(1)
        file_id = uploader + "-" + mobj.group(2)

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Try each advertised format until one of its urls responds.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else format_param,
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url,
        }]
2936
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Handle three URL shapes: a single video page, a course page, or
        the site root.  Course and root pages are treated as playlists whose
        'reference' entries are expanded recursively via self.extract()."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                # The per-video XML provides the title and the video file name.
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            if m:
                info['title'] = unescapeHTML(m.group(1))
            else:
                # Fall back to the course id when the page has no <h1>.
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            if m:
                info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry, extracted below.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return

            info['title'] = info['id']

            # Each CoursePage link becomes a reference entry, extracted below.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
3040
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # _download_webpage already returns decoded text, so the previous
        # .decode('iso-8859-1') calls on the match groups raised
        # AttributeError on Python 3 (str has no decode()).
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            # e.g. 'mp4-640x360_800' (renamed from 'format' to avoid
            # shadowing the builtin).
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]
3116
3117
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (videos are served as segments)."""
    _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        # Session id: current time in milliseconds followed by two random
        # numbers, concatenated into one decimal string.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character alphabet using the
        # server-provided seed and a linear congruential recurrence.
        # Returns the shuffled alphabet as a list of characters.
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed  =  (seed * 211 + 30031 ) % 65536
            index  =  math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated index list into the real file id by
        # looking each index up in the seed-shuffled alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title =  config['data'][0]['title']
            seed = config['data'][0]['seed']

            # Map the user-requested format onto Youku's stream names and
            # file extensions ('hd2' preferred for 'best' when available).
            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'


            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # One info dict per segment; segments share the same title.
            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
3219
3220
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&amp;'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&amp;'

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from an xnxx page."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group(1)

        self.report_download_webpage(video_id)

        # Fetch and decode the page in one step.
        try:
            webpage = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        match = re.search(self.VIDEO_URL_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(match.group(1))

        match = re.search(self.VIDEO_TITLE_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = match.group(1)

        match = re.search(self.VIDEO_THUMB_RE, webpage)
        if match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = match.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
3275
3276
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report entry uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report entry title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Previously fell through and crashed on mobj.group(1).
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)


        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            # Previously continued with an empty list and crashed below.
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')


        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': uploader,
            'upload_date':  upload_date,
            'title':    video_title,
            'ext':      video_extension,
        }]
3397
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # First group of rexp in the page (HTML-unescaped), or default
            # when the pattern does not match.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # The page carries a free-text date; normalize it to YYYYMMDD and
        # store it under the standard 'upload_date' key (the previous
        # 'uploader_date' key was a typo and was ignored downstream).
        date_str = _findProp(r'<b>Date:</b> (.*?)</div>')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'upload_date': unified_strdate(date_str) if date_str else None,
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
3433
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            # Return an empty page instead of None so the caller's tuple
            # unpacking does not raise a TypeError on top of the error.
            return (0, [])

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return (0, [])
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # 'start_time' begins with YYYY-MM-DD; strip the dashes to
                # get the unified YYYYMMDD upload_date format.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            # Channel URL: page through the whole archive.
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
3516
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Raising (instead of only reporting) prevents the AttributeError
            # that m.group() below would otherwise cause on a failed match.
            raise ExtractorError(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            # Fall back to the page <title> before giving up.
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
3555
class SteamIE(InfoExtractor):
    """Information extractor for game trailers on store.steampowered.com.

    A game page may host several movies; each becomes an entry of the
    returned playlist.
    """
    _VALID_URL = r"""http://store.steampowered.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL requires the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Pass the age gate with a fixed (old) birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                # Skip this movie instead of queuing an entry with an empty
                # URL, which would only fail later at download time.
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
                continue
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb,
            })
        return [self.playlist_result(videos, gameID, game_title)]
3600
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Media files live on the CDN under a predictable path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title and channel id are embedded in data-* attributes on the page.
        title = re.search(r'data-title="(?P<title>.+)"', webpage).group('title')
        uploader = re.search(
            r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',
            webpage).group('uploader')
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }]
3622
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Derive the extension from the matched media URL.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            # Fall back to a generic, timestamped title.
            # (Fixed typo: was 'World Start Hip Hop'.)
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results
3678
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        # The show metadata is embedded as a JSON blob in a <script> tag.
        json_match = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not json_match:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(json_match.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbps stream from the CDN.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]
        host = data.get('host', {})
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }]
3713
3714
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s'  % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry of `formats` whose format id matches, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The age gate is passed via a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Bug fix: the original tested `result is None`, but `result`
            # still holds the (non-None) download-list match here, so a
            # missing format silently returned [None].
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
3832
3833
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        # The title is taken straight from the URL path.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Bug fix: this message wrongly said 'video title'.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
3875
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # The title lives in the page's <title> element.
        title_match = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if title_match is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = title_match.group('title').strip()

        # The actual player sits on a separate embed page.
        embed_match = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if embed_match is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')
        embed_page_url = embed_match.group(0).strip()
        video_id = embed_match.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flash player receives the stream URL via addVariable("file", ...).
        source_match = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if source_match is None:
            raise ExtractorError(u'ERROR: unable to extract video url')

        return [{
            'id': video_id,
            'url': source_match.group('source'),
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }]
3921
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes; yields every track."""
    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix description is embedded in the page as a JSON object.
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        data = json.loads(mix_match.group(1))

        # A random session id lets us walk the playlist via the "play" API.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        next_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        entries = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            # The "next" endpoint continues from the track just fetched.
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
3965
class KeekIE(InfoExtractor):
    """Information extractor for keek.com clips."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')
        # Video and thumbnail live on the CDN under predictable paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(title_match.group('title'))
        uploader_match = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(uploader_match.group('uploader'))
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
3989
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because the multi-line _VALID_URL needs re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: a single talk or a whole playlist.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Talk ids/slugs and their display names are scraped separately
        # and paired up positionally below.
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each talk is delegated back to this IE through url_result.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The talk details JS blob carries the numeric id and media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
4067
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos (metadata via XML API)."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # Bug fix: this branch referenced the undefined name `ext`,
            # which raised NameError; the intended fallback is `extension`.
            format = extension
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4123
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # Stream details come from a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last variant listed in the XML (matches previous behavior).
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
4156
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com videos."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # Raising (instead of only reporting) prevents the AttributeError
            # that m.group() below would otherwise cause on a failed match.
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
4205
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek and mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # Prefer a numeric documentId query parameter over the URL path segment.
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1) if numid else m.group('video_id')

        # Title and the available media streams are scraped from the page.
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max((s for s in streams if int(s["media_type"]) == 0),
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
4245
class TumblrIE(InfoExtractor):
    """Information extractor for videos embedded in Tumblr posts."""
    # Dots in the host name are escaped so that '.' only matches a literal
    # dot (previously any character would have matched there).
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Extract video URL, title, thumbnail and extension from a post page."""
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonical post URL; works for both /post/ and /video/ forms.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is embedded with escaped quotes (\x22 == '"');
        # host-name dots are escaped so they match literally.
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            self.to_screen("No video found")
            return []
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
4279
4280
def gen_extractors():
    """Return a list with an instance of every supported extractor.

    Order matters: the first extractor whose pattern matches handles the
    URL, so more specific extractors come first and GenericIE stays last.
    """
    ie_classes = [
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudSetIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        YouJizzIE,
        PornotubeIE,
        YouPornIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        WorldStarHipHopIE,
        JustinTVIE,
        FunnyOrDieIE,
        SteamIE,
        UstreamIE,
        RBMARadioIE,
        EightTracksIE,
        KeekIE,
        TEDIE,
        MySpassIE,
        SpiegelIE,
        LiveLeakIE,
        ARDIE,
        TumblrIE,
        GenericIE,
    ]
    return [klass() for klass in ie_classes]
4337
def get_info_extractor(ie_name):
    """Return the info extractor class named ``ie_name`` + 'IE'.

    Raises KeyError if no such class exists in this module.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]