1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # License: Public domain code
5 import htmlentitydefs
6 import httplib
7 import math
8 import netrc
9 import os
10 import os.path
11 import re
12 import socket
13 import string
14 import sys
15 import time
16 import urllib
17 import urllib2
18
19 std_headers = { 
20         'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
21         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
22         'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
23         'Accept-Language': 'en-us,en;q=0.5',
24 }
25
26 simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
27
28 class DownloadError(Exception):
29         """Download Error exception.
30         
31         This exception may be thrown by FileDownloader objects if they are not
32         configured to continue on errors. They will contain the appropriate
33         error message.
34         """
35         pass
36
37 class SameFileError(Exception):
38         """Same File exception.
39
40         This exception will be thrown by FileDownloader objects if they detect
41         that multiple videos would have to be downloaded to the same file on disk.
42         """
43         pass
44
45 class FileDownloader(object):
46         """File Downloader class.
47
48         File downloader objects are the ones responsible for downloading the
49         actual video file and writing it to disk if the user has requested
50         it, among some other tasks. In most cases there should be one per
51         program. Given a video URL, the downloader doesn't know by itself how
52         to extract all the needed information (that is the task of the
53         InfoExtractors), so it has to pass the URL to one of them.
54
55         For this, file downloader objects have a method that allows
56         InfoExtractors to be registered in a given order. When it is passed
57         a URL, the file downloader hands it to the first InfoExtractor it
58         finds that reports being able to handle it. The InfoExtractor returns
59         all the information to the FileDownloader and the latter downloads the
60         file or does whatever it's instructed to do.
61
62         File downloaders accept a lot of parameters. In order not to saturate
63         the object constructor with arguments, it receives a dictionary of
64         options instead. These options are available through the get_params()
65         method for the InfoExtractors to use. The FileDownloader also registers
66         itself as the downloader in charge of the InfoExtractors that are
67         added to it, so this is a "mutual registration".
68
69         Available options:
70
71         username:       Username for authentication purposes.
72         password:       Password for authentication purposes.
73         usenetrc:       Use netrc for authentication instead.
74         quiet:          Do not print messages to stdout.
75         forceurl:       Force printing final URL.
76         forcetitle:     Force printing title.
77         simulate:       Do not download the video files.
78         format:         Video format code.
79         outtmpl:        Template for output names.
80         ignoreerrors:   Do not stop on download errors.
81         ratelimit:      Download speed limit, in bytes/sec.
82         """
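        # Illustrative usage sketch (not part of the original flow; it just mirrors
        # what the __main__ section at the bottom does, with a made-up video URL):
        #
        #       fd = FileDownloader({
        #               'outtmpl':      u'%(id)s.%(ext)s',
        #               'quiet':        False,
        #               'ignoreerrors': False,
        #               })
        #       fd.add_info_extractor(YoutubeIE())
        #       fd.download(['http://www.youtube.com/watch?v=FOOBARBAZ00'])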
83
84         _params = None
85         _ies = []
86
87         def __init__(self, params):
88                 """Create a FileDownloader object with the given options."""
89                 self._ies = []
90                 self.set_params(params)
91         
92         @staticmethod
93         def pmkdir(filename):
94                 """Create directory components in filename. Similar to Unix "mkdir -p"."""
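                # e.g. on Unix, pmkdir('clips/music/video.flv') creates 'clips' and
                # then 'clips/music' if they do not exist; the last component is
                # treated as the file name and is not created.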
95                 components = filename.split(os.sep)
96                 aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
97                 aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
98                 for dir in aggregate:
99                         if not os.path.exists(dir):
100                                 os.mkdir(dir)
101         
102         @staticmethod
103         def format_bytes(bytes):
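                # Render a byte count with a power-of-1024 suffix, e.g.
                # format_bytes(2048) -> '2.00k' and format_bytes(3000000) -> '2.86M'.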
104                 if bytes is None:
105                         return 'N/A'
106                 if bytes == 0:
107                         exponent = 0
108                 else:
109                         exponent = long(math.log(float(bytes), 1024.0))
110                 suffix = 'bkMGTPEZY'[exponent]
111                 converted = float(bytes) / float(1024**exponent)
112                 return '%.2f%s' % (converted, suffix)
113
114         @staticmethod
115         def calc_percent(byte_counter, data_len):
116                 if data_len is None:
117                         return '---.-%'
118                 return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))
119
120         @staticmethod
121         def calc_eta(start, now, total, current):
122                 if total is None:
123                         return '--:--'
124                 dif = now - start
125                 if current == 0 or dif < 0.001: # One millisecond
126                         return '--:--'
127                 rate = float(current) / dif
128                 eta = long((float(total) - float(current)) / rate)
129                 (eta_mins, eta_secs) = divmod(eta, 60)
130                 if eta_mins > 99:
131                         return '--:--'
132                 return '%02d:%02d' % (eta_mins, eta_secs)
133
134         @staticmethod
135         def calc_speed(start, now, bytes):
136                 dif = now - start
137                 if bytes == 0 or dif < 0.001: # One millisecond
138                         return '%10s' % '---b/s'
139                 return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))
140
141         @staticmethod
142         def best_block_size(elapsed_time, bytes):
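                # Choose the next read size so that one read takes roughly a second
                # at the measured rate, clamped between half and double the previous
                # block size and capped at 4 MB.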
143                 new_min = max(bytes / 2.0, 1.0)
144                 new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
145                 if elapsed_time < 0.001:
146                         return int(new_max)
147                 rate = bytes / elapsed_time
148                 if rate > new_max:
149                         return int(new_max)
150                 if rate < new_min:
151                         return int(new_min)
152                 return int(rate)
153
154         @staticmethod
155         def parse_bytes(bytestr):
156                 """Parse a string indicating a byte quantity into a long integer."""
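                # e.g. parse_bytes('50k') -> 51200L and parse_bytes('2.5m') -> 2621440L;
                # returns None if the string cannot be parsed.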
157                 matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
158                 if matchobj is None:
159                         return None
160                 number = float(matchobj.group(1))
161                 multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
162                 return long(round(number * multiplier))
163
164         def set_params(self, params):
165                 """Sets parameters."""
166                 if type(params) != dict:
167                         raise ValueError('params: dictionary expected')
168                 self._params = params
169         
170         def get_params(self):
171                 """Get parameters."""
172                 return self._params
173
174         def add_info_extractor(self, ie):
175                 """Add an InfoExtractor object to the end of the list."""
176                 self._ies.append(ie)
177                 ie.set_downloader(self)
178         
179         def to_stdout(self, message, skip_eol=False):
180                 """Print message to stdout if not in quiet mode."""
181                 if not self._params.get('quiet', False):
182                         if skip_eol:
183                                 print message,
184                         else:
185                                 print message
186                         sys.stdout.flush()
187         
188         def to_stderr(self, message):
189                 """Print message to stderr."""
190                 print >>sys.stderr, message
191         
192         def fixed_template(self):
193                 """Checks if the output template is fixed."""
194                 return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)
195
196         def trouble(self, message=None):
197                 """Determine action to take when a download problem appears.
198
199                 Depending on whether the downloader has been configured to ignore
200                 download errors or not, this method may raise an exception when
201                 errors are found, after printing the message. If it
202                 doesn't raise, it returns an error code suitable to be returned
203                 later as a program exit code to indicate error.
204                 """
205                 if message is not None:
206                         self.to_stderr(message)
207                 if not self._params.get('ignoreerrors', False):
208                         raise DownloadError(message)
209                 return 1
210
211         def slow_down(self, start_time, byte_counter):
212                 """Sleep if the download speed is over the rate limit."""
213                 rate_limit = self._params.get('ratelimit', None)
214                 if rate_limit is None or byte_counter == 0:
215                         return
216                 now = time.time()
217                 elapsed = now - start_time
218                 if elapsed <= 0.0:
219                         return
220                 speed = float(byte_counter) / elapsed
221                 if speed > rate_limit:
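                        # Sleep just long enough for the average speed since the start
                        # of the download to fall back to the rate limit.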
222                         time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
223
224         def report_destination(self, filename):
225                 """Report destination filename."""
226                 self.to_stdout(u'[download] Destination: %s' % filename)
227         
228         def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
229                 """Report download progress."""
230                 self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
231                                 (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)
232         
233         def report_finish(self):
234                 """Report download finished."""
235                 self.to_stdout(u'')
236
237         def download(self, url_list):
238                 """Download a given list of URLs."""
239                 retcode = 0
240                 if len(url_list) > 1 and self.fixed_template():
241                         raise SameFileError(self._params['outtmpl'])
242
243                 for url in url_list:
244                         suitable_found = False
245                         for ie in self._ies:
246                                 if not ie.suitable(url):
247                                         continue
248                                 # Suitable InfoExtractor found
249                                 suitable_found = True
250                                 all_results = ie.extract(url)
251                                 results = [x for x in all_results if x is not None]
252                                 if len(results) != len(all_results):
253                                         retcode = self.trouble()
254
255                                 if len(results) > 1 and self.fixed_template():
256                                         raise SameFileError(self._params['outtmpl'])
257
258                                 for result in results:
259                                         # Forced printings
260                                         if self._params.get('forcetitle', False):
261                                                 print result['title']
262                                         if self._params.get('forceurl', False):
263                                                 print result['url']
264                                                 
265                                         # Do nothing else if in simulate mode
266                                         if self._params.get('simulate', False):
267                                                 continue
268
269                                         try:
270                                                 filename = self._params['outtmpl'] % result
271                                                 self.report_destination(filename)
272                                         except (ValueError, KeyError), err:
273                                                 retcode = self.trouble('ERROR: invalid output template: %s' % str(err))
274                                                 continue
275                                         try:
276                                                 self.pmkdir(filename)
277                                         except (OSError, IOError), err:
278                                                 retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
279                                                 continue
280                                         try:
281                                                 outstream = open(filename, 'wb')
282                                         except (OSError, IOError), err:
283                                                 retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
284                                                 continue
285                                         try:
286                                                 self._do_download(outstream, result['url'])
287                                                 outstream.close()
288                                         except (OSError, IOError), err:
289                                                 retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
290                                                 continue
291                                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
292                                                 retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
293                                                 continue
294                                 break
295                         if not suitable_found:
296                                 retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)
297
298                 return retcode
299         
300         def _do_download(self, stream, url):
301                 request = urllib2.Request(url, None, std_headers)
302                 data = urllib2.urlopen(request)
303                 data_len = data.info().get('Content-length', None)
304                 data_len_str = self.format_bytes(data_len)
305                 byte_counter = 0
306                 block_size = 1024
307                 start = time.time()
308                 while True:
309                         # Progress message
310                         percent_str = self.calc_percent(byte_counter, data_len)
311                         eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
312                         speed_str = self.calc_speed(start, time.time(), byte_counter)
313                         self.report_progress(percent_str, data_len_str, speed_str, eta_str)
314
315                         # Download and write
316                         before = time.time()
317                         data_block = data.read(block_size)
318                         after = time.time()
319                         data_block_len = len(data_block)
320                         if data_block_len == 0:
321                                 break
322                         byte_counter += data_block_len
323                         stream.write(data_block)
324                         block_size = self.best_block_size(after - before, data_block_len)
325
326                         # Apply rate limit
327                         self.slow_down(start, byte_counter)
328
329                 self.report_finish()
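                # Content-length comes straight from the HTTP headers as a string,
                # so the byte counter is compared against it in string form.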
330                 if data_len is not None and str(byte_counter) != data_len:
331                         raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))
332
333 class InfoExtractor(object):
334         """Information Extractor class.
335
336         Information extractors are the classes that, given a URL, extract
337         information from the video (or videos) the URL refers to. This
338         information includes the real video URL, the video title and simplified
339         title, author and others. It is returned in a list of dictionaries when
340         calling its extract() method. It is a list because a URL can refer to
341         more than one video (think of playlists). The dictionaries must include
342         the following fields:
343
344         id:             Video identifier.
345         url:            Final video URL.
346         uploader:       Nickname of the video uploader.
347         title:          Literal title.
348         stitle:         Simplified title.
349         ext:            Video filename extension.
350
351         Subclasses of this one should re-define the _real_initialize() and
352         _real_extract() methods, as well as the suitable() static method.
353         They should probably also be instantiated and added to the main
354         downloader.
355         """
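        # Minimal subclass sketch (illustrative only; names and URLs are made up)
        # showing the contract described above:
        #
        #       class ExampleIE(InfoExtractor):
        #               @staticmethod
        #               def suitable(url):
        #                       return url.startswith('http://video.example.com/')
        #
        #               def _real_extract(self, url):
        #                       return [{
        #                               'id':           u'12345',
        #                               'url':          u'http://video.example.com/12345.flv',
        #                               'uploader':     u'someone',
        #                               'title':        u'An example video',
        #                               'stitle':       u'An_example_video',
        #                               'ext':          u'flv',
        #                               }]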
356
357         _ready = False
358         _downloader = None
359
360         def __init__(self, downloader=None):
361                 """Constructor. Receives an optional downloader."""
362                 self._ready = False
363                 self.set_downloader(downloader)
364
365         @staticmethod
366         def suitable(url):
367                 """Receives a URL and returns True if suitable for this IE."""
368                 return False
369
370         def initialize(self):
371                 """Initializes an instance (authentication, etc)."""
372                 if not self._ready:
373                         self._real_initialize()
374                         self._ready = True
375
376         def extract(self, url):
377                 """Extracts URL information and returns it in list of dicts."""
378                 self.initialize()
379                 return self._real_extract(url)
380
381         def set_downloader(self, downloader):
382                 """Sets the downloader for this IE."""
383                 self._downloader = downloader
384         
385         def to_stdout(self, message):
386                 """Print message to stdout if downloader is not in quiet mode."""
387                 if self._downloader is None or not self._downloader.get_params().get('quiet', False):
388                         print message
389         
390         def to_stderr(self, message):
391                 """Print message to stderr."""
392                 print >>sys.stderr, message
393
394         def _real_initialize(self):
395                 """Real initialization process. Redefine in subclasses."""
396                 pass
397
398         def _real_extract(self, url):
399                 """Real extraction process. Redefine in subclasses."""
400                 pass
401
402 class YoutubeIE(InfoExtractor):
403         """Information extractor for youtube.com."""
404
405         _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
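        # Group 1 is the optional scheme/host/path prefix (watch page or /v/ URL)
        # and group 2 is the video id; the conditional (?(1).+)? allows trailing
        # characters (extra query parameters) only when a full URL, not a bare id,
        # was given.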
406         _LOGIN_URL = 'http://www.youtube.com/login?next=/'
407         _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
408         _NETRC_MACHINE = 'youtube'
409
410         @staticmethod
411         def suitable(url):
412                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
413
414         def report_login(self):
415                 """Report attempt to log in."""
416                 self.to_stdout(u'[youtube] Logging in')
417         
418         def report_age_confirmation(self):
419                 """Report attempt to confirm age."""
420                 self.to_stdout(u'[youtube] Confirming age')
421         
422         def report_webpage_download(self, video_id):
423                 """Report attempt to download webpage."""
424                 self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)
425         
426         def report_information_extraction(self, video_id):
427                 """Report attempt to extract video information."""
428                 self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
429         
430         def report_video_url(self, video_id, video_real_url):
431                 """Report extracted video URL."""
432                 self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))
433
434         def _real_initialize(self):
435                 if self._downloader is None:
436                         return
437
438                 username = None
439                 password = None
440                 downloader_params = self._downloader.get_params()
441
442                 # Attempt to use provided username and password or .netrc data
443                 if downloader_params.get('username', None) is not None:
444                         username = downloader_params['username']
445                         password = downloader_params['password']
446                 elif downloader_params.get('usenetrc', False):
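                        # Expects a ~/.netrc entry of the form:
                        #   machine youtube login <username> password <password>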
447                         try:
448                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
449                                 if info is not None:
450                                         username = info[0]
451                                         password = info[2]
452                                 else:
453                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
454                         except (IOError, netrc.NetrcParseError), err:
455                                 self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
456                                 return
457
458                 # No authentication to be performed
459                 if username is None:
460                         return
461
462                 # Log in
463                 login_form = {
464                                 'current_form': 'loginForm',
465                                 'next':         '/',
466                                 'action_login': 'Log In',
467                                 'username':     username,
468                                 'password':     password,
469                                 }
470                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
471                 try:
472                         self.report_login()
473                         login_results = urllib2.urlopen(request).read()
474                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
475                                 self.to_stderr(u'WARNING: unable to log in: bad username or password')
476                                 return
477                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
478                         self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
479                         return
480         
481                 # Confirm age
482                 age_form = {
483                                 'next_url':             '/',
484                                 'action_confirm':       'Confirm',
485                                 }
486                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
487                 try:
488                         self.report_age_confirmation()
489                         age_results = urllib2.urlopen(request).read()
490                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
491                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
492                         return
493
494         def _real_extract(self, url):
495                 # Extract video id from URL
496                 mobj = re.match(self._VALID_URL, url)
497                 if mobj is None:
498                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
499                         return [None]
500                 video_id = mobj.group(2)
501
502                 # Downloader parameters
503                 format_param = None
504                 if self._downloader is not None:
505                         params = self._downloader.get_params()
506                         format_param = params.get('format', None)
507
508                 # Extension
509                 video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
510
511                 # Normalize URL, including format
512                 normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
513                 if format_param is not None:
514                         normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
515                 request = urllib2.Request(normalized_url, None, std_headers)
516                 try:
517                         self.report_webpage_download(video_id)
518                         video_webpage = urllib2.urlopen(request).read()
519                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
520                         self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
521                         return [None]
522                 self.report_information_extraction(video_id)
523                 
524                 # "t" param
525                 mobj = re.search(r', "t": "([^"]+)"', video_webpage)
526                 if mobj is None:
527                         self.to_stderr(u'ERROR: unable to extract "t" parameter')
528                         return [None]
529                 video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
530                 if format_param is not None:
531                         video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
532                 self.report_video_url(video_id, video_real_url)
533
534                 # uploader
535                 mobj = re.search(r'More From: ([^<]*)<', video_webpage)
536                 if mobj is None:
537                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
538                         return [None]
539                 video_uploader = mobj.group(1)
540
541                 # title
542                 mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
543                 if mobj is None:
544                         self.to_stderr(u'ERROR: unable to extract video title')
545                         return [None]
546                 video_title = mobj.group(1).decode('utf-8')
547                 video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
548                 video_title = video_title.replace(os.sep, u'%')
549
550                 # simplified title
551                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
552                 simple_title = simple_title.strip(ur'_')
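                # e.g. u'Some Video: Part #1!' becomes u'Some_Video_Part_1'.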
553
554                 # Return information
555                 return [{
556                         'id':           video_id.decode('utf-8'),
557                         'url':          video_real_url.decode('utf-8'),
558                         'uploader':     video_uploader.decode('utf-8'),
559                         'title':        video_title,
560                         'stitle':       simple_title,
561                         'ext':          video_extension.decode('utf-8'),
562                         }]
563
564 class MetacafeIE(InfoExtractor):
565         """Information Extractor for metacafe.com."""
566
567         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
568         _DISCLAIMER = 'http://www.metacafe.com/disclaimer'
569         _youtube_ie = None
570
571         def __init__(self, youtube_ie, downloader=None):
572                 InfoExtractor.__init__(self, downloader)
573                 self._youtube_ie = youtube_ie
574
575         @staticmethod
576         def suitable(url):
577                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
578
579         def report_disclaimer(self):
580                 """Report disclaimer retrieval."""
581                 self.to_stdout(u'[metacafe] Retrieving disclaimer')
582
583         def report_age_confirmation(self):
584                 """Report attempt to confirm age."""
585                 self.to_stdout(u'[metacafe] Confirming age')
586         
587         def report_download_webpage(self, video_id):
588                 """Report webpage download."""
589                 self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
590         
591         def report_extraction(self, video_id):
592                 """Report information extraction."""
593                 self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
594
595         def _real_initialize(self):
596                 # Retrieve disclaimer
597                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
598                 try:
599                         self.report_disclaimer()
600                         disclaimer = urllib2.urlopen(request).read()
601                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
602                         self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
603                         return
604
605                 # Confirm age
606                 disclaimer_form = {
607                         'allowAdultContent': '1',
608                         'submit': "Continue - I'm over 18",
609                         }
610                 request = urllib2.Request('http://www.metacafe.com/watch/', urllib.urlencode(disclaimer_form), std_headers)
611                 try:
612                         self.report_age_confirmation()
613                         disclaimer = urllib2.urlopen(request).read()
614                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
615                         self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
616                         return
617         
618         def _real_extract(self, url):
619                 # Extract id and simplified title from URL
620                 mobj = re.match(self._VALID_URL, url)
621                 if mobj is None:
622                         self.to_stderr(u'ERROR: invalid URL: %s' % url)
623                         return [None]
624
625                 video_id = mobj.group(1)
626
627                 # Check if video comes from YouTube
628                 mobj2 = re.match(r'^yt-(.*)$', video_id)
629                 if mobj2 is not None:
630                         return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
631
632                 simple_title = mobj.group(2).decode('utf-8')
633                 video_extension = 'flv'
634
635                 # Retrieve video webpage to extract further information
636                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
637                 try:
638                         self.report_download_webpage(video_id)
639                         webpage = urllib2.urlopen(request).read()
640                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
641                         self.to_stderr(u'ERROR: unable to retrieve video webpage: %s' % str(err))
642                         return [None]
643
644                 # Extract URL, uploader and title from webpage
645                 self.report_extraction(video_id)
646                 mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
647                 if mobj is None:
648                         self.to_stderr(u'ERROR: unable to extract media URL')
649                         return [None]
650                 mediaURL = mobj.group(1).replace('\\', '')
651
652                 mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
653                 if mobj is None:
654                         self.to_stderr(u'ERROR: unable to extract gdaKey')
655                         return [None]
656                 gdaKey = mobj.group(1)
657
658                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
659
660                 mobj = re.search(r'(?im)<meta name="title" content="Metacafe - ([^"]+)"', webpage)
661                 if mobj is None:
662                         self.to_stderr(u'ERROR: unable to extract title')
663                         return [None]
664                 video_title = mobj.group(1).decode('utf-8')
665
666                 mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
667                 if mobj is None:
668                         self.to_stderr(u'ERROR: unable to extract uploader nickname')
669                         return [None]
670                 video_uploader = re.sub(r'<.*?>', '', mobj.group(1))
671
672                 # Return information
673                 return [{
674                         'id':           video_id.decode('utf-8'),
675                         'url':          video_url.decode('utf-8'),
676                         'uploader':     video_uploader.decode('utf-8'),
677                         'title':        video_title,
678                         'stitle':       simple_title,
679                         'ext':          video_extension.decode('utf-8'),
680                         }]
681
682 class YoutubePlaylistIE(InfoExtractor):
683         """Information Extractor for YouTube playlists."""
684
685         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/view_play_list\?p=(.+)'
686         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s'
687         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
688         _MORE_PAGES_INDICATOR = r'class="pagerNotCurrent">Next</a>'
689         _youtube_ie = None
690
691         def __init__(self, youtube_ie, downloader=None):
692                 InfoExtractor.__init__(self, downloader)
693                 self._youtube_ie = youtube_ie
694         
695         @staticmethod
696         def suitable(url):
697                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
698
699         def report_download_page(self, playlist_id, pagenum):
700                 """Report attempt to download playlist page with given number."""
701                 self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
702
703         def _real_initialize(self):
704                 self._youtube_ie.initialize()
705         
706         def _real_extract(self, url):
707                 # Extract playlist id
708                 mobj = re.match(self._VALID_URL, url)
709                 if mobj is None:
710                         self.to_stderr(u'ERROR: invalid url: %s' % url)
711                         return [None]
712
713                 # Download playlist pages
714                 playlist_id = mobj.group(1)
715                 video_ids = []
716                 pagenum = 1
717
718                 while True:
719                         self.report_download_page(playlist_id, pagenum)
720                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
721                         try:
722                                 page = urllib2.urlopen(request).read()
723                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
724                                 self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
725                                 return [None]
726
727                         # Extract video identifiers
728                         ids_in_page = set()
729                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
730                                 ids_in_page.add(mobj.group(1))
731                         video_ids.extend(list(ids_in_page))
732
733                         if self._MORE_PAGES_INDICATOR not in page:
734                                 break
735                         pagenum = pagenum + 1
736
737                 information = []
738                 for id in video_ids:
739                         information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
740                 return information
741
742 if __name__ == '__main__':
743         try:
744                 # Modules needed only when running the main program
745                 import getpass
746                 import optparse
747
748                 # General configuration
749                 urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
750                 urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
751                 socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)
752
753                 # Parse command line
754                 parser = optparse.OptionParser(
755                                 usage='Usage: %prog [options] url...',
756                                 version='2008.07.22',
757                                 conflict_handler='resolve',
758                                 )
759                 parser.add_option('-h', '--help',
760                                 action='help', help='print this help text and exit')
761                 parser.add_option('-v', '--version',
762                                 action='version', help='print program version and exit')
763                 parser.add_option('-u', '--username',
764                                 dest='username', metavar='UN', help='account username')
765                 parser.add_option('-p', '--password',
766                                 dest='password', metavar='PW', help='account password')
767                 parser.add_option('-o', '--output',
768                                 dest='outtmpl', metavar='TPL', help='output filename template')
769                 parser.add_option('-q', '--quiet',
770                                 action='store_true', dest='quiet', help='activates quiet mode', default=False)
771                 parser.add_option('-s', '--simulate',
772                                 action='store_true', dest='simulate', help='do not download video', default=False)
773                 parser.add_option('-t', '--title',
774                                 action='store_true', dest='usetitle', help='use title in file name', default=False)
775                 parser.add_option('-l', '--literal',
776                                 action='store_true', dest='useliteral', help='use literal title in file name', default=False)
777                 parser.add_option('-n', '--netrc',
778                                 action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
779                 parser.add_option('-g', '--get-url',
780                                 action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
781                 parser.add_option('-e', '--get-title',
782                                 action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
783                 parser.add_option('-f', '--format',
784                                 dest='format', metavar='FMT', help='video format code')
785                 parser.add_option('-b', '--best-quality',
786                                 action='store_const', dest='format', help='alias for -f 18', const='18')
787                 parser.add_option('-m', '--mobile-version',
788                                 action='store_const', dest='format', help='alias for -f 17', const='17')
789                 parser.add_option('-i', '--ignore-errors',
790                                 action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
791                 parser.add_option('-r', '--rate-limit',
792                                 dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
793                 (opts, args) = parser.parse_args()
794
795                 # Conflicting, missing and erroneous options
796                 if len(args) < 1:
797                         sys.exit(u'ERROR: you must provide at least one URL')
798                 if opts.usenetrc and (opts.username is not None or opts.password is not None):
799                         sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
800                 if opts.password is not None and opts.username is None:
801                         sys.exit(u'ERROR: account username missing')
802                 if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
803                         sys.exit(u'ERROR: using output template conflicts with using title or literal title')
804                 if opts.usetitle and opts.useliteral:
805                         sys.exit(u'ERROR: using title conflicts with using literal title')
806                 if opts.username is not None and opts.password is None:
807                         opts.password = getpass.getpass(u'Type account password and press return:')
808                 if opts.ratelimit is not None:
809                         numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
810                         if numeric_limit is None:
811                                 sys.exit(u'ERROR: invalid rate limit specified')
812                         opts.ratelimit = numeric_limit
813
814                 # Information extractors
815                 youtube_ie = YoutubeIE()
816                 metacafe_ie = MetacafeIE(youtube_ie)
817                 youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
818
819                 # File downloader
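                # The output template below is selected with a chained and/or idiom:
                # the first truthy branch wins, so an explicit -o template beats -t,
                # which beats -l, with u'%(id)s.%(ext)s' as the final fallback. The
                # placeholders are filled in from the info dictionaries returned by
                # the InfoExtractors.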
820                 fd = FileDownloader({
821                         'usenetrc': opts.usenetrc,
822                         'username': opts.username,
823                         'password': opts.password,
824                         'quiet': (opts.quiet or opts.geturl or opts.gettitle),
825                         'forceurl': opts.geturl,
826                         'forcetitle': opts.gettitle,
827                         'simulate': (opts.simulate or opts.geturl or opts.gettitle),
828                         'format': opts.format,
829                         'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode())
830                                 or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
831                                 or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
832                                 or u'%(id)s.%(ext)s'),
833                         'ignoreerrors': opts.ignoreerrors,
834                         'ratelimit': opts.ratelimit,
835                         })
836                 fd.add_info_extractor(youtube_pl_ie)
837                 fd.add_info_extractor(metacafe_ie)
838                 fd.add_info_extractor(youtube_ie)
839                 retcode = fd.download(args)
840                 sys.exit(retcode)
841
842         except DownloadError:
843                 sys.exit(1)
844         except SameFileError:
845                 sys.exit(u'ERROR: fixed output name but more than one file to download')
846         except KeyboardInterrupt:
847                 sys.exit(u'\nERROR: Interrupted by user')