#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ricardo Garcia Gonzalez
# License: Public domain code
import htmlentitydefs
import httplib
import math
import netrc
import os
import os.path
import re
import socket
import string
import sys
import time
import urllib
import urllib2

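# Default HTTP headers used for every request made below; the User-Agent mimics
# a desktop Firefox browser.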
std_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.1) Gecko/2008070208 Firefox/3.0.1',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language': 'en-us,en;q=0.5',
}

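# Characters that are kept verbatim when simplifying a title; any other run of
# characters is collapsed into '_' (see YoutubeIE._real_extract).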
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')

class DownloadError(Exception):
        """Download Error exception.

        This exception may be thrown by FileDownloader objects if they are not
        configured to continue on errors. They will contain the appropriate
        error message.
        """
        pass

class SameFileError(Exception):
        """Same File exception.

        This exception will be thrown by FileDownloader objects if they detect
        multiple files would have to be downloaded to the same file on disk.
        """
        pass

class FileDownloader(object):
        """File Downloader class.

        File downloader objects are responsible for downloading the actual
        video file and writing it to disk if the user has requested it, among
        some other tasks. In most cases there should be one per program. Since,
        given a video URL, the downloader doesn't know how to extract all the
        needed information (that is the task of the InfoExtractors), it has to
        pass the URL to one of them.

        For this, file downloader objects have a method that allows
        InfoExtractors to be registered in a given order. When it is passed
        a URL, the file downloader hands it to the first InfoExtractor it
        finds that reports being able to handle it. The InfoExtractor returns
        all the information to the FileDownloader, which then downloads the
        file or does whatever else it is instructed to do.

        File downloaders accept a lot of parameters. In order not to saturate
        the object constructor with arguments, it receives a dictionary of
        options instead. These options are available through the get_params()
        method for the InfoExtractors to use. The FileDownloader also registers
        itself as the downloader in charge of the InfoExtractors that are
        added to it, so this is a "mutual registration".

        Available options:

        username:       Username for authentication purposes.
        password:       Password for authentication purposes.
        usenetrc:       Use netrc for authentication instead.
        quiet:          Do not print messages to stdout.
        forceurl:       Force printing final URL.
        forcetitle:     Force printing title.
        simulate:       Do not download the video files.
        format:         Video format code.
        outtmpl:        Template for output names.
        ignoreerrors:   Do not stop on download errors.
        ratelimit:      Download speed limit, in bytes/sec.
        """
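        # Typical wiring, mirroring the __main__ block at the bottom of this file
        # (shown here only as an illustration):
        #
        #   fd = FileDownloader({'outtmpl': u'%(id)s.%(ext)s'})
        #   fd.add_info_extractor(YoutubeIE())
        #   retcode = fd.download(['http://www.youtube.com/watch?v=...'])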

        _params = None
        _ies = []

        def __init__(self, params):
                """Create a FileDownloader object with the given options."""
                self._ies = []
                self.set_params(params)

        @staticmethod
        def pmkdir(filename):
                """Create directory components in filename. Similar to Unix "mkdir -p"."""
                components = filename.split(os.sep)
                aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
                aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
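                # e.g. 'foo/bar/video.flv' -> ['foo/', 'foo/bar/']; the last path component is the file itself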
                for dir in aggregate:
                        if not os.path.exists(dir):
                                os.mkdir(dir)

        @staticmethod
        def format_bytes(bytes):
                if bytes is None:
                        return 'N/A'
                if bytes == 0:
                        exponent = 0
                else:
                        exponent = long(math.log(float(bytes), 1024.0))
                suffix = 'bkMGTPEZY'[exponent]
                converted = float(bytes) / float(1024**exponent)
                return '%.2f%s' % (converted, suffix)

        @staticmethod
        def calc_percent(byte_counter, data_len):
                if data_len is None:
                        return '---.-%'
                return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

        @staticmethod
        def calc_eta(start, now, total, current):
                if total is None:
                        return '--:--'
                dif = now - start
                if current == 0 or dif < 0.001: # One millisecond
                        return '--:--'
                rate = float(current) / dif
                eta = long((float(total) - float(current)) / rate)
                (eta_mins, eta_secs) = divmod(eta, 60)
                if eta_mins > 99:
                        return '--:--'
                return '%02d:%02d' % (eta_mins, eta_secs)

        @staticmethod
        def calc_speed(start, now, bytes):
                dif = now - start
                if bytes == 0 or dif < 0.001: # One millisecond
                        return '%10s' % '---b/s'
                return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

        @staticmethod
        def best_block_size(elapsed_time, bytes):
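                # Adapt the next read size to the rate measured for the last block:
                # clamp it between half and double the previous block size, and
                # never exceed 4 MB per read.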
                new_min = max(bytes / 2.0, 1.0)
                new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
                if elapsed_time < 0.001:
                        return int(new_max)
                rate = bytes / elapsed_time
                if rate > new_max:
                        return int(new_max)
                if rate < new_min:
                        return int(new_min)
                return int(rate)

        @staticmethod
        def parse_bytes(bytestr):
                """Parse a string indicating a byte quantity into a long integer."""
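                # e.g. '50k' -> 51200 and '44.6m' -> 46766490; a bare number is taken as bytes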
                matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
                if matchobj is None:
                        return None
                number = float(matchobj.group(1))
                multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
                return long(round(number * multiplier))

        def set_params(self, params):
                """Sets parameters."""
                if type(params) != dict:
                        raise ValueError('params: dictionary expected')
                self._params = params

        def get_params(self):
                """Get parameters."""
                return self._params

        def add_info_extractor(self, ie):
                """Add an InfoExtractor object to the end of the list."""
                self._ies.append(ie)
                ie.set_downloader(self)

        def to_stdout(self, message, skip_eol=False):
                """Print message to stdout if not in quiet mode."""
                if not self._params.get('quiet', False):
                        print u'%s%s' % (message, [u'\n', u''][skip_eol]),
                        sys.stdout.flush()

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def fixed_template(self):
                """Checks if the output template is fixed."""
                return (re.search(ur'(?u)%\(.+?\)s', self._params['outtmpl']) is None)

        def trouble(self, message=None):
                """Determine action to take when a download problem appears.

                Depending on whether the downloader has been configured to
                ignore download errors or not, this method may throw an
                exception or not when errors are found, after printing the
                message. If it doesn't raise, it returns an error code suitable
                to be returned later as a program exit code to indicate error.
                """
                if message is not None:
                        self.to_stderr(message)
                if not self._params.get('ignoreerrors', False):
                        raise DownloadError(message)
                return 1

        def slow_down(self, start_time, byte_counter):
                """Sleep if the download speed is over the rate limit."""
                rate_limit = self._params.get('ratelimit', None)
                if rate_limit is None or byte_counter == 0:
                        return
                now = time.time()
                elapsed = now - start_time
                if elapsed <= 0.0:
                        return
                speed = float(byte_counter) / elapsed
                if speed > rate_limit:
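                        # Sleep just long enough that the average speed since start_time drops back to the limit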
                        time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

        def report_destination(self, filename):
                """Report destination filename."""
                self.to_stdout(u'[download] Destination: %s' % filename)

        def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
                """Report download progress."""
                self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
                                (percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

        def report_finish(self):
                """Report download finished."""
                self.to_stdout(u'')

        def download(self, url_list):
                """Download a given list of URLs."""
                retcode = 0
                if len(url_list) > 1 and self.fixed_template():
                        raise SameFileError(self._params['outtmpl'])

                for url in url_list:
                        suitable_found = False
                        for ie in self._ies:
                                if not ie.suitable(url):
                                        continue
                                # Suitable InfoExtractor found
                                suitable_found = True
                                all_results = ie.extract(url)
                                results = [x for x in all_results if x is not None]
                                if len(results) != len(all_results):
                                        retcode = self.trouble()

                                if len(results) > 1 and self.fixed_template():
                                        raise SameFileError(self._params['outtmpl'])

                                for result in results:
                                        # Forced printings
                                        if self._params.get('forcetitle', False):
                                                print result['title']
                                        if self._params.get('forceurl', False):
                                                print result['url']

                                        # Do nothing else if in simulate mode
                                        if self._params.get('simulate', False):
                                                continue

                                        try:
                                                filename = self._params['outtmpl'] % result
                                                self.report_destination(filename)
                                        except (ValueError, KeyError), err:
                                                retcode = self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
                                                continue
                                        try:
                                                self.pmkdir(filename)
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to create directories: %s' % str(err))
                                                continue
                                        try:
                                                outstream = open(filename, 'wb')
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to open for writing: %s' % str(err))
                                                continue
                                        try:
                                                self._do_download(outstream, result['url'])
                                                outstream.close()
                                        except (OSError, IOError), err:
                                                retcode = self.trouble('ERROR: unable to write video data: %s' % str(err))
                                                continue
                                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                                retcode = self.trouble('ERROR: unable to download video data: %s' % str(err))
                                                continue
                                break
                        if not suitable_found:
                                retcode = self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

                return retcode

        def _do_download(self, stream, url):
                request = urllib2.Request(url, None, std_headers)
                data = urllib2.urlopen(request)
                data_len = data.info().get('Content-length', None)
                data_len_str = self.format_bytes(data_len)
                byte_counter = 0
                block_size = 1024
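                # Start with small reads; the block size is re-tuned after every block
                # based on the throughput just observed (see best_block_size).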
                start = time.time()
                while True:
                        # Progress message
                        percent_str = self.calc_percent(byte_counter, data_len)
                        eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
                        speed_str = self.calc_speed(start, time.time(), byte_counter)
                        self.report_progress(percent_str, data_len_str, speed_str, eta_str)

                        # Download and write
                        before = time.time()
                        data_block = data.read(block_size)
                        after = time.time()
                        data_block_len = len(data_block)
                        if data_block_len == 0:
                                break
                        byte_counter += data_block_len
                        stream.write(data_block)
                        block_size = self.best_block_size(after - before, data_block_len)

                        # Apply rate limit
                        self.slow_down(start, byte_counter)

                self.report_finish()
                if data_len is not None and str(byte_counter) != data_len:
                        raise ValueError('Content too short: %s/%s bytes' % (byte_counter, data_len))

class InfoExtractor(object):
        """Information Extractor class.

        Information extractors are the classes that, given a URL, extract
        information from the video (or videos) the URL refers to. This
        information includes the real video URL, the video title and simplified
        title, author and others. It is returned in a list of dictionaries when
        calling its extract() method. It is a list because a URL can refer to
        more than one video (think of playlists). The dictionaries must include
        the following fields:

        id:             Video identifier.
        url:            Final video URL.
        uploader:       Nickname of the video uploader.
        title:          Literal title.
        stitle:         Simplified title.
        ext:            Video filename extension.

        Subclasses of this one should re-define the _real_initialize() and
        _real_extract() methods, as well as the suitable() static method.
        Probably, they should also be instantiated and added to the main
        downloader.
        """
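        # A minimal subclass sketch (purely illustrative, not part of this program):
        #
        #   class ExampleIE(InfoExtractor):
        #           @staticmethod
        #           def suitable(url):
        #                   return url.startswith('http://video.example.com/')
        #           def _real_initialize(self):
        #                   pass # no login or other setup needed
        #           def _real_extract(self, url):
        #                   return [{'id': u'42', 'url': u'http://video.example.com/42.flv',
        #                           'uploader': u'someone', 'title': u'A title',
        #                           'stitle': u'A_title', 'ext': u'flv'}]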

        _ready = False
        _downloader = None

        def __init__(self, downloader=None):
                """Constructor. Receives an optional downloader."""
                self._ready = False
                self.set_downloader(downloader)

        @staticmethod
        def suitable(url):
                """Receives a URL and returns True if suitable for this IE."""
                return False

        def initialize(self):
                """Initializes an instance (authentication, etc)."""
                if not self._ready:
                        self._real_initialize()
                        self._ready = True

        def extract(self, url):
                """Extracts URL information and returns it in list of dicts."""
                self.initialize()
                return self._real_extract(url)

        def set_downloader(self, downloader):
                """Sets the downloader for this IE."""
                self._downloader = downloader

        def to_stdout(self, message):
                """Print message to stdout if downloader is not in quiet mode."""
                if self._downloader is None or not self._downloader.get_params().get('quiet', False):
                        print message

        def to_stderr(self, message):
                """Print message to stderr."""
                print >>sys.stderr, message

        def _real_initialize(self):
                """Real initialization process. Redefine in subclasses."""
                pass

        def _real_extract(self, url):
                """Real extraction process. Redefine in subclasses."""
                pass

class YoutubeIE(InfoExtractor):
        """Information extractor for youtube.com."""

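        # In _VALID_URL, group 1 matches an optional youtube.com URL prefix and group 2
        # the video id; the conditional (?(1).+)? only permits trailing text when a
        # prefix was matched, so a bare video id is accepted as well.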
        _VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?\?(?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
        _LOGIN_URL = 'http://www.youtube.com/login?next=/'
        _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/'
        _NETRC_MACHINE = 'youtube'

        @staticmethod
        def suitable(url):
                return (re.match(YoutubeIE._VALID_URL, url) is not None)

        def report_login(self):
                """Report attempt to log in."""
                self.to_stdout(u'[youtube] Logging in')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[youtube] Confirming age')

        def report_webpage_download(self, video_id):
                """Report attempt to download webpage."""
                self.to_stdout(u'[youtube] %s: Downloading video webpage' % video_id)

        def report_information_extraction(self, video_id):
                """Report attempt to extract video information."""
                self.to_stdout(u'[youtube] %s: Extracting video information' % video_id)

        def report_video_url(self, video_id, video_real_url):
                """Report extracted video URL."""
                self.to_stdout(u'[youtube] %s: URL: %s' % (video_id, video_real_url))

        def _real_initialize(self):
                if self._downloader is None:
                        return

                username = None
                password = None
                downloader_params = self._downloader.get_params()

                # Attempt to use provided username and password or .netrc data
                if downloader_params.get('username', None) is not None:
                        username = downloader_params['username']
                        password = downloader_params['password']
                elif downloader_params.get('usenetrc', False):
                        try:
                                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                                if info is not None:
                                        username = info[0]
                                        password = info[2]
                                else:
                                        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
                        except (IOError, netrc.NetrcParseError), err:
                                self.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
                                return

                # No authentication to be performed
                if username is None:
                        return

                # Log in
                login_form = {
                                'current_form': 'loginForm',
                                'next':         '/',
                                'action_login': 'Log In',
                                'username':     username,
                                'password':     password,
                                }
                request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
                try:
                        self.report_login()
                        login_results = urllib2.urlopen(request).read()
                        if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                                self.to_stderr(u'WARNING: unable to log in: bad username or password')
                                return
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'WARNING: unable to log in: %s' % str(err))
                        return

                # Confirm age
                age_form = {
                                'next_url':             '/',
                                'action_confirm':       'Confirm',
                                }
                request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
                try:
                        self.report_age_confirmation()
                        age_results = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                # Extract video id from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]
                video_id = mobj.group(2)

                # Downloader parameters
                format_param = None
                if self._downloader is not None:
                        params = self._downloader.get_params()
                        format_param = params.get('format', None)

                # Extension
                video_extension = {'18': 'mp4', '17': '3gp'}.get(format_param, 'flv')
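                # fmt 18 is the MP4 stream and fmt 17 the mobile 3GP one (cf. the -b/-m
                # aliases in the option parser below); everything else is served as FLV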

                # Normalize URL, including format
                normalized_url = 'http://www.youtube.com/watch?v=%s' % video_id
                if format_param is not None:
                        normalized_url = '%s&fmt=%s' % (normalized_url, format_param)
                request = urllib2.Request(normalized_url, None, std_headers)
                try:
                        self.report_webpage_download(video_id)
                        video_webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to download video webpage: %s' % str(err))
                        return [None]
                self.report_information_extraction(video_id)

                # "t" param
                mobj = re.search(r', "t": "([^"]+)"', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract "t" parameter')
                        return [None]
                video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s' % (video_id, mobj.group(1))
                if format_param is not None:
                        video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
                self.report_video_url(video_id, video_real_url)

                # uploader
                mobj = re.search(r"var watchUsername = '([^']+)';", video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract uploader nickname')
                        return [None]
                video_uploader = mobj.group(1)

                # title
                mobj = re.search(r'(?im)<title>YouTube - ([^<]*)</title>', video_webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract video title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')
                video_title = re.sub(ur'(?u)&(.+?);', lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), video_title)
                video_title = video_title.replace(os.sep, u'%')

                # simplified title
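                # Collapse every run of characters outside A-Za-z0-9 into a single '_'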
                simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
                simple_title = simple_title.strip(ur'_')

                # Return information
                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_real_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension.decode('utf-8'),
                        }]

class MetacafeIE(InfoExtractor):
        """Information Extractor for metacafe.com."""

        _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
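        # Group 1 of _VALID_URL is the video id, group 2 the simplified title taken
        # straight from the URL path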
        _DISCLAIMER = 'http://www.metacafe.com/disclaimer'
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(MetacafeIE._VALID_URL, url) is not None)

        def report_disclaimer(self):
                """Report disclaimer retrieval."""
                self.to_stdout(u'[metacafe] Retrieving disclaimer')

        def report_age_confirmation(self):
                """Report attempt to confirm age."""
                self.to_stdout(u'[metacafe] Confirming age')

        def report_download_webpage(self, video_id):
                """Report webpage download."""
                self.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)

        def report_extraction(self, video_id):
                """Report information extraction."""
                self.to_stdout(u'[metacafe] %s: Extracting information' % video_id)

        def _real_initialize(self):
                # Retrieve disclaimer
                request = urllib2.Request(self._DISCLAIMER, None, std_headers)
                try:
                        self.report_disclaimer()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
                        return

                # Confirm age
                disclaimer_form = {
                        'allowAdultContent': '1',
                        'submit': "Continue - I'm over 18",
                        }
                request = urllib2.Request('http://www.metacafe.com/watch/', urllib.urlencode(disclaimer_form), std_headers)
                try:
                        self.report_age_confirmation()
                        disclaimer = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to confirm age: %s' % str(err))
                        return

        def _real_extract(self, url):
                # Extract id and simplified title from URL
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid URL: %s' % url)
                        return [None]

                video_id = mobj.group(1)

                # Check if video comes from YouTube
                mobj2 = re.match(r'^yt-(.*)$', video_id)
                if mobj2 is not None:
                        return self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))

                simple_title = mobj.group(2).decode('utf-8')
                video_extension = 'flv'

                # Retrieve video webpage to extract further information
                request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
                try:
                        self.report_download_webpage(video_id)
                        webpage = urllib2.urlopen(request).read()
                except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                        self.to_stderr(u'ERROR: unable to retrieve video webpage: %s' % str(err))
                        return [None]

                # Extract URL, uploader and title from webpage
                self.report_extraction(video_id)
                mobj = re.search(r'(?m)"mediaURL":"(http.*?\.flv)"', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract media URL')
                        return [None]
                mediaURL = mobj.group(1).replace('\\', '')

                mobj = re.search(r'(?m)"gdaKey":"(.*?)"', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract gdaKey')
                        return [None]
                gdaKey = mobj.group(1)

                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

                mobj = re.search(r'(?im)<meta name="title" content="Metacafe - ([^"]+)"', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract title')
                        return [None]
                video_title = mobj.group(1).decode('utf-8')

                mobj = re.search(r'(?m)<li id="ChnlUsr">.*?Submitter:<br />(.*?)</li>', webpage)
                if mobj is None:
                        self.to_stderr(u'ERROR: unable to extract uploader nickname')
                        return [None]
                video_uploader = re.sub(r'<.*?>', '', mobj.group(1))

                # Return information
                return [{
                        'id':           video_id.decode('utf-8'),
                        'url':          video_url.decode('utf-8'),
                        'uploader':     video_uploader.decode('utf-8'),
                        'title':        video_title,
                        'stitle':       simple_title,
                        'ext':          video_extension.decode('utf-8'),
                        }]

class YoutubePlaylistIE(InfoExtractor):
        """Information Extractor for YouTube playlists."""

        _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube\.com/view_play_list\?p=(.+)'
        _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s'
        _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
        _MORE_PAGES_INDICATOR = r'/view_play_list?p=%s&amp;page=%s'
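        # _VIDEO_INDICATOR is a regex run over each playlist page to collect video ids;
        # _MORE_PAGES_INDICATOR is a literal substring (hence the HTML-escaped &amp;)
        # whose presence in the page means there is a following page to fetch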
        _youtube_ie = None

        def __init__(self, youtube_ie, downloader=None):
                InfoExtractor.__init__(self, downloader)
                self._youtube_ie = youtube_ie

        @staticmethod
        def suitable(url):
                return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)

        def report_download_page(self, playlist_id, pagenum):
                """Report attempt to download playlist page with given number."""
                self.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

        def _real_initialize(self):
                self._youtube_ie.initialize()

        def _real_extract(self, url):
                # Extract playlist id
                mobj = re.match(self._VALID_URL, url)
                if mobj is None:
                        self.to_stderr(u'ERROR: invalid url: %s' % url)
                        return [None]

                # Download playlist pages
                playlist_id = mobj.group(1)
                video_ids = []
                pagenum = 1

                while True:
                        self.report_download_page(playlist_id, pagenum)
                        request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
                        try:
                                page = urllib2.urlopen(request).read()
                        except (urllib2.URLError, httplib.HTTPException, socket.error), err:
                                self.to_stderr(u'ERROR: unable to download webpage: %s' % str(err))
                                return [None]

                        # Extract video identifiers
                        ids_in_page = set()
                        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                                ids_in_page.add(mobj.group(1))
                        video_ids.extend(list(ids_in_page))

                        if (self._MORE_PAGES_INDICATOR % (playlist_id, pagenum + 1)) not in page:
                                break
                        pagenum = pagenum + 1

                information = []
                for id in video_ids:
                        information.extend(self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id))
                return information

if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse

                # General configuration
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler(), urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                                usage='Usage: %prog [options] url...',
                                version='2008.07.26',
                                conflict_handler='resolve',
                                )
                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                parser.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                parser.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                parser.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                parser.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                parser.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                parser.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                parser.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                parser.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                parser.add_option('-f', '--format',
                                dest='format', metavar='FMT', help='video format code')
                parser.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='alias for -f 18', const='18')
                parser.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')
                (opts, args) = parser.parse_args()

                # Conflicting, missing and erroneous options
                if len(args) < 1:
                        sys.exit(u'ERROR: you must provide at least one URL')
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        sys.exit(u'ERROR: using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        sys.exit(u'ERROR: account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        sys.exit(u'ERROR: using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        sys.exit(u'ERROR: using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                sys.exit(u'ERROR: invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle),
                        'format': opts.format,
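                        # The or-chain below picks the first applicable output template: an
                        # explicit -o value, then '%(stitle)s-%(id)s.%(ext)s' for -t,
                        # '%(title)s-%(id)s.%(ext)s' for -l, and plain '%(id)s.%(ext)s' otherwise.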
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode())
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        })
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                retcode = fd.download(args)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')