Make "all-formats" and "best-quality" download the version without specific format too
[youtube-dl] / youtube-dl
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # Author: Ricardo Garcia Gonzalez
4 # Author: Danny Colligan
5 # Author: Benjamin Johnson
6 # License: Public domain code
7 import htmlentitydefs
8 import httplib
9 import locale
10 import math
11 import netrc
12 import os
13 import os.path
14 import re
15 import socket
16 import string
17 import subprocess
18 import sys
19 import time
20 import urllib
21 import urllib2
22
# parse_qs was moved from the cgi module to the urlparse module recently.
# Try the new location first and fall back for older Python versions.
try:
	from urlparse import parse_qs
except ImportError:
	from cgi import parse_qs
28
# Default HTTP headers sent with every request (mimics a Firefox 3.6 browser).
std_headers = {
	'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2) Gecko/20100115 Firefox/3.6',
	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
	'Accept': 'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
	'Accept-Language': 'en-us,en;q=0.5',
}
35
# Characters kept verbatim in "simplified" titles: ASCII letters and digits
# (decoded so the result is a unicode string under Python 2).
simple_title_chars = string.ascii_letters.decode('ascii') + string.digits.decode('ascii')
37
def preferredencoding():
	"""Get preferred encoding.

	Returns the best encoding scheme for the system, based on
	locale.getpreferredencoding() and some further tweaks.
	"""
	def yield_preferredencoding():
		try:
			# Validate the reported encoding by actually encoding with it;
			# fall back to UTF-8 if it is missing or unusable.
			pref = locale.getpreferredencoding()
			u'TEST'.encode(pref)
		except:
			pref = 'UTF-8'
		while True:
			yield pref
	# NOTE(review): the generator wrapper yields the same value forever and
	# only the first value is taken — presumably a scoping/compat workaround;
	# the net effect is simply returning `pref`.
	return yield_preferredencoding().next()
53
54 def htmlentity_transform(matchobj):
55         """Transforms an HTML entity to a Unicode character.
56         
57         This function receives a match object and is intended to be used with
58         the re.sub() function.
59         """
60         entity = matchobj.group(1)
61
62         # Known non-numeric HTML entity
63         if entity in htmlentitydefs.name2codepoint:
64                 return unichr(htmlentitydefs.name2codepoint[entity])
65
66         # Unicode character
67         mobj = re.match(ur'(?u)#(x?\d+)', entity)
68         if mobj is not None:
69                 numstr = mobj.group(1)
70                 if numstr.startswith(u'x'):
71                         base = 16
72                         numstr = u'0%s' % numstr
73                 else:
74                         base = 10
75                 return unichr(long(numstr, base))
76
77         # Unknown entity in name, return its literal representation
78         return (u'&%s;' % entity)
79
80 def sanitize_title(utitle):
81         """Sanitizes a video title so it could be used as part of a filename."""
82         utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
83         return utitle.replace(unicode(os.sep), u'%')
84
85 def sanitize_open(filename, open_mode):
86         """Try to open the given filename, and slightly tweak it if this fails.
87
88         Attempts to open the given filename. If this fails, it tries to change
89         the filename slightly, step by step, until it's either able to open it
90         or it fails and raises a final exception, like the standard open()
91         function.
92
93         It returns the tuple (stream, definitive_file_name).
94         """
95         try:
96                 if filename == u'-':
97                         return (sys.stdout, filename)
98                 stream = open(filename, open_mode)
99                 return (stream, filename)
100         except (IOError, OSError), err:
101                 # In case of error, try to remove win32 forbidden chars
102                 filename = re.sub(ur'[<>:"\|\?\*]', u'#', filename)
103
104                 # An exception here should be caught in the caller
105                 stream = open(filename, open_mode)
106                 return (stream, filename)
107
108
class DownloadError(Exception):
	"""Raised when a download fails and error-ignoring is disabled.

	FileDownloader objects throw this exception when they are not
	configured to continue on errors; it carries the appropriate
	error message.
	"""
	pass
117
class SameFileError(Exception):
	"""Raised when several downloads would collide on one output file.

	FileDownloader objects throw this when they detect that multiple
	files would have to be written to the same path on disk.
	"""
	pass
125
class PostProcessingError(Exception):
	"""Raised by a PostProcessor's .run() method on failure.

	Signals that something went wrong during the postprocessing task.
	"""
	pass
133
class UnavailableFormatError(Exception):
	"""Raised when a video is requested in a format it does not offer.

	Thrown when the desired format turns out not to exist for the
	requested video.
	"""
	pass
141
class ContentTooShortError(Exception):
	"""Raised when a server delivered fewer bytes than it announced.

	FileDownloader objects raise this when a downloaded file turns out
	smaller than what the server said it would be, which usually means
	the connection was interrupted.
	"""
	# Byte counts: what was actually received vs. what was announced.
	downloaded = None
	expected = None

	def __init__(self, downloaded, expected):
		self.downloaded = downloaded
		self.expected = expected
156
class FileDownloader(object):
	"""File Downloader class.

	File downloader objects are the ones responsible of downloading the
	actual video file and writing it to disk if the user has requested
	it, among some other tasks. In most cases there should be one per
	program. As, given a video URL, the downloader doesn't know how to
	extract all the needed information, task that InfoExtractors do, it
	has to pass the URL to one of them.

	For this, file downloader objects have a method that allows
	InfoExtractors to be registered in a given order. When it is passed
	a URL, the file downloader handles it to the first InfoExtractor it
	finds that reports being able to handle it. The InfoExtractor extracts
	all the information about the video or videos the URL refers to, and
	asks the FileDownloader to process the video information, possibly
	downloading the video.

	File downloaders accept a lot of parameters. In order not to saturate
	the object constructor with arguments, it receives a dictionary of
	options instead. These options are available through the params
	attribute for the InfoExtractors to use. The FileDownloader also
	registers itself as the downloader in charge for the InfoExtractors
	that are added to it, so this is a "mutual registration".

	Available options:

	username:       Username for authentication purposes.
	password:       Password for authentication purposes.
	usenetrc:       Use netrc for authentication instead.
	quiet:          Do not print messages to stdout.
	forceurl:       Force printing final URL.
	forcetitle:     Force printing title.
	simulate:       Do not download the video files.
	format:         Video format code.
	outtmpl:        Template for output names.
	ignoreerrors:   Do not stop on download errors.
	ratelimit:      Download speed limit, in bytes/sec.
	nooverwrites:   Prevent overwriting files.
	continuedl:     Try to continue downloads if possible.
	noprogress:     Do not print the progress bar.
	"""

	# Class-level defaults; the real values are set per instance in __init__.
	params = None
	_ies = []
	_pps = []
	_download_retcode = None
	_num_downloads = None

	def __init__(self, params):
		"""Create a FileDownloader object with the given options."""
		self._ies = []
		self._pps = []
		self._download_retcode = 0
		self._num_downloads = 0
		self.params = params

	@staticmethod
	def pmkdir(filename):
		"""Create directory components in filename. Similar to Unix "mkdir -p"."""
		components = filename.split(os.sep)
		# Build the list of ancestor directories, shortest path first.
		aggregate = [os.sep.join(components[0:x]) for x in xrange(1, len(components))]
		aggregate = ['%s%s' % (x, os.sep) for x in aggregate] # Finish names with separator
		for dir in aggregate:
			if not os.path.exists(dir):
				os.mkdir(dir)

	@staticmethod
	def format_bytes(bytes):
		# Format a byte count as a short human-readable string, e.g. "1.17M".
		if bytes is None:
			return 'N/A'
		if type(bytes) is str:
			bytes = float(bytes)
		if bytes == 0.0:
			exponent = 0
		else:
			# Largest power of 1024 not exceeding the value selects the suffix.
			exponent = long(math.log(bytes, 1024.0))
		suffix = 'bkMGTPEZY'[exponent]
		converted = float(bytes) / float(1024**exponent)
		return '%.2f%s' % (converted, suffix)

	@staticmethod
	def calc_percent(byte_counter, data_len):
		# Fixed-width percentage string; '---.-%' when the total is unknown.
		if data_len is None:
			return '---.-%'
		return '%6s' % ('%3.1f%%' % (float(byte_counter) / float(data_len) * 100.0))

	@staticmethod
	def calc_eta(start, now, total, current):
		# Estimate remaining time as 'MM:SS'; '--:--' when it cannot be computed.
		if total is None:
			return '--:--'
		dif = now - start
		if current == 0 or dif < 0.001: # One millisecond
			return '--:--'
		rate = float(current) / dif
		eta = long((float(total) - float(current)) / rate)
		(eta_mins, eta_secs) = divmod(eta, 60)
		if eta_mins > 99:
			# The progress line only has room for two minute digits.
			return '--:--'
		return '%02d:%02d' % (eta_mins, eta_secs)

	@staticmethod
	def calc_speed(start, now, bytes):
		# Average speed since `start`, right-aligned in a 10-char field.
		dif = now - start
		if bytes == 0 or dif < 0.001: # One millisecond
			return '%10s' % '---b/s'
		return '%10s' % ('%s/s' % FileDownloader.format_bytes(float(bytes) / dif))

	@staticmethod
	def best_block_size(elapsed_time, bytes):
		# Adapt the next read size to the measured rate, never changing it
		# by more than a factor of two per step.
		new_min = max(bytes / 2.0, 1.0)
		new_max = min(max(bytes * 2.0, 1.0), 4194304) # Do not surpass 4 MB
		if elapsed_time < 0.001:
			return long(new_max)
		rate = bytes / elapsed_time
		if rate > new_max:
			return long(new_max)
		if rate < new_min:
			return long(new_min)
		return long(rate)

	@staticmethod
	def parse_bytes(bytestr):
		"""Parse a string indicating a byte quantity into a long integer."""
		matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr)
		if matchobj is None:
			return None
		number = float(matchobj.group(1))
		# An empty suffix gives str.index('') == 0, i.e. a multiplier of 1.
		multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower())
		return long(round(number * multiplier))

	@staticmethod
	def verify_url(url):
		"""Verify a URL is valid and data could be downloaded. Return real data URL."""
		request = urllib2.Request(url, None, std_headers)
		data = urllib2.urlopen(request)
		# Reading a single byte forces connection errors to surface here;
		# geturl() reflects any redirects that were followed.
		data.read(1)
		url = data.geturl()
		data.close()
		return url

	def add_info_extractor(self, ie):
		"""Add an InfoExtractor object to the end of the list."""
		self._ies.append(ie)
		ie.set_downloader(self)

	def add_post_processor(self, pp):
		"""Add a PostProcessor object to the end of the chain."""
		self._pps.append(pp)
		pp.set_downloader(self)

	def to_stdout(self, message, skip_eol=False, ignore_encoding_errors=False):
		"""Print message to stdout if not in quiet mode."""
		try:
			if not self.params.get('quiet', False):
				# Python 2 print with trailing comma: skip_eol selects
				# between an explicit newline and no end-of-line.
				print (u'%s%s' % (message, [u'\n', u''][skip_eol])).encode(preferredencoding()),
			sys.stdout.flush()
		except (UnicodeEncodeError), err:
			if not ignore_encoding_errors:
				raise

	def to_stderr(self, message):
		"""Print message to stderr."""
		print >>sys.stderr, message.encode(preferredencoding())

	def fixed_template(self):
		"""Checks if the output template is fixed."""
		# "Fixed" means it contains no %(...)s substitutions at all.
		return (re.search(ur'(?u)%\(.+?\)s', self.params['outtmpl']) is None)

	def trouble(self, message=None):
		"""Determine action to take when a download problem appears.

		Depending on if the downloader has been configured to ignore
		download errors or not, this method may throw an exception or
		not when errors are found, after printing the message.
		"""
		if message is not None:
			self.to_stderr(message)
		if not self.params.get('ignoreerrors', False):
			raise DownloadError(message)
		self._download_retcode = 1

	def slow_down(self, start_time, byte_counter):
		"""Sleep if the download speed is over the rate limit."""
		rate_limit = self.params.get('ratelimit', None)
		if rate_limit is None or byte_counter == 0:
			return
		now = time.time()
		elapsed = now - start_time
		if elapsed <= 0.0:
			return
		speed = float(byte_counter) / elapsed
		if speed > rate_limit:
			# Sleep just long enough that the average speed drops back
			# to the configured limit.
			time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)

	def report_destination(self, filename):
		"""Report destination filename."""
		self.to_stdout(u'[download] Destination: %s' % filename, ignore_encoding_errors=True)

	def report_progress(self, percent_str, data_len_str, speed_str, eta_str):
		"""Report download progress."""
		if self.params.get('noprogress', False):
			return
		self.to_stdout(u'\r[download] %s of %s at %s ETA %s' %
				(percent_str, data_len_str, speed_str, eta_str), skip_eol=True)

	def report_resuming_byte(self, resume_len):
		"""Report attemtp to resume at given byte."""
		self.to_stdout(u'[download] Resuming download at byte %s' % resume_len)

	def report_file_already_downloaded(self, file_name):
		"""Report file has already been fully downloaded."""
		try:
			self.to_stdout(u'[download] %s has already been downloaded' % file_name)
		except (UnicodeEncodeError), err:
			# The filename itself may not be representable in the
			# console encoding; fall back to a generic message.
			self.to_stdout(u'[download] The file has already been downloaded')

	def report_unable_to_resume(self):
		"""Report it was impossible to resume download."""
		self.to_stdout(u'[download] Unable to resume')

	def report_finish(self):
		"""Report download finished."""
		if self.params.get('noprogress', False):
			self.to_stdout(u'[download] Download completed')
		else:
			self.to_stdout(u'')

	def process_info(self, info_dict):
		"""Process a single dictionary returned by an InfoExtractor."""
		# Do nothing else if in simulate mode
		if self.params.get('simulate', False):
			# Verify URL if it's an HTTP one
			if info_dict['url'].startswith('http'):
				try:
					self.verify_url(info_dict['url'].encode('utf-8')).decode('utf-8')
				except (OSError, IOError, urllib2.URLError, httplib.HTTPException, socket.error), err:
					# Signals the caller that this format cannot be
					# fetched — presumably so it can fall back to
					# another one; TODO confirm against -b handling.
					raise UnavailableFormatError

			# Forced printings
			if self.params.get('forcetitle', False):
				print info_dict['title'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forceurl', False):
				print info_dict['url'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
				print info_dict['thumbnail'].encode(preferredencoding(), 'xmlcharrefreplace')
			if self.params.get('forcedescription', False) and 'description' in info_dict:
				print info_dict['description'].encode(preferredencoding(), 'xmlcharrefreplace')

			return

		try:
			template_dict = dict(info_dict)
			template_dict['epoch'] = unicode(long(time.time()))
			template_dict['ord'] = unicode('%05d' % self._num_downloads)
			filename = self.params['outtmpl'] % template_dict
		except (ValueError, KeyError), err:
			# NOTE(review): if the template fails and 'ignoreerrors' is
			# set, trouble() returns and `filename` is left unbound, so
			# the next line raises NameError.
			self.trouble('ERROR: invalid output template or system charset: %s' % str(err))
		if self.params.get('nooverwrites', False) and os.path.exists(filename):
			self.to_stderr(u'WARNING: file exists: %s; skipping' % filename)
			return

		try:
			self.pmkdir(filename)
		except (OSError, IOError), err:
			self.trouble('ERROR: unable to create directories: %s' % str(err))
			return

		try:
			success = self._do_download(filename, info_dict['url'].encode('utf-8'))
		except (OSError, IOError), err:
			# Local I/O errors are treated as "format unavailable" so the
			# caller may retry with a different format.
			raise UnavailableFormatError
		except (urllib2.URLError, httplib.HTTPException, socket.error), err:
			self.trouble('ERROR: unable to download video data: %s' % str(err))
			return
		except (ContentTooShortError, ), err:
			self.trouble('ERROR: content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
			return

		if success:
			try:
				self.post_process(filename, info_dict)
			except (PostProcessingError), err:
				self.trouble('ERROR: postprocessing: %s' % str(err))
				return

	def download(self, url_list):
		"""Download a given list of URLs."""
		if len(url_list) > 1 and self.fixed_template():
			# A fixed template would make every URL overwrite the same file.
			raise SameFileError(self.params['outtmpl'])

		for url in url_list:
			suitable_found = False
			for ie in self._ies:
				# Go to next InfoExtractor if not suitable
				if not ie.suitable(url):
					continue

				# Suitable InfoExtractor found
				suitable_found = True

				# Extract information from URL and process it
				ie.extract(url)

				# Suitable InfoExtractor had been found; go to next URL
				break

			if not suitable_found:
				self.trouble('ERROR: no suitable InfoExtractor: %s' % url)

		return self._download_retcode

	def post_process(self, filename, ie_info):
		"""Run the postprocessing chain on the given file."""
		info = dict(ie_info)
		info['filepath'] = filename
		for pp in self._pps:
			info = pp.run(info)
			if info is None:
				# A postprocessor returning None stops the chain.
				break

	def _download_with_rtmpdump(self, filename, url):
		# Download `url` into `filename` by spawning an external rtmpdump
		# process.  Returns True on success, False on failure.
		self.report_destination(filename)

		# Check for rtmpdump first
		try:
			subprocess.call(['rtmpdump', '-h'], stdout=(file(os.path.devnull, 'w')), stderr=subprocess.STDOUT)
		except (OSError, IOError):
			self.trouble(u'ERROR: RTMP download detected but "rtmpdump" could not be run')
			return False

		# Download using rtmpdump. rtmpdump returns exit code 2 when
		# the connection was interrumpted and resuming appears to be
		# possible. This is part of rtmpdump's normal usage, AFAIK.
		basic_args = ['rtmpdump', '-q', '-r', url, '-o', filename]
		# Boolean-indexing trick: append the resume flags only when
		# 'continuedl' is set.
		retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)])
		while retval == 2 or retval == 1:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename), skip_eol=True)
			time.sleep(2.0) # This seems to be needed
			retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1])
		if retval == 0:
			self.to_stdout(u'\r[rtmpdump] %s bytes' % os.path.getsize(filename))
			return True
		else:
			self.trouble('\nERROR: rtmpdump exited with code %d' % retval)
			return False

	def _do_download(self, filename, url):
		# Download `url` into `filename` over HTTP (or via rtmpdump for
		# rtmp URLs), resuming an existing partial file when possible.
		# Returns True on success.
		# Attempt to download using rtmpdump
		if url.startswith('rtmp'):
			return self._download_with_rtmpdump(filename, url)

		stream = None
		open_mode = 'wb'
		# Keep a second request without the Range header as a fallback
		# in case resuming is rejected by the server.
		basic_request = urllib2.Request(url, None, std_headers)
		request = urllib2.Request(url, None, std_headers)

		# Establish possible resume length
		if os.path.isfile(filename):
			resume_len = os.path.getsize(filename)
		else:
			resume_len = 0

		# Request parameters in case of being able to resume
		if self.params.get('continuedl', False) and resume_len != 0:
			self.report_resuming_byte(resume_len)
			request.add_header('Range','bytes=%d-' % resume_len)
			open_mode = 'ab'

		# Establish connection
		try:
			data = urllib2.urlopen(request)
		except (urllib2.HTTPError, ), err:
			if err.code != 416: #  416 is 'Requested range not satisfiable'
				raise
			# Unable to resume
			data = urllib2.urlopen(basic_request)
			content_length = data.info()['Content-Length']

			if content_length is not None and long(content_length) == resume_len:
				# Because the file had already been fully downloaded
				self.report_file_already_downloaded(filename)
				return True
			else:
				# Because the server didn't let us
				self.report_unable_to_resume()
				open_mode = 'wb'

		data_len = data.info().get('Content-length', None)
		data_len_str = self.format_bytes(data_len)
		byte_counter = 0
		block_size = 1024
		start = time.time()
		while True:
			# Download and write
			before = time.time()
			data_block = data.read(block_size)
			after = time.time()
			data_block_len = len(data_block)
			if data_block_len == 0:
				break
			byte_counter += data_block_len

			# Open file just in time
			if stream is None:
				try:
					(stream, filename) = sanitize_open(filename, open_mode)
					self.report_destination(filename)
					self._num_downloads += 1
				except (OSError, IOError), err:
					self.trouble('ERROR: unable to open for writing: %s' % str(err))
					return False
			stream.write(data_block)
			block_size = self.best_block_size(after - before, data_block_len)

			# Progress message
			percent_str = self.calc_percent(byte_counter, data_len)
			eta_str = self.calc_eta(start, time.time(), data_len, byte_counter)
			speed_str = self.calc_speed(start, time.time(), byte_counter)
			self.report_progress(percent_str, data_len_str, speed_str, eta_str)

			# Apply rate limit
			self.slow_down(start, byte_counter)

		self.report_finish()
		# data_len is the raw header value (a string), so the comparison
		# goes through str(byte_counter).
		if data_len is not None and str(byte_counter) != data_len:
			raise ContentTooShortError(byte_counter, long(data_len))
		return True
585
class InfoExtractor(object):
	"""Base class for all information extractors.

	An information extractor takes a URL and produces the data needed to
	download the video (or videos) it points to: the real video URL, the
	title, a simplified title, the uploader and so on. That data is
	handed to the FileDownloader as a dictionary; the FileDownloader then
	processes it, possibly downloading the file. Every dictionary must
	include the following fields:

	id:             Video identifier.
	url:            Final video URL.
	uploader:       Nickname of the video uploader.
	title:          Literal title.
	stitle:         Simplified title.
	ext:            Video filename extension.
	format:         Video format.

	The following fields are optional; they exist mainly so youtube-dl
	can serve as the backend of a video search function (such as the one
	in youtube2mp3) and are only used by the respective forced-printing
	options:

	thumbnail:      Full URL to a video thumbnail image.
	description:    One-line video description.

	Concrete extractors should redefine _real_initialize() and
	_real_extract(), as well as the static suitable() method, and will
	normally be instantiated and registered with the main downloader.
	"""

	# Lazily-set state: initialize() performs the real setup only once.
	_ready = False
	_downloader = None

	def __init__(self, downloader=None):
		"""Constructor. Receives an optional downloader."""
		self._ready = False
		self.set_downloader(downloader)

	@staticmethod
	def suitable(url):
		"""Receives a URL and returns True if suitable for this IE."""
		return False

	def initialize(self):
		"""Initializes an instance (authentication, etc)."""
		if self._ready:
			return
		self._real_initialize()
		self._ready = True

	def extract(self, url):
		"""Extracts URL information and returns it in list of dicts."""
		self.initialize()
		return self._real_extract(url)

	def set_downloader(self, downloader):
		"""Sets the downloader for this IE."""
		self._downloader = downloader

	def _real_initialize(self):
		"""Real initialization process. Redefine in subclasses."""
		pass

	def _real_extract(self, url):
		"""Real extraction process. Redefine in subclasses."""
		pass
655
656 class YoutubeIE(InfoExtractor):
657         """Information extractor for youtube.com."""
658
	# Matches "youtube.com/v/ID", "youtube.com/watch?v=ID" variants and
	# bare video IDs; group 2 captures the video identifier.
	_VALID_URL = r'^((?:http://)?(?:\w+\.)?youtube\.com/(?:(?:v/)|(?:(?:watch(?:\.php)?)?[\?#](?:.+&)?v=)))?([0-9A-Za-z_-]+)(?(1).+)?$'
	# Visiting this URL forces an English interface, keeping scraping stable.
	_LANG_URL = r'http://uk.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
	_LOGIN_URL = 'http://www.youtube.com/signup?next=/&gl=US&hl=en'
	_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
	_NETRC_MACHINE = 'youtube'
	_available_formats = ['37', '22', '35', '18', '34', '5', '17', '13', None] # listed in order of priority for -b flag
	# Maps format codes to file extensions; codes missing here presumably
	# get a default extension elsewhere — TODO confirm against the caller.
	_video_extensions = {
		'13': '3gp',
		'17': 'mp4',
		'18': 'mp4',
		'22': 'mp4',
		'37': 'mp4',
	}
672
673         @staticmethod
674         def suitable(url):
675                 return (re.match(YoutubeIE._VALID_URL, url) is not None)
676
677         def report_lang(self):
678                 """Report attempt to set language."""
679                 self._downloader.to_stdout(u'[youtube] Setting language')
680
681         def report_login(self):
682                 """Report attempt to log in."""
683                 self._downloader.to_stdout(u'[youtube] Logging in')
684         
685         def report_age_confirmation(self):
686                 """Report attempt to confirm age."""
687                 self._downloader.to_stdout(u'[youtube] Confirming age')
688         
689         def report_video_info_webpage_download(self, video_id):
690                 """Report attempt to download video info webpage."""
691                 self._downloader.to_stdout(u'[youtube] %s: Downloading video info webpage' % video_id)
692         
693         def report_information_extraction(self, video_id):
694                 """Report attempt to extract video information."""
695                 self._downloader.to_stdout(u'[youtube] %s: Extracting video information' % video_id)
696         
697         def report_unavailable_format(self, video_id, format):
698                 """Report extracted video URL."""
699                 self._downloader.to_stdout(u'[youtube] %s: Format %s not available' % (video_id, format))
700         
701         def report_rtmp_download(self):
702                 """Indicate the download will use the RTMP protocol."""
703                 self._downloader.to_stdout(u'[youtube] RTMP download detected')
704         
705         def _real_initialize(self):
706                 if self._downloader is None:
707                         return
708
709                 username = None
710                 password = None
711                 downloader_params = self._downloader.params
712
713                 # Attempt to use provided username and password or .netrc data
714                 if downloader_params.get('username', None) is not None:
715                         username = downloader_params['username']
716                         password = downloader_params['password']
717                 elif downloader_params.get('usenetrc', False):
718                         try:
719                                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
720                                 if info is not None:
721                                         username = info[0]
722                                         password = info[2]
723                                 else:
724                                         raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
725                         except (IOError, netrc.NetrcParseError), err:
726                                 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % str(err))
727                                 return
728
729                 # Set language
730                 request = urllib2.Request(self._LANG_URL, None, std_headers)
731                 try:
732                         self.report_lang()
733                         urllib2.urlopen(request).read()
734                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
735                         self._downloader.to_stderr(u'WARNING: unable to set language: %s' % str(err))
736                         return
737
738                 # No authentication to be performed
739                 if username is None:
740                         return
741
742                 # Log in
743                 login_form = {
744                                 'current_form': 'loginForm',
745                                 'next':         '/',
746                                 'action_login': 'Log In',
747                                 'username':     username,
748                                 'password':     password,
749                                 }
750                 request = urllib2.Request(self._LOGIN_URL, urllib.urlencode(login_form), std_headers)
751                 try:
752                         self.report_login()
753                         login_results = urllib2.urlopen(request).read()
754                         if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
755                                 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
756                                 return
757                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
758                         self._downloader.to_stderr(u'WARNING: unable to log in: %s' % str(err))
759                         return
760         
761                 # Confirm age
762                 age_form = {
763                                 'next_url':             '/',
764                                 'action_confirm':       'Confirm',
765                                 }
766                 request = urllib2.Request(self._AGE_URL, urllib.urlencode(age_form), std_headers)
767                 try:
768                         self.report_age_confirmation()
769                         age_results = urllib2.urlopen(request).read()
770                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
771                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
772                         return
773
774         def _real_extract(self, url):
775                 # Extract video id from URL
776                 mobj = re.match(self._VALID_URL, url)
777                 if mobj is None:
778                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
779                         return
780                 video_id = mobj.group(2)
781
782                 # Downloader parameters
783                 best_quality = False
784                 all_formats = False
785                 format_param = None
786                 quality_index = 0
787                 if self._downloader is not None:
788                         params = self._downloader.params
789                         format_param = params.get('format', None)
790                         if format_param == '0':
791                                 format_param = self._available_formats[quality_index]
792                                 best_quality = True
793                         elif format_param == '-1':
794                                 format_param = self._available_formats[quality_index]
795                                 all_formats = True
796
797                 while True:
798                         # Extension
799                         video_extension = self._video_extensions.get(format_param, 'flv')
800
801                         # Get video info
802                         self.report_video_info_webpage_download(video_id)
803                         for el_type in ['embedded', 'detailpage', 'vevo']:
804                                 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s&el=%s&ps=default&eurl=&gl=US&hl=en'
805                                                    % (video_id, el_type))
806                                 request = urllib2.Request(video_info_url, None, std_headers)
807                                 try:
808                                         video_info_webpage = urllib2.urlopen(request).read()
809                                         video_info = parse_qs(video_info_webpage)
810                                         if 'token' in video_info:
811                                                 break
812                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
813                                         self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % str(err))
814                                         return
815                         self.report_information_extraction(video_id)
816
817                         # "t" param
818                         if 'token' not in video_info:
819                                 # Attempt to see if YouTube has issued an error message
820                                 if 'reason' not in video_info:
821                                         self._downloader.trouble(u'ERROR: unable to extract "t" parameter for unknown reason')
822                                         stream = open('reportme-ydl-%s.dat' % time.time(), 'wb')
823                                         stream.write(video_info_webpage)
824                                         stream.close()
825                                 else:
826                                         reason = urllib.unquote_plus(video_info['reason'][0])
827                                         self._downloader.trouble(u'ERROR: YouTube said: %s' % reason.decode('utf-8'))
828                                 return
829                         token = urllib.unquote_plus(video_info['token'][0])
830                         video_real_url = 'http://www.youtube.com/get_video?video_id=%s&t=%s&eurl=&el=detailpage&ps=default&gl=US&hl=en' % (video_id, token)
831                         if format_param is not None:
832                                 video_real_url = '%s&fmt=%s' % (video_real_url, format_param)
833
834                         # Check possible RTMP download
835                         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
836                                 self.report_rtmp_download()
837                                 video_real_url = video_info['conn'][0]
838
839                         # uploader
840                         if 'author' not in video_info:
841                                 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
842                                 return
843                         video_uploader = urllib.unquote_plus(video_info['author'][0])
844
845                         # title
846                         if 'title' not in video_info:
847                                 self._downloader.trouble(u'ERROR: unable to extract video title')
848                                 return
849                         video_title = urllib.unquote_plus(video_info['title'][0])
850                         video_title = video_title.decode('utf-8')
851                         video_title = sanitize_title(video_title)
852
853                         # simplified title
854                         simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
855                         simple_title = simple_title.strip(ur'_')
856
857                         # thumbnail image
858                         if 'thumbnail_url' not in video_info:
859                                 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
860                                 video_thumbnail = ''
861                         else:   # don't panic if we can't find it
862                                 video_thumbnail = urllib.unquote_plus(video_info['thumbnail_url'][0])
863
864                         # get video description
865                         video_description = 'No description available.'    # we need something to pass to self._downloader
866                         # this requires an additional HTTP request and a little
867                         # more time, so don't do it unless absolutely necessary
868                         if self._downloader.params.get('forcedescription', False):
869                                 video_page_url = 'http://www.youtube.com/watch?v=' + video_id
870                                 request = urllib2.Request(video_page_url, None, std_headers)
871                                 try:
872                                         video_page_webpage = urllib2.urlopen(request).read()
873                                         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', video_page_webpage)
874                                         if mobj is not None:
875                                                 video_description = mobj.group(1)
876                                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
877                                         pass    # don't panic if we can't find it
878
879                         try:
880                                 # Process video information
881                                 self._downloader.process_info({
882                                         'id':           video_id.decode('utf-8'),
883                                         'url':          video_real_url.decode('utf-8'),
884                                         'uploader':     video_uploader.decode('utf-8'),
885                                         'title':        video_title,
886                                         'stitle':       simple_title,
887                                         'ext':          video_extension.decode('utf-8'),
888                                         'format':       (format_param is None and u'NA' or format_param.decode('utf-8')),
889                                         'thumbnail':    video_thumbnail.decode('utf-8'),
890                                         'description':  video_description.decode('utf-8'),
891                                 })
892
893                                 if all_formats:
894                                         if quality_index == len(self._available_formats):
895                                                 # None left to get
896                                                 return
897                                         else:
898                                                 quality_index += 1
899                                                 format_param = self._available_formats[quality_index]
900                                                 continue
901                                 return
902
903                         except UnavailableFormatError, err:
904                                 if best_quality or all_formats:
905                                         if quality_index == len(self._available_formats):
906                                                 # I don't ever expect this to happen
907                                                 if not all_formats:
908                                                         self._downloader.trouble(u'ERROR: no known formats available for video')
909                                                 return
910                                         else:
911                                                 self.report_unavailable_format(video_id, format_param)
912                                                 quality_index += 1
913                                                 format_param = self._available_formats[quality_index]
914                                                 continue
915                                 else: 
916                                         self._downloader.trouble('ERROR: format not available for video')
917                                         return
918
919
920 class MetacafeIE(InfoExtractor):
921         """Information Extractor for metacafe.com."""
922
923         _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
924         _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
925         _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
926         _youtube_ie = None
927
928         def __init__(self, youtube_ie, downloader=None):
929                 InfoExtractor.__init__(self, downloader)
930                 self._youtube_ie = youtube_ie
931
932         @staticmethod
933         def suitable(url):
934                 return (re.match(MetacafeIE._VALID_URL, url) is not None)
935
936         def report_disclaimer(self):
937                 """Report disclaimer retrieval."""
938                 self._downloader.to_stdout(u'[metacafe] Retrieving disclaimer')
939
940         def report_age_confirmation(self):
941                 """Report attempt to confirm age."""
942                 self._downloader.to_stdout(u'[metacafe] Confirming age')
943         
944         def report_download_webpage(self, video_id):
945                 """Report webpage download."""
946                 self._downloader.to_stdout(u'[metacafe] %s: Downloading webpage' % video_id)
947         
948         def report_extraction(self, video_id):
949                 """Report information extraction."""
950                 self._downloader.to_stdout(u'[metacafe] %s: Extracting information' % video_id)
951
952         def _real_initialize(self):
953                 # Retrieve disclaimer
954                 request = urllib2.Request(self._DISCLAIMER, None, std_headers)
955                 try:
956                         self.report_disclaimer()
957                         disclaimer = urllib2.urlopen(request).read()
958                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
959                         self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % str(err))
960                         return
961
962                 # Confirm age
963                 disclaimer_form = {
964                         'filters': '0',
965                         'submit': "Continue - I'm over 18",
966                         }
967                 request = urllib2.Request(self._FILTER_POST, urllib.urlencode(disclaimer_form), std_headers)
968                 try:
969                         self.report_age_confirmation()
970                         disclaimer = urllib2.urlopen(request).read()
971                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
972                         self._downloader.trouble(u'ERROR: unable to confirm age: %s' % str(err))
973                         return
974         
975         def _real_extract(self, url):
976                 # Extract id and simplified title from URL
977                 mobj = re.match(self._VALID_URL, url)
978                 if mobj is None:
979                         self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
980                         return
981
982                 video_id = mobj.group(1)
983
984                 # Check if video comes from YouTube
985                 mobj2 = re.match(r'^yt-(.*)$', video_id)
986                 if mobj2 is not None:
987                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % mobj2.group(1))
988                         return
989
990                 simple_title = mobj.group(2).decode('utf-8')
991                 video_extension = 'flv'
992
993                 # Retrieve video webpage to extract further information
994                 request = urllib2.Request('http://www.metacafe.com/watch/%s/' % video_id)
995                 try:
996                         self.report_download_webpage(video_id)
997                         webpage = urllib2.urlopen(request).read()
998                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
999                         self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % str(err))
1000                         return
1001
1002                 # Extract URL, uploader and title from webpage
1003                 self.report_extraction(video_id)
1004                 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
1005                 if mobj is None:
1006                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1007                         return
1008                 mediaURL = urllib.unquote(mobj.group(1))
1009
1010                 #mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
1011                 #if mobj is None:
1012                 #       self._downloader.trouble(u'ERROR: unable to extract gdaKey')
1013                 #       return
1014                 #gdaKey = mobj.group(1)
1015                 #
1016                 #video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
1017
1018                 video_url = mediaURL
1019
1020                 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
1021                 if mobj is None:
1022                         self._downloader.trouble(u'ERROR: unable to extract title')
1023                         return
1024                 video_title = mobj.group(1).decode('utf-8')
1025                 video_title = sanitize_title(video_title)
1026
1027                 mobj = re.search(r'(?ms)By:\s*<a .*?>(.+?)<', webpage)
1028                 if mobj is None:
1029                         self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
1030                         return
1031                 video_uploader = mobj.group(1)
1032
1033                 try:
1034                         # Process video information
1035                         self._downloader.process_info({
1036                                 'id':           video_id.decode('utf-8'),
1037                                 'url':          video_url.decode('utf-8'),
1038                                 'uploader':     video_uploader.decode('utf-8'),
1039                                 'title':        video_title,
1040                                 'stitle':       simple_title,
1041                                 'ext':          video_extension.decode('utf-8'),
1042                                 'format':       u'NA',
1043                         })
1044                 except UnavailableFormatError:
1045                         self._downloader.trouble(u'ERROR: format not available for video')
1046
1047
1048 class GoogleIE(InfoExtractor):
1049         """Information extractor for video.google.com."""
1050
1051         _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
1052
1053         def __init__(self, downloader=None):
1054                 InfoExtractor.__init__(self, downloader)
1055
1056         @staticmethod
1057         def suitable(url):
1058                 return (re.match(GoogleIE._VALID_URL, url) is not None)
1059
1060         def report_download_webpage(self, video_id):
1061                 """Report webpage download."""
1062                 self._downloader.to_stdout(u'[video.google] %s: Downloading webpage' % video_id)
1063
1064         def report_extraction(self, video_id):
1065                 """Report information extraction."""
1066                 self._downloader.to_stdout(u'[video.google] %s: Extracting information' % video_id)
1067
1068         def _real_initialize(self):
1069                 return
1070
1071         def _real_extract(self, url):
1072                 # Extract id from URL
1073                 mobj = re.match(self._VALID_URL, url)
1074                 if mobj is None:
1075                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1076                         return
1077
1078                 video_id = mobj.group(1)
1079
1080                 video_extension = 'mp4'
1081
1082                 # Retrieve video webpage to extract further information
1083                 request = urllib2.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
1084                 try:
1085                         self.report_download_webpage(video_id)
1086                         webpage = urllib2.urlopen(request).read()
1087                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1088                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1089                         return
1090
1091                 # Extract URL, uploader, and title from webpage
1092                 self.report_extraction(video_id)
1093                 mobj = re.search(r"download_url:'([^']+)'", webpage)
1094                 if mobj is None:
1095                         video_extension = 'flv'
1096                         mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
1097                 if mobj is None:
1098                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1099                         return
1100                 mediaURL = urllib.unquote(mobj.group(1))
1101                 mediaURL = mediaURL.replace('\\x3d', '\x3d')
1102                 mediaURL = mediaURL.replace('\\x26', '\x26')
1103
1104                 video_url = mediaURL
1105
1106                 mobj = re.search(r'<title>(.*)</title>', webpage)
1107                 if mobj is None:
1108                         self._downloader.trouble(u'ERROR: unable to extract title')
1109                         return
1110                 video_title = mobj.group(1).decode('utf-8')
1111                 video_title = sanitize_title(video_title)
1112                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1113
1114                 # Extract video description
1115                 mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
1116                 if mobj is None:
1117                         self._downloader.trouble(u'ERROR: unable to extract video description')
1118                         return
1119                 video_description = mobj.group(1).decode('utf-8')
1120                 if not video_description:
1121                         video_description = 'No description available.'
1122
1123                 # Extract video thumbnail
1124                 if self._downloader.params.get('forcethumbnail', False):
1125                         request = urllib2.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
1126                         try:
1127                                 webpage = urllib2.urlopen(request).read()
1128                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1129                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1130                                 return
1131                         mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
1132                         if mobj is None:
1133                                 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1134                                 return
1135                         video_thumbnail = mobj.group(1)
1136                 else:   # we need something to pass to process_info
1137                         video_thumbnail = ''
1138
1139
1140                 try:
1141                         # Process video information
1142                         self._downloader.process_info({
1143                                 'id':           video_id.decode('utf-8'),
1144                                 'url':          video_url.decode('utf-8'),
1145                                 'uploader':     u'NA',
1146                                 'title':        video_title,
1147                                 'stitle':       simple_title,
1148                                 'ext':          video_extension.decode('utf-8'),
1149                                 'format':       u'NA',
1150                         })
1151                 except UnavailableFormatError:
1152                         self._downloader.trouble(u'ERROR: format not available for video')
1153
1154
1155 class PhotobucketIE(InfoExtractor):
1156         """Information extractor for photobucket.com."""
1157
1158         _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
1159
1160         def __init__(self, downloader=None):
1161                 InfoExtractor.__init__(self, downloader)
1162
1163         @staticmethod
1164         def suitable(url):
1165                 return (re.match(PhotobucketIE._VALID_URL, url) is not None)
1166
1167         def report_download_webpage(self, video_id):
1168                 """Report webpage download."""
1169                 self._downloader.to_stdout(u'[photobucket] %s: Downloading webpage' % video_id)
1170
1171         def report_extraction(self, video_id):
1172                 """Report information extraction."""
1173                 self._downloader.to_stdout(u'[photobucket] %s: Extracting information' % video_id)
1174
1175         def _real_initialize(self):
1176                 return
1177
1178         def _real_extract(self, url):
1179                 # Extract id from URL
1180                 mobj = re.match(self._VALID_URL, url)
1181                 if mobj is None:
1182                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1183                         return
1184
1185                 video_id = mobj.group(1)
1186
1187                 video_extension = 'flv'
1188
1189                 # Retrieve video webpage to extract further information
1190                 request = urllib2.Request(url)
1191                 try:
1192                         self.report_download_webpage(video_id)
1193                         webpage = urllib2.urlopen(request).read()
1194                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1195                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1196                         return
1197
1198                 # Extract URL, uploader, and title from webpage
1199                 self.report_extraction(video_id)
1200                 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
1201                 if mobj is None:
1202                         self._downloader.trouble(u'ERROR: unable to extract media URL')
1203                         return
1204                 mediaURL = urllib.unquote(mobj.group(1))
1205
1206                 video_url = mediaURL
1207
1208                 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
1209                 if mobj is None:
1210                         self._downloader.trouble(u'ERROR: unable to extract title')
1211                         return
1212                 video_title = mobj.group(1).decode('utf-8')
1213                 video_title = sanitize_title(video_title)
1214                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1215
1216                 video_uploader = mobj.group(2).decode('utf-8')
1217
1218                 try:
1219                         # Process video information
1220                         self._downloader.process_info({
1221                                 'id':           video_id.decode('utf-8'),
1222                                 'url':          video_url.decode('utf-8'),
1223                                 'uploader':     video_uploader,
1224                                 'title':        video_title,
1225                                 'stitle':       simple_title,
1226                                 'ext':          video_extension.decode('utf-8'),
1227                                 'format':       u'NA',
1228                         })
1229                 except UnavailableFormatError:
1230                         self._downloader.trouble(u'ERROR: format not available for video')
1231
1232
1233 class YahooIE(InfoExtractor):
1234         """Information extractor for video.yahoo.com."""
1235
1236         # _VALID_URL matches all Yahoo! Video URLs
1237         # _VPAGE_URL matches only the extractable '/watch/' URLs
1238         _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
1239         _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
1240
1241         def __init__(self, downloader=None):
1242                 InfoExtractor.__init__(self, downloader)
1243
1244         @staticmethod
1245         def suitable(url):
1246                 return (re.match(YahooIE._VALID_URL, url) is not None)
1247
1248         def report_download_webpage(self, video_id):
1249                 """Report webpage download."""
1250                 self._downloader.to_stdout(u'[video.yahoo] %s: Downloading webpage' % video_id)
1251
1252         def report_extraction(self, video_id):
1253                 """Report information extraction."""
1254                 self._downloader.to_stdout(u'[video.yahoo] %s: Extracting information' % video_id)
1255
1256         def _real_initialize(self):
1257                 return
1258
1259         def _real_extract(self, url):
1260                 # Extract ID from URL
1261                 mobj = re.match(self._VALID_URL, url)
1262                 if mobj is None:
1263                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1264                         return
1265
1266                 video_id = mobj.group(2)
1267                 video_extension = 'flv'
1268
1269                 # Rewrite valid but non-extractable URLs as
1270                 # extractable English language /watch/ URLs
1271                 if re.match(self._VPAGE_URL, url) is None:
1272                         request = urllib2.Request(url)
1273                         try:
1274                                 webpage = urllib2.urlopen(request).read()
1275                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1276                                 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1277                                 return
1278
1279                         mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
1280                         if mobj is None:
1281                                 self._downloader.trouble(u'ERROR: Unable to extract id field')
1282                                 return
1283                         yahoo_id = mobj.group(1)
1284
1285                         mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
1286                         if mobj is None:
1287                                 self._downloader.trouble(u'ERROR: Unable to extract vid field')
1288                                 return
1289                         yahoo_vid = mobj.group(1)
1290
1291                         url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1292                         return self._real_extract(url)
1293
1294                 # Retrieve video webpage to extract further information
1295                 request = urllib2.Request(url)
1296                 try:
1297                         self.report_download_webpage(video_id)
1298                         webpage = urllib2.urlopen(request).read()
1299                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1300                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1301                         return
1302
1303                 # Extract uploader and title from webpage
1304                 self.report_extraction(video_id)
1305                 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1306                 if mobj is None:
1307                         self._downloader.trouble(u'ERROR: unable to extract video title')
1308                         return
1309                 video_title = mobj.group(1).decode('utf-8')
1310                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1311
1312                 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1313                 if mobj is None:
1314                         self._downloader.trouble(u'ERROR: unable to extract video uploader')
1315                         return
1316                 video_uploader = mobj.group(1).decode('utf-8')
1317
1318                 # Extract video thumbnail
1319                 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1320                 if mobj is None:
1321                         self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
1322                         return
1323                 video_thumbnail = mobj.group(1).decode('utf-8')
1324
1325                 # Extract video description
1326                 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1327                 if mobj is None:
1328                         self._downloader.trouble(u'ERROR: unable to extract video description')
1329                         return
1330                 video_description = mobj.group(1).decode('utf-8')
1331                 if not video_description: video_description = 'No description available.'
1332
1333                 # Extract video height and width
1334                 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1335                 if mobj is None:
1336                         self._downloader.trouble(u'ERROR: unable to extract video height')
1337                         return
1338                 yv_video_height = mobj.group(1)
1339
1340                 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1341                 if mobj is None:
1342                         self._downloader.trouble(u'ERROR: unable to extract video width')
1343                         return
1344                 yv_video_width = mobj.group(1)
1345
1346                 # Retrieve video playlist to extract media URL
1347                 # I'm not completely sure what all these options are, but we
1348                 # seem to need most of them, otherwise the server sends a 401.
1349                 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
1350                 yv_bitrate = '700'  # according to Wikipedia this is hard-coded
1351                 request = urllib2.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1352                                           '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1353                                           '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1354                 try:
1355                         self.report_download_webpage(video_id)
1356                         webpage = urllib2.urlopen(request).read()
1357                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1358                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1359                         return
1360
1361                 # Extract media URL from playlist XML
1362                 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1363                 if mobj is None:
1364                         self._downloader.trouble(u'ERROR: Unable to extract media URL')
1365                         return
1366                 video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1367                 video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
1368
1369                 try:
1370                         # Process video information
1371                         self._downloader.process_info({
1372                                 'id':           video_id.decode('utf-8'),
1373                                 'url':          video_url,
1374                                 'uploader':     video_uploader,
1375                                 'title':        video_title,
1376                                 'stitle':       simple_title,
1377                                 'ext':          video_extension.decode('utf-8'),
1378                                 'thumbnail':    video_thumbnail.decode('utf-8'),
1379                                 'description':  video_description,
1380                                 'thumbnail':    video_thumbnail,
1381                                 'description':  video_description,
1382                         })
1383                 except UnavailableFormatError:
1384                         self._downloader.trouble(u'ERROR: format not available for video')
1385
1386
1387 class GenericIE(InfoExtractor):
1388         """Generic last-resort information extractor."""
1389
1390         def __init__(self, downloader=None):
1391                 InfoExtractor.__init__(self, downloader)
1392
1393         @staticmethod
1394         def suitable(url):
1395                 return True
1396
1397         def report_download_webpage(self, video_id):
1398                 """Report webpage download."""
1399                 self._downloader.to_stdout(u'WARNING: Falling back on generic information extractor.')
1400                 self._downloader.to_stdout(u'[generic] %s: Downloading webpage' % video_id)
1401
1402         def report_extraction(self, video_id):
1403                 """Report information extraction."""
1404                 self._downloader.to_stdout(u'[generic] %s: Extracting information' % video_id)
1405
1406         def _real_initialize(self):
1407                 return
1408
1409         def _real_extract(self, url):
1410                 video_id = url.split('/')[-1]
1411                 request = urllib2.Request(url)
1412                 try:
1413                         self.report_download_webpage(video_id)
1414                         webpage = urllib2.urlopen(request).read()
1415                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1416                         self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % str(err))
1417                         return
1418                 except ValueError, err:
1419                         # since this is the last-resort InfoExtractor, if
1420                         # this error is thrown, it'll be thrown here
1421                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1422                         return
1423
1424                 # Start with something easy: JW Player in SWFObject
1425                 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1426                 if mobj is None:
1427                         # Broaden the search a little bit
1428                         mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1429                 if mobj is None:
1430                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1431                         return
1432
1433                 # It's possible that one of the regexes
1434                 # matched, but returned an empty group:
1435                 if mobj.group(1) is None:
1436                         self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1437                         return
1438
1439                 video_url = urllib.unquote(mobj.group(1))
1440                 video_id  = os.path.basename(video_url)
1441
1442                 # here's a fun little line of code for you:
1443                 video_extension = os.path.splitext(video_id)[1][1:]
1444                 video_id        = os.path.splitext(video_id)[0]
1445
1446                 # it's tempting to parse this further, but you would
1447                 # have to take into account all the variations like
1448                 #   Video Title - Site Name
1449                 #   Site Name | Video Title
1450                 #   Video Title - Tagline | Site Name
1451                 # and so on and so forth; it's just not practical
1452                 mobj = re.search(r'<title>(.*)</title>', webpage)
1453                 if mobj is None:
1454                         self._downloader.trouble(u'ERROR: unable to extract title')
1455                         return
1456                 video_title = mobj.group(1).decode('utf-8')
1457                 video_title = sanitize_title(video_title)
1458                 simple_title = re.sub(ur'(?u)([^%s]+)' % simple_title_chars, ur'_', video_title)
1459
1460                 # video uploader is domain name
1461                 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1462                 if mobj is None:
1463                         self._downloader.trouble(u'ERROR: unable to extract title')
1464                         return
1465                 video_uploader = mobj.group(1).decode('utf-8')
1466
1467                 try:
1468                         # Process video information
1469                         self._downloader.process_info({
1470                                 'id':           video_id.decode('utf-8'),
1471                                 'url':          video_url.decode('utf-8'),
1472                                 'uploader':     video_uploader,
1473                                 'title':        video_title,
1474                                 'stitle':       simple_title,
1475                                 'ext':          video_extension.decode('utf-8'),
1476                                 'format':       u'NA',
1477                         })
1478                 except UnavailableFormatError:
1479                         self._downloader.trouble(u'ERROR: format not available for video')
1480
1481
1482 class YoutubeSearchIE(InfoExtractor):
1483         """Information Extractor for YouTube search queries."""
1484         _VALID_QUERY = r'ytsearch(\d+|all)?:[\s\S]+'
1485         _TEMPLATE_URL = 'http://www.youtube.com/results?search_query=%s&page=%s&gl=US&hl=en'
1486         _VIDEO_INDICATOR = r'href="/watch\?v=.+?"'
1487         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1488         _youtube_ie = None
1489         _max_youtube_results = 1000
1490
1491         def __init__(self, youtube_ie, downloader=None):
1492                 InfoExtractor.__init__(self, downloader)
1493                 self._youtube_ie = youtube_ie
1494         
1495         @staticmethod
1496         def suitable(url):
1497                 return (re.match(YoutubeSearchIE._VALID_QUERY, url) is not None)
1498
1499         def report_download_page(self, query, pagenum):
1500                 """Report attempt to download playlist page with given number."""
1501                 query = query.decode(preferredencoding())
1502                 self._downloader.to_stdout(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1503
1504         def _real_initialize(self):
1505                 self._youtube_ie.initialize()
1506         
1507         def _real_extract(self, query):
1508                 mobj = re.match(self._VALID_QUERY, query)
1509                 if mobj is None:
1510                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1511                         return
1512
1513                 prefix, query = query.split(':')
1514                 prefix = prefix[8:]
1515                 query  = query.encode('utf-8')
1516                 if prefix == '':
1517                         self._download_n_results(query, 1)
1518                         return
1519                 elif prefix == 'all':
1520                         self._download_n_results(query, self._max_youtube_results)
1521                         return
1522                 else:
1523                         try:
1524                                 n = long(prefix)
1525                                 if n <= 0:
1526                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1527                                         return
1528                                 elif n > self._max_youtube_results:
1529                                         self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)'  % (self._max_youtube_results, n))
1530                                         n = self._max_youtube_results
1531                                 self._download_n_results(query, n)
1532                                 return
1533                         except ValueError: # parsing prefix as integer fails
1534                                 self._download_n_results(query, 1)
1535                                 return
1536
1537         def _download_n_results(self, query, n):
1538                 """Downloads a specified number of results for a query"""
1539
1540                 video_ids = []
1541                 already_seen = set()
1542                 pagenum = 1
1543
1544                 while True:
1545                         self.report_download_page(query, pagenum)
1546                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1547                         request = urllib2.Request(result_url, None, std_headers)
1548                         try:
1549                                 page = urllib2.urlopen(request).read()
1550                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1551                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1552                                 return
1553
1554                         # Extract video identifiers
1555                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1556                                 video_id = page[mobj.span()[0]:mobj.span()[1]].split('=')[2][:-1]
1557                                 if video_id not in already_seen:
1558                                         video_ids.append(video_id)
1559                                         already_seen.add(video_id)
1560                                         if len(video_ids) == n:
1561                                                 # Specified n videos reached
1562                                                 for id in video_ids:
1563                                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1564                                                 return
1565
1566                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1567                                 for id in video_ids:
1568                                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1569                                 return
1570
1571                         pagenum = pagenum + 1
1572
1573 class GoogleSearchIE(InfoExtractor):
1574         """Information Extractor for Google Video search queries."""
1575         _VALID_QUERY = r'gvsearch(\d+|all)?:[\s\S]+'
1576         _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1577         _VIDEO_INDICATOR = r'videoplay\?docid=([^\&>]+)\&'
1578         _MORE_PAGES_INDICATOR = r'<span>Next</span>'
1579         _google_ie = None
1580         _max_google_results = 1000
1581
1582         def __init__(self, google_ie, downloader=None):
1583                 InfoExtractor.__init__(self, downloader)
1584                 self._google_ie = google_ie
1585         
1586         @staticmethod
1587         def suitable(url):
1588                 return (re.match(GoogleSearchIE._VALID_QUERY, url) is not None)
1589
1590         def report_download_page(self, query, pagenum):
1591                 """Report attempt to download playlist page with given number."""
1592                 query = query.decode(preferredencoding())
1593                 self._downloader.to_stdout(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1594
1595         def _real_initialize(self):
1596                 self._google_ie.initialize()
1597         
1598         def _real_extract(self, query):
1599                 mobj = re.match(self._VALID_QUERY, query)
1600                 if mobj is None:
1601                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1602                         return
1603
1604                 prefix, query = query.split(':')
1605                 prefix = prefix[8:]
1606                 query  = query.encode('utf-8')
1607                 if prefix == '':
1608                         self._download_n_results(query, 1)
1609                         return
1610                 elif prefix == 'all':
1611                         self._download_n_results(query, self._max_google_results)
1612                         return
1613                 else:
1614                         try:
1615                                 n = long(prefix)
1616                                 if n <= 0:
1617                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1618                                         return
1619                                 elif n > self._max_google_results:
1620                                         self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)'  % (self._max_google_results, n))
1621                                         n = self._max_google_results
1622                                 self._download_n_results(query, n)
1623                                 return
1624                         except ValueError: # parsing prefix as integer fails
1625                                 self._download_n_results(query, 1)
1626                                 return
1627
1628         def _download_n_results(self, query, n):
1629                 """Downloads a specified number of results for a query"""
1630
1631                 video_ids = []
1632                 already_seen = set()
1633                 pagenum = 1
1634
1635                 while True:
1636                         self.report_download_page(query, pagenum)
1637                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1638                         request = urllib2.Request(result_url, None, std_headers)
1639                         try:
1640                                 page = urllib2.urlopen(request).read()
1641                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1642                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1643                                 return
1644
1645                         # Extract video identifiers
1646                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1647                                 video_id = mobj.group(1)
1648                                 if video_id not in already_seen:
1649                                         video_ids.append(video_id)
1650                                         already_seen.add(video_id)
1651                                         if len(video_ids) == n:
1652                                                 # Specified n videos reached
1653                                                 for id in video_ids:
1654                                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1655                                                 return
1656
1657                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1658                                 for id in video_ids:
1659                                         self._google_ie.extract('http://video.google.com/videoplay?docid=%s' % id)
1660                                 return
1661
1662                         pagenum = pagenum + 1
1663
1664 class YahooSearchIE(InfoExtractor):
1665         """Information Extractor for Yahoo! Video search queries."""
1666         _VALID_QUERY = r'yvsearch(\d+|all)?:[\s\S]+'
1667         _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1668         _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1669         _MORE_PAGES_INDICATOR = r'\s*Next'
1670         _yahoo_ie = None
1671         _max_yahoo_results = 1000
1672
1673         def __init__(self, yahoo_ie, downloader=None):
1674                 InfoExtractor.__init__(self, downloader)
1675                 self._yahoo_ie = yahoo_ie
1676         
1677         @staticmethod
1678         def suitable(url):
1679                 return (re.match(YahooSearchIE._VALID_QUERY, url) is not None)
1680
1681         def report_download_page(self, query, pagenum):
1682                 """Report attempt to download playlist page with given number."""
1683                 query = query.decode(preferredencoding())
1684                 self._downloader.to_stdout(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1685
1686         def _real_initialize(self):
1687                 self._yahoo_ie.initialize()
1688         
1689         def _real_extract(self, query):
1690                 mobj = re.match(self._VALID_QUERY, query)
1691                 if mobj is None:
1692                         self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1693                         return
1694
1695                 prefix, query = query.split(':')
1696                 prefix = prefix[8:]
1697                 query  = query.encode('utf-8')
1698                 if prefix == '':
1699                         self._download_n_results(query, 1)
1700                         return
1701                 elif prefix == 'all':
1702                         self._download_n_results(query, self._max_yahoo_results)
1703                         return
1704                 else:
1705                         try:
1706                                 n = long(prefix)
1707                                 if n <= 0:
1708                                         self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1709                                         return
1710                                 elif n > self._max_yahoo_results:
1711                                         self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)'  % (self._max_yahoo_results, n))
1712                                         n = self._max_yahoo_results
1713                                 self._download_n_results(query, n)
1714                                 return
1715                         except ValueError: # parsing prefix as integer fails
1716                                 self._download_n_results(query, 1)
1717                                 return
1718
1719         def _download_n_results(self, query, n):
1720                 """Downloads a specified number of results for a query"""
1721
1722                 video_ids = []
1723                 already_seen = set()
1724                 pagenum = 1
1725
1726                 while True:
1727                         self.report_download_page(query, pagenum)
1728                         result_url = self._TEMPLATE_URL % (urllib.quote_plus(query), pagenum)
1729                         request = urllib2.Request(result_url, None, std_headers)
1730                         try:
1731                                 page = urllib2.urlopen(request).read()
1732                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1733                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1734                                 return
1735
1736                         # Extract video identifiers
1737                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1738                                 video_id = mobj.group(1)
1739                                 if video_id not in already_seen:
1740                                         video_ids.append(video_id)
1741                                         already_seen.add(video_id)
1742                                         if len(video_ids) == n:
1743                                                 # Specified n videos reached
1744                                                 for id in video_ids:
1745                                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1746                                                 return
1747
1748                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1749                                 for id in video_ids:
1750                                         self._yahoo_ie.extract('http://video.yahoo.com/watch/%s' % id)
1751                                 return
1752
1753                         pagenum = pagenum + 1
1754
1755 class YoutubePlaylistIE(InfoExtractor):
1756         """Information Extractor for YouTube playlists."""
1757
1758         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/(?:(?:view_play_list|my_playlists)\?.*?p=|user/.*?/user/)([^&]+).*'
1759         _TEMPLATE_URL = 'http://www.youtube.com/view_play_list?p=%s&page=%s&gl=US&hl=en'
1760         _VIDEO_INDICATOR = r'/watch\?v=(.+?)&'
1761         _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*</a>'
1762         _youtube_ie = None
1763
1764         def __init__(self, youtube_ie, downloader=None):
1765                 InfoExtractor.__init__(self, downloader)
1766                 self._youtube_ie = youtube_ie
1767         
1768         @staticmethod
1769         def suitable(url):
1770                 return (re.match(YoutubePlaylistIE._VALID_URL, url) is not None)
1771
1772         def report_download_page(self, playlist_id, pagenum):
1773                 """Report attempt to download playlist page with given number."""
1774                 self._downloader.to_stdout(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1775
1776         def _real_initialize(self):
1777                 self._youtube_ie.initialize()
1778         
1779         def _real_extract(self, url):
1780                 # Extract playlist id
1781                 mobj = re.match(self._VALID_URL, url)
1782                 if mobj is None:
1783                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1784                         return
1785
1786                 # Download playlist pages
1787                 playlist_id = mobj.group(1)
1788                 video_ids = []
1789                 pagenum = 1
1790
1791                 while True:
1792                         self.report_download_page(playlist_id, pagenum)
1793                         request = urllib2.Request(self._TEMPLATE_URL % (playlist_id, pagenum), None, std_headers)
1794                         try:
1795                                 page = urllib2.urlopen(request).read()
1796                         except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1797                                 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1798                                 return
1799
1800                         # Extract video identifiers
1801                         ids_in_page = []
1802                         for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1803                                 if mobj.group(1) not in ids_in_page:
1804                                         ids_in_page.append(mobj.group(1))
1805                         video_ids.extend(ids_in_page)
1806
1807                         if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1808                                 break
1809                         pagenum = pagenum + 1
1810
1811                 for id in video_ids:
1812                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1813                 return
1814
1815 class YoutubeUserIE(InfoExtractor):
1816         """Information Extractor for YouTube users."""
1817
1818         _VALID_URL = r'(?:http://)?(?:\w+\.)?youtube.com/user/(.*)'
1819         _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1820         _VIDEO_INDICATOR = r'http://gdata.youtube.com/feeds/api/videos/(.*)' # XXX Fix this.
1821         _youtube_ie = None
1822
1823         def __init__(self, youtube_ie, downloader=None):
1824                 InfoExtractor.__init__(self, downloader)
1825                 self._youtube_ie = youtube_ie
1826         
1827         @staticmethod
1828         def suitable(url):
1829                 return (re.match(YoutubeUserIE._VALID_URL, url) is not None)
1830
1831         def report_download_page(self, username):
1832                 """Report attempt to download user page."""
1833                 self._downloader.to_stdout(u'[youtube] user %s: Downloading page ' % (username))
1834
1835         def _real_initialize(self):
1836                 self._youtube_ie.initialize()
1837         
1838         def _real_extract(self, url):
1839                 # Extract username
1840                 mobj = re.match(self._VALID_URL, url)
1841                 if mobj is None:
1842                         self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1843                         return
1844
1845                 # Download user page
1846                 username = mobj.group(1)
1847                 video_ids = []
1848                 pagenum = 1
1849
1850                 self.report_download_page(username)
1851                 request = urllib2.Request(self._TEMPLATE_URL % (username), None, std_headers)
1852                 try:
1853                         page = urllib2.urlopen(request).read()
1854                 except (urllib2.URLError, httplib.HTTPException, socket.error), err:
1855                         self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1856                         return
1857
1858                 # Extract video identifiers
1859                 ids_in_page = []
1860
1861                 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1862                         if mobj.group(1) not in ids_in_page:
1863                                 ids_in_page.append(mobj.group(1))
1864                 video_ids.extend(ids_in_page)
1865
1866                 for id in video_ids:
1867                         self._youtube_ie.extract('http://www.youtube.com/watch?v=%s' % id)
1868                 return
1869
class PostProcessor(object):
        """Base class for post-download processing steps.

        Instances are registered on a downloader through its
        add_post_processor() method.  After every successful download the
        downloader walks its chain of PostProcessors, calling run() first
        with the download's information dictionary and then with whatever
        the previous processor returned.  The chain stops when a processor
        returns None or when its end is reached.

        Like InfoExtractor objects, PostProcessors and their downloader
        register with each other mutually.
        """

        _downloader = None

        def __init__(self, downloader=None):
                self._downloader = downloader

        def set_downloader(self, downloader):
                """Attach the downloader this postprocessor belongs to."""
                self._downloader = downloader
        
        def run(self, information):
                """Process one finished download.

                "information" is a dictionary shaped like the ones produced
                by InfoExtractors, plus an extra "filepath" key naming the
                downloaded file.

                Returning None stops the postprocessing chain; returning a
                (possibly modified) information dictionary passes it along
                to the next processor.  A PostProcessingError may be raised
                to signal failure to the calling downloader.
                """
                return information # the base class is a no-op
1915         
### MAIN PROGRAM ###
# Entry point: parse options, wire up the downloader and run it.  The
# enclosing try maps the known fatal exceptions to clean exit codes.
if __name__ == '__main__':
        try:
                # Modules needed only when running the main program
                import getpass
                import optparse
1922
1923                 # Function to update the program file with the latest version from bitbucket.org
1924                 def update_self(downloader, filename):
1925                         # Note: downloader only used for options
1926                         if not os.access (filename, os.W_OK):
1927                                 sys.exit('ERROR: no write permissions on %s' % filename)
1928
1929                         downloader.to_stdout('Updating to latest stable version...')
1930                         latest_url = 'http://bitbucket.org/rg3/youtube-dl/raw/tip/LATEST_VERSION'
1931                         latest_version = urllib.urlopen(latest_url).read().strip()
1932                         prog_url = 'http://bitbucket.org/rg3/youtube-dl/raw/%s/youtube-dl' % latest_version
1933                         newcontent = urllib.urlopen(prog_url).read()
1934                         stream = open(filename, 'w')
1935                         stream.write(newcontent)
1936                         stream.close()
1937                         downloader.to_stdout('Updated to version %s' % latest_version)
1938
                # General configuration
                # NOTE(review): the second install_opener replaces the opener
                # installed on the previous line -- presumably relying on
                # build_opener() adding the default handlers (including proxy
                # support) each time; verify proxies still work with cookies.
                urllib2.install_opener(urllib2.build_opener(urllib2.ProxyHandler()))
                urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor()))
                socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words)

                # Parse command line
                parser = optparse.OptionParser(
                        usage='Usage: %prog [options] url...',
                        version='2010.04.04',
                        conflict_handler='resolve',
                )

                parser.add_option('-h', '--help',
                                action='help', help='print this help text and exit')
                parser.add_option('-v', '--version',
                                action='version', help='print program version and exit')
                parser.add_option('-U', '--update',
                                action='store_true', dest='update_self', help='update this program to latest stable version')
                parser.add_option('-i', '--ignore-errors',
                                action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
                parser.add_option('-r', '--rate-limit',
                                dest='ratelimit', metavar='L', help='download rate limit (e.g. 50k or 44.6m)')

                authentication = optparse.OptionGroup(parser, 'Authentication Options')
                authentication.add_option('-u', '--username',
                                dest='username', metavar='UN', help='account username')
                authentication.add_option('-p', '--password',
                                dest='password', metavar='PW', help='account password')
                authentication.add_option('-n', '--netrc',
                                action='store_true', dest='usenetrc', help='use .netrc authentication data', default=False)
                parser.add_option_group(authentication)

                # All format flags share dest='format', so the last one given
                # wins.  NOTE(review): '0' (best quality) and '-1' (all
                # formats) look like sentinel codes interpreted by the YouTube
                # IE -- confirm against YoutubeIE._real_extract.
                video_format = optparse.OptionGroup(parser, 'Video Format Options')
                video_format.add_option('-f', '--format',
                                action='store', dest='format', metavar='FMT', help='video format code')
                video_format.add_option('-b', '--best-quality',
                                action='store_const', dest='format', help='download the best quality video possible', const='0')
                video_format.add_option('-m', '--mobile-version',
                                action='store_const', dest='format', help='alias for -f 17', const='17')
                video_format.add_option('-d', '--high-def',
                                action='store_const', dest='format', help='alias for -f 22', const='22')
                video_format.add_option('--all-formats',
                                action='store_const', dest='format', help='download all available video formats', const='-1')
                parser.add_option_group(video_format)

                verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
                verbosity.add_option('-q', '--quiet',
                                action='store_true', dest='quiet', help='activates quiet mode', default=False)
                verbosity.add_option('-s', '--simulate',
                                action='store_true', dest='simulate', help='do not download video', default=False)
                verbosity.add_option('-g', '--get-url',
                                action='store_true', dest='geturl', help='simulate, quiet but print URL', default=False)
                verbosity.add_option('-e', '--get-title',
                                action='store_true', dest='gettitle', help='simulate, quiet but print title', default=False)
                verbosity.add_option('--get-thumbnail',
                                action='store_true', dest='getthumbnail', help='simulate, quiet but print thumbnail URL', default=False)
                verbosity.add_option('--get-description',
                                action='store_true', dest='getdescription', help='simulate, quiet but print video description', default=False)
                verbosity.add_option('--no-progress',
                                action='store_true', dest='noprogress', help='do not print progress bar', default=False)
                parser.add_option_group(verbosity)

                filesystem = optparse.OptionGroup(parser, 'Filesystem Options')
                filesystem.add_option('-t', '--title',
                                action='store_true', dest='usetitle', help='use title in file name', default=False)
                filesystem.add_option('-l', '--literal',
                                action='store_true', dest='useliteral', help='use literal title in file name', default=False)
                filesystem.add_option('-o', '--output',
                                dest='outtmpl', metavar='TPL', help='output filename template')
                filesystem.add_option('-a', '--batch-file',
                                dest='batchfile', metavar='F', help='file containing URLs to download')
                filesystem.add_option('-w', '--no-overwrites',
                                action='store_true', dest='nooverwrites', help='do not overwrite files', default=False)
                filesystem.add_option('-c', '--continue',
                                action='store_true', dest='continue_dl', help='resume partially downloaded files', default=False)
                parser.add_option_group(filesystem)

                (opts, args) = parser.parse_args()

                # Batch file verification: URLs from the batch file come
                # before the ones given on the command line.
                batchurls = []
                if opts.batchfile is not None:
                        try:
                                batchurls = open(opts.batchfile, 'r').readlines()
                                batchurls = [x.strip() for x in batchurls]
                                batchurls = [x for x in batchurls if len(x) > 0]
                        except IOError:
                                sys.exit(u'ERROR: batch file could not be read')
                all_urls = batchurls + args

                # Conflicting, missing and erroneous options
                if opts.usenetrc and (opts.username is not None or opts.password is not None):
                        parser.error(u'using .netrc conflicts with giving username/password')
                if opts.password is not None and opts.username is None:
                        parser.error(u'account username missing')
                if opts.outtmpl is not None and (opts.useliteral or opts.usetitle):
                        parser.error(u'using output template conflicts with using title or literal title')
                if opts.usetitle and opts.useliteral:
                        parser.error(u'using title conflicts with using literal title')
                if opts.username is not None and opts.password is None:
                        # Username without password: prompt interactively so the
                        # password never has to appear on the command line.
                        opts.password = getpass.getpass(u'Type account password and press return:')
                if opts.ratelimit is not None:
                        numeric_limit = FileDownloader.parse_bytes(opts.ratelimit)
                        if numeric_limit is None:
                                parser.error(u'invalid rate limit specified')
                        opts.ratelimit = numeric_limit

                # Information extractors
                youtube_ie = YoutubeIE()
                metacafe_ie = MetacafeIE(youtube_ie)
                youtube_pl_ie = YoutubePlaylistIE(youtube_ie)
                youtube_user_ie = YoutubeUserIE(youtube_ie)
                youtube_search_ie = YoutubeSearchIE(youtube_ie)
                google_ie = GoogleIE()
                google_search_ie = GoogleSearchIE(google_ie)
                photobucket_ie = PhotobucketIE()
                yahoo_ie = YahooIE()
                yahoo_search_ie = YahooSearchIE(yahoo_ie)
                generic_ie = GenericIE()

                # File downloader
                fd = FileDownloader({
                        'usenetrc': opts.usenetrc,
                        'username': opts.username,
                        'password': opts.password,
                        # Any of the --get-* options implies quiet + simulate.
                        'quiet': (opts.quiet or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
                        'forceurl': opts.geturl,
                        'forcetitle': opts.gettitle,
                        'forcethumbnail': opts.getthumbnail,
                        'forcedescription': opts.getdescription,
                        'simulate': (opts.simulate or opts.geturl or opts.gettitle or opts.getthumbnail or opts.getdescription),
                        'format': opts.format,
                        # First-match-wins "or" chain: an explicit -o template
                        # always wins; with --all-formats ('-1') the format code
                        # is put in the file name so the downloaded variants do
                        # not overwrite each other; otherwise the title/literal/
                        # plain-id templates apply in that order.
                        'outtmpl': ((opts.outtmpl is not None and opts.outtmpl.decode(preferredencoding()))
                                or (opts.format == '-1' and opts.usetitle and u'%(stitle)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and opts.useliteral and u'%(title)s-%(id)s-%(format)s.%(ext)s')
                                or (opts.format == '-1' and u'%(id)s-%(format)s.%(ext)s')
                                or (opts.usetitle and u'%(stitle)s-%(id)s.%(ext)s')
                                or (opts.useliteral and u'%(title)s-%(id)s.%(ext)s')
                                or u'%(id)s.%(ext)s'),
                        'ignoreerrors': opts.ignoreerrors,
                        'ratelimit': opts.ratelimit,
                        'nooverwrites': opts.nooverwrites,
                        'continuedl': opts.continue_dl,
                        'noprogress': opts.noprogress,
                        })
                # Registration order matters: more specific IEs first, since
                # the downloader uses the first suitable one.
                fd.add_info_extractor(youtube_search_ie)
                fd.add_info_extractor(youtube_pl_ie)
                fd.add_info_extractor(youtube_user_ie)
                fd.add_info_extractor(metacafe_ie)
                fd.add_info_extractor(youtube_ie)
                fd.add_info_extractor(google_ie)
                fd.add_info_extractor(google_search_ie)
                fd.add_info_extractor(photobucket_ie)
                fd.add_info_extractor(yahoo_ie)
                fd.add_info_extractor(yahoo_search_ie)

                # This must come last since it's the
                # fallback if none of the others work
                fd.add_info_extractor(generic_ie)

                # Update version
                if opts.update_self:
                        update_self(fd, sys.argv[0])

                # Maybe do nothing: -U with no URLs is a valid invocation.
                if len(all_urls) < 1:
                        if not opts.update_self:
                                parser.error(u'you must provide at least one URL')
                        else:
                                sys.exit()
                retcode = fd.download(all_urls)
                sys.exit(retcode)

        except DownloadError:
                sys.exit(1)
        except SameFileError:
                sys.exit(u'ERROR: fixed output name but more than one file to download')
        except KeyboardInterrupt:
                sys.exit(u'\nERROR: Interrupted by user')