_ Git - youtube-dl/blob - youtube_dl/YoutubeDL.py

   1 #!/usr/bin/env python
   2 # -*- coding: utf-8 -*-
   3
   4 from __future__ import absolute_import
   5
   6 import io
   7 import os
   8 import re
   9 import shutil
  10 import socket
  11 import sys
  12 import time
  13 import traceback
  14
  15 from .utils import *
  16 from .extractor import get_info_extractor, gen_extractors
  17 from .FileDownloader import FileDownloader
  18
  19
  20 class YoutubeDL(object):
  21     """YoutubeDL class.
  22
  23     YoutubeDL objects are the ones responsible of downloading the
  24     actual video file and writing it to disk if the user has requested
  25     it, among some other tasks. In most cases there should be one per
  26     program. As, given a video URL, the downloader doesn't know how to
  27     extract all the needed information, task that InfoExtractors do, it
  28     has to pass the URL to one of them.
  29
  30     For this, YoutubeDL objects have a method that allows
  31     InfoExtractors to be registered in a given order. When it is passed
  32     a URL, the YoutubeDL object handles it to the first InfoExtractor it
  33     finds that reports being able to handle it. The InfoExtractor extracts
  34     all the information about the video or videos the URL refers to, and
  35     YoutubeDL process the extracted information, possibly using a File
  36     Downloader to download the video.
  37
  38     YoutubeDL objects accept a lot of parameters. In order not to saturate
  39     the object constructor with arguments, it receives a dictionary of
  40     options instead. These options are available through the params
  41     attribute for the InfoExtractors to use. The YoutubeDL also
  42     registers itself as the downloader in charge for the InfoExtractors
  43     that are added to it, so this is a "mutual registration".
  44
  45     Available options:
  46
  47     username:          Username for authentication purposes.
  48     password:          Password for authentication purposes.
  49     videopassword:     Password for acces a video.
  50     usenetrc:          Use netrc for authentication instead.
  51     verbose:           Print additional info to stdout.
  52     quiet:             Do not print messages to stdout.
  53     forceurl:          Force printing final URL.
  54     forcetitle:        Force printing title.
  55     forceid:           Force printing ID.
  56     forcethumbnail:    Force printing thumbnail URL.
  57     forcedescription:  Force printing description.
  58     forcefilename:     Force printing final filename.
  59     simulate:          Do not download the video files.
  60     format:            Video format code.
  61     format_limit:      Highest quality format to try.
  62     outtmpl:           Template for output names.
  63     restrictfilenames: Do not allow "&" and spaces in file names
  64     ignoreerrors:      Do not stop on download errors.
  65     nooverwrites:      Prevent overwriting files.
  66     playliststart:     Playlist item to start at.
  67     playlistend:       Playlist item to end at.
  68     matchtitle:        Download only matching titles.
  69     rejecttitle:       Reject downloads for matching titles.
  70     logtostderr:       Log messages to stderr instead of stdout.
  71     writedescription:  Write the video description to a .description file
  72     writeinfojson:     Write the video description to a .info.json file
  73     writethumbnail:    Write the thumbnail image to a file
  74     writesubtitles:    Write the video subtitles to a file
  75     writeautomaticsub: Write the automatic subtitles to a file
  76     allsubtitles:      Downloads all the subtitles of the video
  77     listsubtitles:     Lists all available subtitles for the video
  78     subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
  79     subtitleslang:     Language of the subtitles to download
  80     keepvideo:         Keep the video file after post-processing
  81     daterange:         A DateRange object, download only if the upload_date is in the range.
  82     skip_download:     Skip the actual download of the video file
  83
  84     The following parameters are not used by YoutubeDL itself, they are used by
  85     the FileDownloader:
  86     nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
  87     noresizebuffer, retries, continuedl, noprogress, consoletitle
  88     """
  89
  90     params = None
  91     _ies = []
  92     _pps = []
  93     _download_retcode = None
  94     _num_downloads = None
  95     _screen_file = None
  96
  97     def __init__(self, params):
  98         """Create a FileDownloader object with the given options."""
  99         self._ies = []
 100         self._pps = []
 101         self._progress_hooks = []
 102         self._download_retcode = 0
 103         self._num_downloads = 0
 104         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
 105         self.params = params
 106         self.fd = FileDownloader(self, self.params)
 107
 108         if '%(stitle)s' in self.params['outtmpl']:
 109             self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
 110
 111     def add_info_extractor(self, ie):
 112         """Add an InfoExtractor object to the end of the list."""
 113         self._ies.append(ie)
 114         ie.set_downloader(self)
 115
 116     def add_default_info_extractors(self):
 117         """
 118         Add the InfoExtractors returned by gen_extractors to the end of the list
 119         """
 120         for ie in gen_extractors():
 121             self.add_info_extractor(ie)
 122
 123     def add_post_processor(self, pp):
 124         """Add a PostProcessor object to the end of the chain."""
 125         self._pps.append(pp)
 126         pp.set_downloader(self)
 127
 128     def to_screen(self, message, skip_eol=False):
 129         """Print message to stdout if not in quiet mode."""
 130         assert type(message) == type(u'')
 131         if not self.params.get('quiet', False):
 132             terminator = [u'\n', u''][skip_eol]
 133             output = message + terminator
 134             if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
 135                 output = output.encode(preferredencoding(), 'ignore')
 136             self._screen_file.write(output)
 137             self._screen_file.flush()
 138
 139     def to_stderr(self, message):
 140         """Print message to stderr."""
 141         assert type(message) == type(u'')
 142         output = message + u'\n'
 143         if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
 144             output = output.encode(preferredencoding())
 145         sys.stderr.write(output)
 146
 147     def fixed_template(self):
 148         """Checks if the output template is fixed."""
 149         return (re.search(u'(?u)%\\(.+?\\)s', self.params['outtmpl']) is None)
 150
 151     def trouble(self, message=None, tb=None):
 152         """Determine action to take when a download problem appears.
 153
 154         Depending on if the downloader has been configured to ignore
 155         download errors or not, this method may throw an exception or
 156         not when errors are found, after printing the message.
 157
 158         tb, if given, is additional traceback information.
 159         """
 160         if message is not None:
 161             self.to_stderr(message)
 162         if self.params.get('verbose'):
 163             if tb is None:
 164                 if sys.exc_info()[0]:  # if .trouble has been called from an except block
 165                     tb = u''
 166                     if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 167                         tb += u''.join(traceback.format_exception(*sys.exc_info()[1].exc_info))
 168                     tb += compat_str(traceback.format_exc())
 169                 else:
 170                     tb_data = traceback.format_list(traceback.extract_stack())
 171                     tb = u''.join(tb_data)
 172             self.to_stderr(tb)
 173         if not self.params.get('ignoreerrors', False):
 174             if sys.exc_info()[0] and hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]:
 175                 exc_info = sys.exc_info()[1].exc_info
 176             else:
 177                 exc_info = sys.exc_info()
 178             raise DownloadError(message, exc_info)
 179         self._download_retcode = 1
 180
 181     def report_warning(self, message):
 182         '''
 183         Print the message to stderr, it will be prefixed with 'WARNING:'
 184         If stderr is a tty file the 'WARNING:' will be colored
 185         '''
 186         if sys.stderr.isatty() and os.name != 'nt':
 187             _msg_header=u'\033[0;33mWARNING:\033[0m'
 188         else:
 189             _msg_header=u'WARNING:'
 190         warning_message=u'%s %s' % (_msg_header,message)
 191         self.to_stderr(warning_message)
 192
 193     def report_error(self, message, tb=None):
 194         '''
 195         Do the same as trouble, but prefixes the message with 'ERROR:', colored
 196         in red if stderr is a tty file.
 197         '''
 198         if sys.stderr.isatty() and os.name != 'nt':
 199             _msg_header = u'\033[0;31mERROR:\033[0m'
 200         else:
 201             _msg_header = u'ERROR:'
 202         error_message = u'%s %s' % (_msg_header, message)
 203         self.trouble(error_message, tb)
 204
 205     def slow_down(self, start_time, byte_counter):
 206         """Sleep if the download speed is over the rate limit."""
 207         rate_limit = self.params.get('ratelimit', None)
 208         if rate_limit is None or byte_counter == 0:
 209             return
 210         now = time.time()
 211         elapsed = now - start_time
 212         if elapsed <= 0.0:
 213             return
 214         speed = float(byte_counter) / elapsed
 215         if speed > rate_limit:
 216             time.sleep((byte_counter - rate_limit * (now - start_time)) / rate_limit)
 217
 218     def report_writedescription(self, descfn):
 219         """ Report that the description file is being written """
 220         self.to_screen(u'[info] Writing video description to: ' + descfn)
 221
 222     def report_writesubtitles(self, sub_filename):
 223         """ Report that the subtitles file is being written """
 224         self.to_screen(u'[info] Writing subtitle: ' + sub_filename)
 225
 226     def report_existingsubtitles(self, sub_filename):
 227         """ Report that the subtitles file has been already written """
 228         self.to_screen(u'[info] Skipping existing subtitle: ' + sub_filename)
 229
 230     def report_writeinfojson(self, infofn):
 231         """ Report that the metadata file has been written """
 232         self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
 233
 234     def increment_downloads(self):
 235         """Increment the ordinal that assigns a number to each file."""
 236         self._num_downloads += 1
 237
 238     def prepare_filename(self, info_dict):
 239         """Generate the output filename."""
 240         try:
 241             template_dict = dict(info_dict)
 242
 243             template_dict['epoch'] = int(time.time())
 244             autonumber_size = self.params.get('autonumber_size')
 245             if autonumber_size is None:
 246                 autonumber_size = 5
 247             autonumber_templ = u'%0' + str(autonumber_size) + u'd'
 248             template_dict['autonumber'] = autonumber_templ % self._num_downloads
 249             if template_dict['playlist_index'] is not None:
 250                 template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']
 251
 252             sanitize = lambda k,v: sanitize_filename(
 253                 u'NA' if v is None else compat_str(v),
 254                 restricted=self.params.get('restrictfilenames'),
 255                 is_id=(k==u'id'))
 256             template_dict = dict((k, sanitize(k, v)) for k,v in template_dict.items())
 257
 258             filename = self.params['outtmpl'] % template_dict
 259             return filename
 260         except KeyError as err:
 261             self.report_error(u'Erroneous output template')
 262             return None
 263         except ValueError as err:
 264             self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
 265             return None
 266
 267     def _match_entry(self, info_dict):
 268         """ Returns None iff the file should be downloaded """
 269
 270         title = info_dict['title']
 271         matchtitle = self.params.get('matchtitle', False)
 272         if matchtitle:
 273             if not re.search(matchtitle, title, re.IGNORECASE):
 274                 return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
 275         rejecttitle = self.params.get('rejecttitle', False)
 276         if rejecttitle:
 277             if re.search(rejecttitle, title, re.IGNORECASE):
 278                 return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
 279         date = info_dict.get('upload_date', None)
 280         if date is not None:
 281             dateRange = self.params.get('daterange', DateRange())
 282             if date not in dateRange:
 283                 return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
 284         return None
 285
 286     def extract_info(self, url, download=True, ie_key=None, extra_info={}):
 287         '''
 288         Returns a list with a dictionary for each video we find.
 289         If 'download', also downloads the videos.
 290         extra_info is a dict containing the extra values to add to each result
 291          '''
 292
 293         if ie_key:
 294             ie = get_info_extractor(ie_key)()
 295             ie.set_downloader(self)
 296             ies = [ie]
 297         else:
 298             ies = self._ies
 299
 300         for ie in ies:
 301             if not ie.suitable(url):
 302                 continue
 303
 304             if not ie.working():
 305                 self.report_warning(u'The program functionality for this site has been marked as broken, '
 306                                     u'and will probably not work.')
 307
 308             try:
 309                 ie_result = ie.extract(url)
 310                 if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
 311                     break
 312                 if isinstance(ie_result, list):
 313                     # Backwards compatibility: old IE result format
 314                     for result in ie_result:
 315                         result.update(extra_info)
 316                     ie_result = {
 317                         '_type': 'compat_list',
 318                         'entries': ie_result,
 319                     }
 320                 else:
 321                     ie_result.update(extra_info)
 322                 if 'extractor' not in ie_result:
 323                     ie_result['extractor'] = ie.IE_NAME
 324                 return self.process_ie_result(ie_result, download=download)
 325             except ExtractorError as de: # An error we somewhat expected
 326                 self.report_error(compat_str(de), de.format_traceback())
 327                 break
 328             except Exception as e:
 329                 if self.params.get('ignoreerrors', False):
 330                     self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
 331                     break
 332                 else:
 333                     raise
 334         else:
 335             self.report_error(u'no suitable InfoExtractor: %s' % url)
 336
 337     def process_ie_result(self, ie_result, download=True, extra_info={}):
 338         """
 339         Take the result of the ie(may be modified) and resolve all unresolved
 340         references (URLs, playlist items).
 341
 342         It will also download the videos if 'download'.
 343         Returns the resolved ie_result.
 344         """
 345
 346         result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
 347         if result_type == 'video':
 348             ie_result.update(extra_info)
 349             if 'playlist' not in ie_result:
 350                 # It isn't part of a playlist
 351                 ie_result['playlist'] = None
 352                 ie_result['playlist_index'] = None
 353             if download:
 354                 self.process_info(ie_result)
 355             return ie_result
 356         elif result_type == 'url':
 357             # We have to add extra_info to the results because it may be
 358             # contained in a playlist
 359             return self.extract_info(ie_result['url'],
 360                                      download,
 361                                      ie_key=ie_result.get('ie_key'),
 362                                      extra_info=extra_info)
 363         elif result_type == 'playlist':
 364             # We process each entry in the playlist
 365             playlist = ie_result.get('title', None) or ie_result.get('id', None)
 366             self.to_screen(u'[download] Downloading playlist: %s'  % playlist)
 367
 368             playlist_results = []
 369
 370             n_all_entries = len(ie_result['entries'])
 371             playliststart = self.params.get('playliststart', 1) - 1
 372             playlistend = self.params.get('playlistend', -1)
 373
 374             if playlistend == -1:
 375                 entries = ie_result['entries'][playliststart:]
 376             else:
 377                 entries = ie_result['entries'][playliststart:playlistend]
 378
 379             n_entries = len(entries)
 380
 381             self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
 382                 (ie_result['extractor'], playlist, n_all_entries, n_entries))
 383
 384             for i,entry in enumerate(entries,1):
 385                 self.to_screen(u'[download] Downloading video #%s of %s' %(i, n_entries))
 386                 extra = {
 387                          'playlist': playlist,
 388                          'playlist_index': i + playliststart,
 389                          }
 390                 if not 'extractor' in entry:
 391                     # We set the extractor, if it's an url it will be set then to
 392                     # the new extractor, but if it's already a video we must make
 393                     # sure it's present: see issue #877
 394                     entry['extractor'] = ie_result['extractor']
 395                 entry_result = self.process_ie_result(entry,
 396                                                       download=download,
 397                                                       extra_info=extra)
 398                 playlist_results.append(entry_result)
 399             ie_result['entries'] = playlist_results
 400             return ie_result
 401         elif result_type == 'compat_list':
 402             def _fixup(r):
 403                 r.setdefault('extractor', ie_result['extractor'])
 404                 return r
 405             ie_result['entries'] = [
 406                 self.process_ie_result(_fixup(r), download=download)
 407                 for r in ie_result['entries']
 408             ]
 409             return ie_result
 410         else:
 411             raise Exception('Invalid result type: %s' % result_type)
 412
 413     def process_info(self, info_dict):
 414         """Process a single resolved IE result."""
 415
 416         assert info_dict.get('_type', 'video') == 'video'
 417         #We increment the download the download count here to match the previous behaviour.
 418         self.increment_downloads()
 419
 420         info_dict['fulltitle'] = info_dict['title']
 421         if len(info_dict['title']) > 200:
 422             info_dict['title'] = info_dict['title'][:197] + u'...'
 423
 424         # Keep for backwards compatibility
 425         info_dict['stitle'] = info_dict['title']
 426
 427         if not 'format' in info_dict:
 428             info_dict['format'] = info_dict['ext']
 429
 430         reason = self._match_entry(info_dict)
 431         if reason is not None:
 432             self.to_screen(u'[download] ' + reason)
 433             return
 434
 435         max_downloads = self.params.get('max_downloads')
 436         if max_downloads is not None:
 437             if self._num_downloads > int(max_downloads):
 438                 raise MaxDownloadsReached()
 439
 440         filename = self.prepare_filename(info_dict)
 441
 442         # Forced printings
 443         if self.params.get('forcetitle', False):
 444             compat_print(info_dict['title'])
 445         if self.params.get('forceid', False):
 446             compat_print(info_dict['id'])
 447         if self.params.get('forceurl', False):
 448             compat_print(info_dict['url'])
 449         if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
 450             compat_print(info_dict['thumbnail'])
 451         if self.params.get('forcedescription', False) and 'description' in info_dict:
 452             compat_print(info_dict['description'])
 453         if self.params.get('forcefilename', False) and filename is not None:
 454             compat_print(filename)
 455         if self.params.get('forceformat', False):
 456             compat_print(info_dict['format'])
 457
 458         # Do nothing else if in simulate mode
 459         if self.params.get('simulate', False):
 460             return
 461
 462         if filename is None:
 463             return
 464
 465         try:
 466             dn = os.path.dirname(encodeFilename(filename))
 467             if dn != '' and not os.path.exists(dn):
 468                 os.makedirs(dn)
 469         except (OSError, IOError) as err:
 470             self.report_error(u'unable to create directory ' + compat_str(err))
 471             return
 472
 473         if self.params.get('writedescription', False):
 474             try:
 475                 descfn = filename + u'.description'
 476                 self.report_writedescription(descfn)
 477                 with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
 478                     descfile.write(info_dict['description'])
 479             except (OSError, IOError):
 480                 self.report_error(u'Cannot write description file ' + descfn)
 481                 return
 482
 483         subtitles_are_requested = any([self.params.get('writesubtitles', False),
 484                                        self.params.get('writeautomaticsub'),
 485                                        self.params.get('allsubtitles', False)])
 486
 487         if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
 488             # subtitles download errors are already managed as troubles in relevant IE
 489             # that way it will silently go on when used with unsupporting IE
 490             subtitles = info_dict['subtitles']
 491             sub_format = self.params.get('subtitlesformat')
 492
 493             for sub_lang in subtitles.keys():
 494                 sub = subtitles[sub_lang]
 495                 if sub is None:
 496                     continue
 497                 try:
 498                     sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 499                     if os.path.isfile(encodeFilename(sub_filename)):
 500                         self.report_existingsubtitles(sub_filename)
 501                         continue
 502                     self.report_writesubtitles(sub_filename)
 503                     with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 504                             subfile.write(sub)
 505                 except (OSError, IOError):
 506                     self.report_error(u'Cannot write subtitles file ' + descfn)
 507                     return
 508
 509         if self.params.get('writeinfojson', False):
 510             infofn = filename + u'.info.json'
 511             self.report_writeinfojson(infofn)
 512             try:
 513                 json_info_dict = dict((k, v) for k,v in info_dict.items() if not k in ['urlhandle'])
 514                 write_json_file(json_info_dict, encodeFilename(infofn))
 515             except (OSError, IOError):
 516                 self.report_error(u'Cannot write metadata to JSON file ' + infofn)
 517                 return
 518
 519         if self.params.get('writethumbnail', False):
 520             if info_dict.get('thumbnail') is not None:
 521                 thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
 522                 thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
 523                 self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
 524                                (info_dict['extractor'], info_dict['id']))
 525                 uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
 526                 with open(thumb_filename, 'wb') as thumbf:
 527                     shutil.copyfileobj(uf, thumbf)
 528                 self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
 529                                (info_dict['extractor'], info_dict['id'], thumb_filename))
 530
 531         if not self.params.get('skip_download', False):
 532             if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
 533                 success = True
 534             else:
 535                 try:
 536                     success = self.fd._do_download(filename, info_dict)
 537                 except (OSError, IOError) as err:
 538                     raise UnavailableVideoError(err)
 539                 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 540                     self.report_error(u'unable to download video data: %s' % str(err))
 541                     return
 542                 except (ContentTooShortError, ) as err:
 543                     self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
 544                     return
 545
 546             if success:
 547                 try:
 548                     self.post_process(filename, info_dict)
 549                 except (PostProcessingError) as err:
 550                     self.report_error(u'postprocessing: %s' % str(err))
 551                     return
 552
 553     def download(self, url_list):
 554         """Download a given list of URLs."""
 555         if len(url_list) > 1 and self.fixed_template():
 556             raise SameFileError(self.params['outtmpl'])
 557
 558         for url in url_list:
 559             try:
 560                 #It also downloads the videos
 561                 videos = self.extract_info(url)
 562             except UnavailableVideoError:
 563                 self.report_error(u'unable to download video')
 564             except MaxDownloadsReached:
 565                 self.to_screen(u'[info] Maximum number of downloaded files reached.')
 566                 raise
 567
 568         return self._download_retcode
 569
 570     def post_process(self, filename, ie_info):
 571         """Run all the postprocessors on the given file."""
 572         info = dict(ie_info)
 573         info['filepath'] = filename
 574         keep_video = None
 575         for pp in self._pps:
 576             try:
 577                 keep_video_wish,new_info = pp.run(info)
 578                 if keep_video_wish is not None:
 579                     if keep_video_wish:
 580                         keep_video = keep_video_wish
 581                     elif keep_video is None:
 582                         # No clear decision yet, let IE decide
 583                         keep_video = keep_video_wish
 584             except PostProcessingError as e:
 585                 self.report_error(e.msg)
 586         if keep_video is False and not self.params.get('keepvideo', False):
 587             try:
 588                 self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
 589                 os.remove(encodeFilename(filename))
 590             except (IOError, OSError):
 591                 self.report_warning(u'Unable to remove downloaded video file')