2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
17 from .extractor import get_info_extractor, gen_extractors
18 from .FileDownloader import FileDownloader
21 class YoutubeDL(object):
24 YoutubeDL objects are the ones responsible for downloading the
25 actual video file and writing it to disk if the user has requested
26 it, among some other tasks. In most cases there should be one per
27 program. As, given a video URL, the downloader doesn't know how to
28 extract all the needed information, task that InfoExtractors do, it
29 has to pass the URL to one of them.
31 For this, YoutubeDL objects have a method that allows
32 InfoExtractors to be registered in a given order. When it is passed
33 a URL, the YoutubeDL object hands it to the first InfoExtractor it
34 finds that reports being able to handle it. The InfoExtractor extracts
35 all the information about the video or videos the URL refers to, and
36 YoutubeDL process the extracted information, possibly using a File
37 Downloader to download the video.
39 YoutubeDL objects accept a lot of parameters. In order not to saturate
40 the object constructor with arguments, it receives a dictionary of
41 options instead. These options are available through the params
42 attribute for the InfoExtractors to use. The YoutubeDL also
43 registers itself as the downloader in charge for the InfoExtractors
44 that are added to it, so this is a "mutual registration".
48 username: Username for authentication purposes.
49 password: Password for authentication purposes.
50 videopassword: Password for accessing a video.
51 usenetrc: Use netrc for authentication instead.
52 verbose: Print additional info to stdout.
53 quiet: Do not print messages to stdout.
54 forceurl: Force printing final URL.
55 forcetitle: Force printing title.
56 forceid: Force printing ID.
57 forcethumbnail: Force printing thumbnail URL.
58 forcedescription: Force printing description.
59 forcefilename: Force printing final filename.
60 simulate: Do not download the video files.
61 format: Video format code.
62 format_limit: Highest quality format to try.
63 outtmpl: Template for output names.
64 restrictfilenames: Do not allow "&" and spaces in file names
65 ignoreerrors: Do not stop on download errors.
66 nooverwrites: Prevent overwriting files.
67 playliststart: Playlist item to start at.
68 playlistend: Playlist item to end at.
69 matchtitle: Download only matching titles.
70 rejecttitle: Reject downloads for matching titles.
71 logtostderr: Log messages to stderr instead of stdout.
72 writedescription: Write the video description to a .description file
73 writeinfojson: Write the video description to a .info.json file
74 writeannotations: Write the video annotations to a .annotations.xml file
75 writethumbnail: Write the thumbnail image to a file
76 writesubtitles: Write the video subtitles to a file
77 writeautomaticsub: Write the automatic subtitles to a file
78 allsubtitles: Downloads all the subtitles of the video
79 (requires writesubtitles or writeautomaticsub)
80 listsubtitles: Lists all available subtitles for the video
81 subtitlesformat: Subtitle format [srt/sbv/vtt] (default=srt)
82 subtitleslangs: List of languages of the subtitles to download
83 keepvideo: Keep the video file after post-processing
84 daterange: A DateRange object, download only if the upload_date is in the range.
85 skip_download: Skip the actual download of the video file
86 cachedir: Location of the cache files in the filesystem.
87 None to disable filesystem cache.
88 noplaylist: Download single video instead of a playlist if in doubt.
89 age_limit: An integer representing the user's age in years.
90 Unsuitable videos for the given age are skipped.
91 download_archive: File name of a file where all downloads are recorded.
92 Videos already present in the file are not downloaded
95 The following parameters are not used by YoutubeDL itself, they are used by
97 nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test,
98 noresizebuffer, retries, continuedl, noprogress, consoletitle
104 _download_retcode = None
105 _num_downloads = None
def __init__(self, params):
    """Create a YoutubeDL object with the given options.

    params is the options dictionary; it is stored as self.params and
    handed to the FileDownloader created here.  It may be modified:
    'restrictfilenames' is switched on when the file system encoding
    cannot represent arbitrary file names.
    """
    self._ies = []
    self._ies_instances = {}
    self._pps = []
    self._progress_hooks = []
    self._download_retcode = 0
    self._num_downloads = 0
    self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]

    # Optional keys are read with .get() so that a minimal params dict
    # does not make the constructor fail with a KeyError.
    if (sys.version_info >= (3,) and sys.platform != 'win32' and
            sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
            and not params.get('restrictfilenames', False)):
        # On Python 3, the Unicode filesystem API will throw errors (#1474)
        self.report_warning(
            u'Assuming --restrict-filenames since file system encoding '
            u'cannot encode all charactes. '
            u'Set the LC_ALL environment variable to fix this.')
        params['restrictfilenames'] = True

    self.params = params
    self.fd = FileDownloader(self, self.params)

    if '%(stitle)s' in self.params.get('outtmpl', u''):
        self.report_warning(u'%(stitle)s is deprecated. Use the %(title)s and the --restrict-filenames flag(which also secures %(uploader)s et al) instead.')
def add_info_extractor(self, ie):
    """Add an InfoExtractor object to the end of the list.

    The extractor is also indexed under its ie_key() so it can be looked
    up by name, and this object is registered as its downloader.
    """
    self._ies.append(ie)
    self._ies_instances[ie.ie_key()] = ie
    ie.set_downloader(self)
def get_info_extractor(self, ie_key):
    """
    Get an instance of an IE with name ie_key, it will try to get one from
    the _ies list, if there's no instance it will create a new one and add
    it to the extractor list.

    Returns the (cached or newly created) extractor instance.
    """
    ie = self._ies_instances.get(ie_key)
    if ie is None:
        # Nothing registered under that key yet: instantiate and register.
        ie = get_info_extractor(ie_key)()
        self.add_info_extractor(ie)
    return ie
def add_default_info_extractors(self):
    """
    Register every InfoExtractor returned by gen_extractors, keeping the
    order in which they are produced.
    """
    for extractor in gen_extractors():
        self.add_info_extractor(extractor)
def add_post_processor(self, pp):
    """Add a PostProcessor object to the end of the chain.

    This object is also registered as the post-processor's downloader.
    """
    self._pps.append(pp)
    pp.set_downloader(self)
def to_screen(self, message, skip_eol=False):
    """Write message to the screen file unless the 'quiet' option is set.

    A newline is appended unless skip_eol is true.
    """
    if self.params.get('quiet', False):
        return
    line_end = (u'\n', u'')[skip_eol]
    write_string(message + line_end, self._screen_file)
def to_stderr(self, message):
    """Print message to stderr.

    message must be a unicode string; a newline is appended before it is
    written to sys.stderr.
    """
    assert isinstance(message, type(u''))
    output = message + u'\n'
    # The text always goes to sys.stderr, so it is sys.stderr's mode that
    # decides whether bytes are needed (not the mode of the screen file).
    # Python 2 lies about the mode of sys.stdout/sys.stderr, so encode there.
    if 'b' in getattr(sys.stderr, 'mode', '') or sys.version_info[0] < 3:
        output = output.encode(preferredencoding())
    sys.stderr.write(output)
def fixed_template(self):
    """Tell whether the output template contains no %(name)s field.

    Returns True when self.params['outtmpl'] has no such placeholder.
    """
    template = self.params['outtmpl']
    placeholder = re.search(u'(?u)%\\(.+?\\)s', template)
    return placeholder is None
def trouble(self, message=None, tb=None):
    """Determine action to take when a download problem appears.

    The message, if any, is printed to stderr first.  In verbose mode a
    traceback is printed as well: tb if given, otherwise one built from
    the exception being handled (or from the current stack when there is
    none).  Unless the 'ignoreerrors' option is set a DownloadError is
    raised; otherwise the return code is set to 1.
    """
    def nested_exc_info():
        # exc_info carried by the exception being handled, if it has one
        current = sys.exc_info()
        if current[0] and hasattr(current[1], 'exc_info') and current[1].exc_info[0]:
            return current[1].exc_info
        return None

    if message is not None:
        self.to_stderr(message)

    if self.params.get('verbose'):
        if tb is None:
            if sys.exc_info()[0]:  # if .trouble has been called from an except block
                parts = []
                inner = nested_exc_info()
                if inner is not None:
                    parts.append(u''.join(traceback.format_exception(*inner)))
                parts.append(compat_str(traceback.format_exc()))
                tb = u''.join(parts)
            else:
                tb = u''.join(traceback.format_list(traceback.extract_stack()))
        self.to_stderr(tb)

    if not self.params.get('ignoreerrors', False):
        exc_info = nested_exc_info()
        if exc_info is None:
            exc_info = sys.exc_info()
        raise DownloadError(message, exc_info)
    self._download_retcode = 1
def report_warning(self, message):
    """
    Print the message to stderr prefixed with 'WARNING:'.
    The prefix is colored when stderr is a tty and the OS is not Windows.
    """
    colorize = sys.stderr.isatty() and os.name != 'nt'
    _msg_header = u'\033[0;33mWARNING:\033[0m' if colorize else u'WARNING:'
    self.to_stderr(u'%s %s' % (_msg_header, message))
def report_error(self, message, tb=None):
    """
    Hand the message to trouble() prefixed with 'ERROR:'.
    The prefix is colored in red when stderr is a tty and the OS is not
    Windows.  tb is passed through to trouble() unchanged.
    """
    colorize = sys.stderr.isatty() and os.name != 'nt'
    _msg_header = u'\033[0;31mERROR:\033[0m' if colorize else u'ERROR:'
    self.trouble(u'%s %s' % (_msg_header, message), tb)
def slow_down(self, start_time, byte_counter):
    """Sleep if the download speed is over the rate limit.

    Nothing happens when no 'ratelimit' option is set, when no bytes were
    transferred yet, or when no time has elapsed since start_time.
    """
    limit = self.params.get('ratelimit', None)
    if limit is None or byte_counter == 0:
        return
    now = time.time()
    elapsed = now - start_time
    if elapsed <= 0.0:
        return
    if float(byte_counter) / elapsed > limit:
        # Sleep for the time the transfer is ahead of the allowed rate.
        time.sleep((byte_counter - limit * elapsed) / limit)
def report_writedescription(self, descfn):
    """Tell the user that the description is being written to descfn."""
    notice = u'[info] Writing video description to: ' + descfn
    self.to_screen(notice)
def report_writesubtitles(self, sub_filename):
    """Tell the user that subtitles are being written to sub_filename."""
    notice = u'[info] Writing video subtitles to: ' + sub_filename
    self.to_screen(notice)
def report_writeinfojson(self, infofn):
    """ Report that the metadata file is being written """
    # Worded like the sibling report_write* messages ("Writing ... to:").
    self.to_screen(u'[info] Writing video description metadata as JSON to: ' + infofn)
def report_writeannotations(self, annofn):
    """Tell the user that the annotations are being written to annofn."""
    notice = u'[info] Writing video annotations to: ' + annofn
    self.to_screen(notice)
def report_file_already_downloaded(self, file_name):
    """Tell the user that file_name has already been fully downloaded.

    Falls back to a message without the name if it cannot be encoded.
    """
    named_notice = u'[download] %s has already been downloaded' % file_name
    try:
        self.to_screen(named_notice)
    except UnicodeEncodeError:
        self.to_screen(u'[download] The file has already been downloaded')
def increment_downloads(self):
    """Advance the counter that numbers the downloaded files."""
    self._num_downloads = self._num_downloads + 1
def prepare_filename(self, info_dict):
    """Generate the output filename.

    The 'outtmpl' option is filled with the sanitized values of info_dict
    plus 'epoch' (current time) and 'autonumber' (download counter).
    Returns the filename, or None after reporting an error when the
    template cannot be expanded.
    """
    try:
        template_dict = dict(info_dict)

        template_dict['epoch'] = int(time.time())
        autonumber_size = self.params.get('autonumber_size')
        if autonumber_size is None:
            autonumber_size = 5
        autonumber_templ = u'%0' + str(autonumber_size) + u'd'
        template_dict['autonumber'] = autonumber_templ % self._num_downloads
        # .get(): an entry without 'playlist_index' must not be reported
        # as an erroneous template through the KeyError handler below.
        if template_dict.get('playlist_index') is not None:
            template_dict['playlist_index'] = u'%05d' % template_dict['playlist_index']

        restricted = self.params.get('restrictfilenames')

        def sanitize(key, value):
            return sanitize_filename(
                u'NA' if value is None else compat_str(value),
                restricted=restricted,
                is_id=(key == u'id'))

        template_dict = dict((k, sanitize(k, v)) for k, v in template_dict.items())

        return self.params['outtmpl'] % template_dict
    except KeyError:
        self.report_error(u'Erroneous output template')
        return None
    except ValueError as err:
        self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
        return None
def _match_entry(self, info_dict):
    """ Returns None iff the file should be downloaded

    Otherwise returns a human-readable reason for skipping it.  The
    reason carries no '[download] ' prefix: process_info adds that prefix
    when it prints the reason.
    """

    title = info_dict['title']
    matchtitle = self.params.get('matchtitle', False)
    if matchtitle:
        if not re.search(matchtitle, title, re.IGNORECASE):
            return u'"' + title + '" title did not match pattern "' + matchtitle + '"'
    rejecttitle = self.params.get('rejecttitle', False)
    if rejecttitle:
        if re.search(rejecttitle, title, re.IGNORECASE):
            return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
    date = info_dict.get('upload_date', None)
    if date is not None:
        dateRange = self.params.get('daterange', DateRange())
        if date not in dateRange:
            return u'%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
    age_limit = self.params.get('age_limit')
    if age_limit is not None:
        if age_limit < info_dict.get('age_limit', 0):
            return u'Skipping "' + title + '" because it is age restricted'
    if self.in_download_archive(info_dict):
        return (u'%(title)s has already been recorded in archive'
                % info_dict)
    return None
def extract_info(self, url, download=True, ie_key=None, extra_info=None):
    """
    Extract the information for url with the first suitable extractor.

    If 'download', also downloads the videos.
    ie_key, if given, selects the extractor to use instead of trying the
    registered ones in order.
    extra_info is a dict containing the extra values to add to each result.

    Returns what process_ie_result returns for the extracted result, or
    None when no extractor is suitable, the extractor returned nothing,
    or an error was reported.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if extra_info is None:
        extra_info = {}

    if ie_key:
        ies = [self.get_info_extractor(ie_key)]
    else:
        ies = self._ies

    for ie in ies:
        if not ie.suitable(url):
            continue

        if not ie.working():
            self.report_warning(u'The program functionality for this site has been marked as broken, '
                                u'and will probably not work.')

        try:
            ie_result = ie.extract(url)
            if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here)
                break
            if isinstance(ie_result, list):
                # Backwards compatibility: old IE result format
                for result in ie_result:
                    result.update(extra_info)
                ie_result = {
                    '_type': 'compat_list',
                    'entries': ie_result,
                }
            else:
                ie_result.update(extra_info)
            if 'extractor' not in ie_result:
                ie_result['extractor'] = ie.IE_NAME
            return self.process_ie_result(ie_result, download=download)
        except ExtractorError as de:  # An error we somewhat expected
            self.report_error(compat_str(de), de.format_traceback())
            break
        except Exception as e:
            if self.params.get('ignoreerrors', False):
                self.report_error(compat_str(e), tb=compat_str(traceback.format_exc()))
                break
            else:
                raise
    else:
        # The loop ended without any extractor accepting the URL.
        self.report_error(u'no suitable InfoExtractor: %s' % url)
def process_ie_result(self, ie_result, download=True, extra_info=None):
    """
    Take the result of the ie(may be modified) and resolve all unresolved
    references (URLs, playlist items).

    It will also download the videos if 'download'.
    extra_info is a dict of extra values merged into video results and
    passed along when a 'url' result is extracted.
    Returns the resolved ie_result.
    Raises Exception for an unknown '_type'.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if extra_info is None:
        extra_info = {}

    result_type = ie_result.get('_type', 'video')  # If not given we suppose it's a video, support the default old system
    if result_type == 'video':
        ie_result.update(extra_info)
        if 'playlist' not in ie_result:
            # It isn't part of a playlist
            ie_result['playlist'] = None
            ie_result['playlist_index'] = None
        if download:
            self.process_info(ie_result)
        return ie_result
    elif result_type == 'url':
        # We have to add extra_info to the results because it may be
        # contained in a playlist
        return self.extract_info(ie_result['url'],
                                 download,
                                 ie_key=ie_result.get('ie_key'),
                                 extra_info=extra_info)
    elif result_type == 'playlist':
        # We process each entry in the playlist
        playlist = ie_result.get('title', None) or ie_result.get('id', None)
        self.to_screen(u'[download] Downloading playlist: %s' % playlist)

        playlist_results = []

        n_all_entries = len(ie_result['entries'])
        # 'playliststart' is 1-based; -1 for 'playlistend' means "to the end".
        playliststart = self.params.get('playliststart', 1) - 1
        playlistend = self.params.get('playlistend', -1)

        if playlistend == -1:
            entries = ie_result['entries'][playliststart:]
        else:
            entries = ie_result['entries'][playliststart:playlistend]

        n_entries = len(entries)

        self.to_screen(u"[%s] playlist '%s': Collected %d video ids (downloading %d of them)" %
                       (ie_result['extractor'], playlist, n_all_entries, n_entries))

        for i, entry in enumerate(entries, 1):
            self.to_screen(u'[download] Downloading video #%s of %s' % (i, n_entries))
            extra = {
                'playlist': playlist,
                'playlist_index': i + playliststart,
            }
            if 'extractor' not in entry:
                # We set the extractor, if it's an url it will be set then to
                # the new extractor, but if it's already a video we must make
                # sure it's present: see issue #877
                entry['extractor'] = ie_result['extractor']
            entry_result = self.process_ie_result(entry,
                                                  download=download,
                                                  extra_info=extra)
            playlist_results.append(entry_result)
        ie_result['entries'] = playlist_results
        return ie_result
    elif result_type == 'compat_list':
        def _fixup(r):
            r.setdefault('extractor', ie_result['extractor'])
            return r
        ie_result['entries'] = [
            self.process_ie_result(_fixup(r), download=download)
            for r in ie_result['entries']
        ]
        return ie_result
    else:
        raise Exception('Invalid result type: %s' % result_type)
def process_info(self, info_dict):
    """Process a single resolved IE result.

    Applies the title/date/age/archive filters, prints the values forced
    by the force* options, and unless simulating writes the requested
    side files (description, annotations, subtitles, info JSON,
    thumbnail), downloads the video, post-processes it and records it in
    the download archive.  Most failures are reported through
    report_error and end processing of this entry.

    Raises MaxDownloadsReached when the 'max_downloads' option is
    exceeded and UnavailableVideoError when the download fails with an
    OSError/IOError.
    """

    assert info_dict.get('_type', 'video') == 'video'
    # We increment the download count here to match the previous behaviour.
    self.increment_downloads()

    info_dict['fulltitle'] = info_dict['title']
    if len(info_dict['title']) > 200:
        info_dict['title'] = info_dict['title'][:197] + u'...'

    # Keep for backwards compatibility
    info_dict['stitle'] = info_dict['title']

    if 'format' not in info_dict:
        info_dict['format'] = info_dict['ext']

    reason = self._match_entry(info_dict)
    if reason is not None:
        self.to_screen(u'[download] ' + reason)
        return

    max_downloads = self.params.get('max_downloads')
    if max_downloads is not None:
        if self._num_downloads > int(max_downloads):
            raise MaxDownloadsReached()

    filename = self.prepare_filename(info_dict)

    # Forced printings
    if self.params.get('forcetitle', False):
        compat_print(info_dict['title'])
    if self.params.get('forceid', False):
        compat_print(info_dict['id'])
    if self.params.get('forceurl', False):
        # For RTMP URLs, also include the playpath
        compat_print(info_dict['url'] + info_dict.get('play_path', u''))
    if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
        compat_print(info_dict['thumbnail'])
    if self.params.get('forcedescription', False) and 'description' in info_dict:
        compat_print(info_dict['description'])
    if self.params.get('forcefilename', False) and filename is not None:
        compat_print(filename)
    if self.params.get('forceformat', False):
        compat_print(info_dict['format'])

    # Do nothing else if in simulate mode
    if self.params.get('simulate', False):
        return

    if filename is None:
        return

    try:
        dn = os.path.dirname(encodeFilename(filename))
        if dn != '' and not os.path.exists(dn):
            os.makedirs(dn)
    except (OSError, IOError) as err:
        self.report_error(u'unable to create directory ' + compat_str(err))
        return

    if self.params.get('writedescription', False):
        try:
            descfn = filename + u'.description'
            self.report_writedescription(descfn)
            with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                descfile.write(info_dict['description'])
        except (KeyError, TypeError):
            self.report_warning(u'There\'s no description to write.')
        except (OSError, IOError):
            self.report_error(u'Cannot write description file ' + descfn)
            return

    if self.params.get('writeannotations', False):
        try:
            annofn = filename + u'.annotations.xml'
            self.report_writeannotations(annofn)
            with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
                annofile.write(info_dict['annotations'])
        except (KeyError, TypeError):
            self.report_warning(u'There are no annotations to write.')
        except (OSError, IOError):
            self.report_error(u'Cannot write annotations file: ' + annofn)
            return

    subtitles_are_requested = any([self.params.get('writesubtitles', False),
                                   self.params.get('writeautomaticsub')])

    if subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
        # subtitles download errors are already managed as troubles in relevant IE
        # that way it will silently go on when used with unsupporting IE
        subtitles = info_dict['subtitles']
        sub_format = self.params.get('subtitlesformat')
        for sub_lang, sub in subtitles.items():
            if sub is None:
                continue
            # Bound before the try so the error handler can always name it.
            sub_filename = subtitles_filename(filename, sub_lang, sub_format)
            try:
                self.report_writesubtitles(sub_filename)
                with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                    subfile.write(sub)
            except (OSError, IOError):
                # Name the subtitles file itself; the description file name
                # may not even be defined at this point.
                self.report_error(u'Cannot write subtitles file ' + sub_filename)
                return

    if self.params.get('writeinfojson', False):
        infofn = filename + u'.info.json'
        self.report_writeinfojson(infofn)
        try:
            json_info_dict = dict((k, v) for k, v in info_dict.items() if k not in ['urlhandle'])
            write_json_file(json_info_dict, encodeFilename(infofn))
        except (OSError, IOError):
            self.report_error(u'Cannot write metadata to JSON file ' + infofn)
            return

    if self.params.get('writethumbnail', False):
        if info_dict.get('thumbnail') is not None:
            thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
            thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
            self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                           (info_dict['extractor'], info_dict['id']))
            try:
                uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
                with open(thumb_filename, 'wb') as thumbf:
                    shutil.copyfileobj(uf, thumbf)
                self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
                               (info_dict['extractor'], info_dict['id'], thumb_filename))
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # A missing thumbnail is not fatal for the download itself.
                self.report_warning(u'Unable to download thumbnail "%s": %s' %
                                    (info_dict['thumbnail'], compat_str(err)))

    if not self.params.get('skip_download', False):
        if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
            # The existing file is kept and treated as a finished download.
            success = True
        else:
            try:
                success = self.fd._do_download(filename, info_dict)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self.report_error(u'unable to download video data: %s' % str(err))
                return
            except (OSError, IOError) as err:
                raise UnavailableVideoError(err)
            except (ContentTooShortError, ) as err:
                self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                return

        if success:
            try:
                self.post_process(filename, info_dict)
            except (PostProcessingError) as err:
                self.report_error(u'postprocessing: %s' % str(err))
                return

    self.record_download_archive(info_dict)
def download(self, url_list):
    """Download a given list of URLs.

    Raises SameFileError when several URLs would be written to one fixed
    output name.  Returns the accumulated return code.
    """
    several_urls = len(url_list) > 1
    if several_urls and self.fixed_template():
        raise SameFileError(self.params['outtmpl'])

    for url in url_list:
        try:
            # extract_info also downloads the videos
            self.extract_info(url)
        except UnavailableVideoError:
            self.report_error(u'unable to download video')
        except MaxDownloadsReached:
            self.to_screen(u'[info] Maximum number of downloaded files reached.')
            raise

    return self._download_retcode
def post_process(self, filename, ie_info):
    """Run all the postprocessors on the given file.

    Each post-processor receives a copy of ie_info extended with
    'filepath'.  The downloaded file is removed afterwards when the
    post-processors decided against keeping it and the 'keepvideo'
    option is not set.
    """
    info = dict(ie_info)
    info['filepath'] = filename

    keep_video = None
    for pp in self._pps:
        try:
            wish, _unused_info = pp.run(info)
        except PostProcessingError as e:
            self.report_error(e.msg)
            continue
        if wish is None:
            continue
        # A wish to keep always wins; a wish to delete only counts while
        # no clear decision has been taken yet.
        if wish or keep_video is None:
            keep_video = wish

    if keep_video is not False or self.params.get('keepvideo', False):
        return
    try:
        self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
        os.remove(encodeFilename(filename))
    except (IOError, OSError):
        self.report_warning(u'Unable to remove downloaded video file')
def in_download_archive(self, info_dict):
    """Tell whether the video is listed in the download archive.

    Returns False when no 'download_archive' option is set or the archive
    file does not exist; other IOErrors are propagated.
    """
    archive_path = self.params.get('download_archive')
    if archive_path is None:
        return False

    wanted = info_dict['extractor'] + u' ' + info_dict['id']
    try:
        with locked_file(archive_path, 'r', encoding='utf-8') as archive_file:
            if any(line.strip() == wanted for line in archive_file):
                return True
    except IOError as ioe:
        # A missing archive simply means nothing was recorded yet.
        if ioe.errno != errno.ENOENT:
            raise
    return False
def record_download_archive(self, info_dict):
    """Append the video to the download archive.

    Does nothing when no 'download_archive' option is set.
    """
    archive_path = self.params.get('download_archive')
    if archive_path is None:
        return
    entry = info_dict['extractor'] + u' ' + info_dict['id']
    with locked_file(archive_path, 'a', encoding='utf-8') as archive_file:
        archive_file.write(entry + u'\n')