X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FYoutubeDL.py;h=d50b7cfed3c537a02fd53a9dc46f4e0981b6608b;hb=dfb1b1468cef4ddc7ecc43776abce03763f8e426;hp=258e612afa5540ebefe4189d91d029768672f0ae;hpb=5acfa126c812c3ab7088af6c7df79697baee7831;p=youtube-dl diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 258e612af..d50b7cfed 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -38,20 +38,21 @@ from .compat import ( compat_tokenize_tokenize, compat_urllib_error, compat_urllib_request, + compat_urllib_request_DataHandler, ) from .utils import ( - escape_url, ContentTooShortError, date_from_str, DateRange, DEFAULT_OUTTMPL, determine_ext, DownloadError, + encode_compat_str, encodeFilename, + error_to_compat_str, ExtractorError, format_bytes, formatSeconds, - HEADRequest, locked_file, make_HTTPS_handler, MaxDownloadsReached, @@ -65,6 +66,7 @@ from .utils import ( SameFileError, sanitize_filename, sanitize_path, + sanitized_Request, std_headers, subtitles_filename, UnavailableVideoError, @@ -72,6 +74,7 @@ from .utils import ( version_tuple, write_json_file, write_string, + YoutubeDLCookieProcessor, YoutubeDLHandler, prepend_extension, replace_extension, @@ -157,7 +160,7 @@ class YoutubeDL(object): writethumbnail: Write the thumbnail image to a file write_all_thumbnails: Write all thumbnail formats to files writesubtitles: Write the video subtitles to a file - writeautomaticsub: Write the automatic subtitles to a file + writeautomaticsub: Write the automatically generated subtitles to a file allsubtitles: Downloads all the subtitles of the video (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video @@ -264,6 +267,8 @@ class YoutubeDL(object): The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, otherwise prefer avconv. + postprocessor_args: A list of additional command-line arguments for the + postprocessor. """ params = None @@ -285,7 +290,11 @@ class YoutubeDL(object): self._num_downloads = 0 self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)] self._err_file = sys.stderr - self.params = params + self.params = { + # Default parameters + 'nocheckcertificate': False, + } + self.params.update(params) self.cache = Cache(self) if params.get('bidi_workaround', False): @@ -488,7 +497,7 @@ class YoutubeDL(object): tb = '' if hasattr(sys.exc_info()[1], 'exc_info') and sys.exc_info()[1].exc_info[0]: tb += ''.join(traceback.format_exception(*sys.exc_info()[1].exc_info)) - tb += compat_str(traceback.format_exc()) + tb += encode_compat_str(traceback.format_exc()) else: tb_data = traceback.format_list(traceback.extract_stack()) tb = ''.join(tb_data) @@ -567,7 +576,7 @@ class YoutubeDL(object): if v is not None) template_dict = collections.defaultdict(lambda: 'NA', template_dict) - outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL)) + outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL) tmpl = compat_expanduser(outtmpl) filename = tmpl % template_dict # Temporary fix for #4787 @@ -575,7 +584,7 @@ class YoutubeDL(object): # to workaround encoding issues with subprocess on python2 @ Windows if sys.version_info < (3, 0) and sys.platform == 'win32': filename = encodeFilename(filename, True).decode(preferredencoding()) - return filename + return sanitize_path(filename) except ValueError as err: self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')') return None @@ -667,14 +676,14 @@ class YoutubeDL(object): return self.process_ie_result(ie_result, download, extra_info) else: return ie_result - except ExtractorError as de: # An error we somewhat expected - self.report_error(compat_str(de), de.format_traceback()) + except ExtractorError as e: # An error we somewhat expected + self.report_error(compat_str(e), e.format_traceback()) break except MaxDownloadsReached: raise except Exception as e: if self.params.get('ignoreerrors', False): - self.report_error(compat_str(e), tb=compat_str(traceback.format_exc())) + self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) break else: raise @@ -828,6 +837,7 @@ class YoutubeDL(object): extra_info=extra) playlist_results.append(entry_result) ie_result['entries'] = playlist_results + self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result elif result_type == 'compat_list': self.report_warning( @@ -920,6 +930,7 @@ class YoutubeDL(object): PICKFIRST = 'PICKFIRST' MERGE = 'MERGE' SINGLE = 'SINGLE' + GROUP = 'GROUP' FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters']) def _parse_filter(tokens): @@ -930,7 +941,38 @@ class YoutubeDL(object): else: filter_parts.append(string) - def _parse_format_selection(tokens, endwith=[]): + def _remove_unused_ops(tokens): + # Remove operators that we don't use and join them with the surrounding strings + # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + ALLOWED_OPS = ('/', '+', ',', '(', ')') + last_string, last_start, last_end, last_line = None, None, None, None + for type, string, start, end, line in tokens: + if type == tokenize.OP and string == '[': + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + # everything inside brackets will be handled by _parse_filter + for type, string, start, end, line in tokens: + yield type, string, start, end, line + if type == tokenize.OP and string == ']': + break + elif type == tokenize.OP and string in ALLOWED_OPS: + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + last_string = None + yield type, string, start, end, line + elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]: + if not last_string: + last_string = string + last_start = start + last_end = end + else: + last_string += string + if last_string: + yield tokenize.NAME, last_string, last_start, last_end, last_line + + def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False): selectors = [] current_selector = None for type, string, start, _, _ in tokens: @@ -940,26 +982,44 @@ class YoutubeDL(object): elif type in [tokenize.NAME, tokenize.NUMBER]: current_selector = FormatSelector(SINGLE, string, []) elif type == tokenize.OP: - if string in endwith: + if string == ')': + if not inside_group: + # ')' will be handled by the parentheses group + tokens.restore_last_token() break - if string == ',': + elif inside_merge and string in ['/', ',']: + tokens.restore_last_token() + break + elif inside_choice and string == ',': + tokens.restore_last_token() + break + elif string == ',': + if not current_selector: + raise syntax_error('"," must follow a format selector', start) selectors.append(current_selector) current_selector = None elif string == '/': + if not current_selector: + raise syntax_error('"/" must follow a format selector', start) first_choice = current_selector - second_choice = _parse_format_selection(tokens, [',']) - current_selector = None - selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), [])) + second_choice = _parse_format_selection(tokens, inside_choice=True) + current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), []) elif string == '[': if not current_selector: current_selector = FormatSelector(SINGLE, 'best', []) format_filter = _parse_filter(tokens) current_selector.filters.append(format_filter) + elif string == '(': + if current_selector: + raise syntax_error('Unexpected "("', start) + group = _parse_format_selection(tokens, inside_group=True) + current_selector = FormatSelector(GROUP, group, []) elif string == '+': video_selector = current_selector - audio_selector = _parse_format_selection(tokens, [',']) - current_selector = None - selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), [])) + audio_selector = _parse_format_selection(tokens, inside_merge=True) + if not video_selector or not audio_selector: + raise syntax_error('"+" must be between two format selectors', start) + current_selector = FormatSelector(MERGE, (video_selector, audio_selector), []) else: raise syntax_error('Operator not recognized: "{0}"'.format(string), start) elif type == tokenize.ENDMARKER: @@ -977,6 +1037,8 @@ class YoutubeDL(object): for format in f(formats): yield format return selector_function + elif selector.type == GROUP: + selector_function = _build_selector_function(selector.selector) elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] @@ -990,6 +1052,9 @@ class YoutubeDL(object): format_spec = selector.selector def selector_function(formats): + formats = list(formats) + if not formats: + return if format_spec == 'all': for f in formats: yield f @@ -1047,6 +1112,12 @@ class YoutubeDL(object): 'contain the video, try using ' '"-f %s+%s"' % (format_2, format_1)) return + # Formats must be opposite (video+audio) + if formats_info[0].get('acodec') == 'none' and formats_info[1].get('acodec') == 'none': + self.report_error( + 'Both formats %s and %s are video-only, you must specify "-f video+audio"' + % (format_1, format_2)) + return output_ext = ( formats_info[0]['ext'] if self.params.get('merge_output_format') is None @@ -1084,8 +1155,32 @@ class YoutubeDL(object): return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) - tokens = compat_tokenize_tokenize(stream.readline) - parsed_selector = _parse_format_selection(tokens) + try: + tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + except tokenize.TokenError: + raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) + + class TokenIterator(object): + def __init__(self, tokens): + self.tokens = tokens + self.counter = 0 + + def __iter__(self): + return self + + def __next__(self): + if self.counter >= len(self.tokens): + raise StopIteration() + value = self.tokens[self.counter] + self.counter += 1 + return value + + next = __next__ + + def restore_last_token(self): + self.counter -= 1 + + parsed_selector = _parse_format_selection(iter(TokenIterator(tokens))) return _build_selector_function(parsed_selector) def _calc_headers(self, info_dict): @@ -1102,7 +1197,7 @@ class YoutubeDL(object): return res def _calc_cookies(self, info_dict): - pr = compat_urllib_request.Request(info_dict['url']) + pr = sanitized_Request(info_dict['url']) self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') @@ -1129,7 +1224,7 @@ class YoutubeDL(object): t.get('preference'), t.get('width'), t.get('height'), t.get('id'), t.get('url'))) for i, t in enumerate(thumbnails): - if 'width' in t and 'height' in t: + if t.get('width') and t.get('height'): t['resolution'] = '%dx%d' % (t['width'], t['height']) if t.get('id') is None: t['id'] = '%d' % i @@ -1149,13 +1244,20 @@ class YoutubeDL(object): except (ValueError, OverflowError, OSError): pass + subtitles = info_dict.get('subtitles') + if subtitles: + for _, subtitle in subtitles.items(): + for subtitle_format in subtitle: + if 'ext' not in subtitle_format: + subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() + if self.params.get('listsubtitles', False): if 'automatic_captions' in info_dict: self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions') - self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles') + self.list_subtitles(info_dict['id'], subtitles, 'subtitles') return info_dict['requested_subtitles'] = self.process_subtitles( - info_dict['id'], info_dict.get('subtitles'), + info_dict['id'], subtitles, info_dict.get('automatic_captions')) # We now pick which formats have to be downloaded @@ -1210,7 +1312,7 @@ class YoutubeDL(object): # only set the 'formats' fields if the original info_dict list them # otherwise we end up with a circular reference, the first (and unique) # element in the 'formats' field in info_dict is info_dict itself, - # wich can't be exported to json + # which can't be exported to json info_dict['formats'] = formats if self.params.get('listformats'): self.list_formats(info_dict) @@ -1223,7 +1325,8 @@ class YoutubeDL(object): if req_format is None: req_format_list = [] if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and - info_dict['extractor'] in ['youtube', 'ted']): + info_dict['extractor'] in ['youtube', 'ted'] and + not info_dict.get('is_live')): merger = FFmpegMergerPP(self) if merger.available and merger.can_merge(): req_format_list.append('bestvideo+bestaudio') @@ -1358,7 +1461,7 @@ class YoutubeDL(object): if dn and not os.path.exists(dn): os.makedirs(dn) except (OSError, IOError) as err: - self.report_error('unable to create directory ' + compat_str(err)) + self.report_error('unable to create directory ' + error_to_compat_str(err)) return if self.params.get('writedescription', False): @@ -1409,7 +1512,7 @@ class YoutubeDL(object): sub_info['url'], info_dict['id'], note=False) except ExtractorError as err: self.report_warning('Unable to download subtitle for "%s": %s' % - (sub_lang, compat_str(err.cause))) + (sub_lang, error_to_compat_str(err.cause))) continue try: sub_filename = subtitles_filename(filename, sub_lang, sub_format) @@ -1688,6 +1791,10 @@ class YoutubeDL(object): res = '' if fdict.get('ext') in ['f4f', 'f4m']: res += '(unsupported) ' + if fdict.get('language'): + if res: + res += ' ' + res += '[%s]' % fdict['language'] if fdict.get('format_note') is not None: res += fdict['format_note'] + ' ' if fdict.get('tbr') is not None: @@ -1778,27 +1885,8 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - - # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not - # always respected by websites, some tend to give out URLs with non percent-encoded - # non-ASCII characters (see telemb.py, ard.py [#3412]) - # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991) - # To work around aforementioned issue we will replace request's original URL with - # percent-encoded one - req_is_string = isinstance(req, compat_basestring) - url = req if req_is_string else req.get_full_url() - url_escaped = escape_url(url) - - # Substitute URL if any change after escaping - if url != url_escaped: - if req_is_string: - req = url_escaped - else: - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request - req = req_type( - url_escaped, data=req.data, headers=req.headers, - origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) - + if isinstance(req, compat_basestring): + req = sanitized_Request(req) return self._opener.open(req, timeout=self._socket_timeout) def print_debug_header(self): @@ -1881,8 +1969,7 @@ class YoutubeDL(object): if os.access(opts_cookiefile, os.R_OK): self.cookiejar.load() - cookie_processor = compat_urllib_request.HTTPCookieProcessor( - self.cookiejar) + cookie_processor = YoutubeDLCookieProcessor(self.cookiejar) if opts_proxy is not None: if opts_proxy == '': proxies = {} @@ -1898,8 +1985,9 @@ class YoutubeDL(object): debuglevel = 1 if self.params.get('debug_printtraffic') else 0 https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) + data_handler = compat_urllib_request_DataHandler() opener = compat_urllib_request.build_opener( - proxy_handler, https_handler, cookie_processor, ydlh) + proxy_handler, https_handler, cookie_processor, ydlh, data_handler) # Delete the default user-agent header, which would otherwise apply in # cases where our custom HTTP handler doesn't come into play @@ -1951,10 +2039,10 @@ class YoutubeDL(object): (info_dict['extractor'], info_dict['id'], thumb_display_id)) try: uf = self.urlopen(t['url']) - with open(thumb_filename, 'wb') as thumbf: + with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) self.to_screen('[%s] %s: Writing thumbnail %sto: %s' % (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename)) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download thumbnail "%s": %s' % - (t['url'], compat_str(err))) + (t['url'], error_to_compat_str(err)))