X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FYoutubeDL.py;h=9a8c7da05172e342959d2ec5ffa481ab951fa763;hb=5c2266df4b9aeb7881ed8c026a038e2a25e43734;hp=17a5407b983fbadc00453d081ccd567f10f3dade;hpb=67134eaba1a56cec4117000acb2fc9284c9cdd9a;p=youtube-dl

diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 17a5407b9..9a8c7da05 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -38,9 +38,9 @@ from .compat import (
     compat_tokenize_tokenize,
     compat_urllib_error,
     compat_urllib_request,
+    compat_urllib_request_DataHandler,
 )
 from .utils import (
-    escape_url,
     ContentTooShortError,
     date_from_str,
     DateRange,
@@ -51,7 +51,6 @@ from .utils import (
     ExtractorError,
     format_bytes,
     formatSeconds,
-    HEADRequest,
     locked_file,
     make_HTTPS_handler,
     MaxDownloadsReached,
@@ -65,6 +64,7 @@ from .utils import (
     SameFileError,
     sanitize_filename,
     sanitize_path,
+    sanitized_Request,
     std_headers,
     subtitles_filename,
     UnavailableVideoError,
@@ -72,6 +72,7 @@ from .utils import (
     version_tuple,
     write_json_file,
     write_string,
+    YoutubeDLCookieProcessor,
     YoutubeDLHandler,
     prepend_extension,
     replace_extension,
@@ -157,7 +158,7 @@ class YoutubeDL(object):
     writethumbnail:    Write the thumbnail image to a file
     write_all_thumbnails:  Write all thumbnail formats to files
     writesubtitles:    Write the video subtitles to a file
-    writeautomaticsub: Write the automatic subtitles to a file
+    writeautomaticsub: Write the automatically generated subtitles to a file
     allsubtitles:      Downloads all the subtitles of the video
                        (requires writesubtitles or writeautomaticsub)
     listsubtitles:     Lists all available subtitles for the video
@@ -264,6 +265,8 @@ class YoutubeDL(object):
     The following options are used by the post processors:
     prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
                        otherwise prefer avconv.
+    postprocessor_args: A list of additional command-line arguments for the
+                        postprocessor.
     """
 
     params = None
@@ -285,7 +288,11 @@ class YoutubeDL(object):
         self._num_downloads = 0
         self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
         self._err_file = sys.stderr
-        self.params = params
+        self.params = {
+            # Default parameters
+            'nocheckcertificate': False,
+        }
+        self.params.update(params)
         self.cache = Cache(self)
 
         if params.get('bidi_workaround', False):
@@ -567,7 +574,7 @@ class YoutubeDL(object):
                                  if v is not None)
             template_dict = collections.defaultdict(lambda: 'NA', template_dict)
 
-            outtmpl = sanitize_path(self.params.get('outtmpl', DEFAULT_OUTTMPL))
+            outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
             tmpl = compat_expanduser(outtmpl)
             filename = tmpl % template_dict
             # Temporary fix for #4787
@@ -575,7 +582,7 @@ class YoutubeDL(object):
             # to workaround encoding issues with subprocess on python2 @ Windows
             if sys.version_info < (3, 0) and sys.platform == 'win32':
                 filename = encodeFilename(filename, True).decode(preferredencoding())
-            return filename
+            return sanitize_path(filename)
         except ValueError as err:
             self.report_error('Error in output template: ' + str(err) + ' (encoding: ' + repr(preferredencoding()) + ')')
             return None
@@ -828,6 +835,7 @@ class YoutubeDL(object):
                                                       extra_info=extra)
                 playlist_results.append(entry_result)
             ie_result['entries'] = playlist_results
+            self.to_screen('[download] Finished downloading playlist: %s' % playlist)
             return ie_result
         elif result_type == 'compat_list':
             self.report_warning(
@@ -920,6 +928,7 @@ class YoutubeDL(object):
         PICKFIRST = 'PICKFIRST'
         MERGE = 'MERGE'
         SINGLE = 'SINGLE'
+        GROUP = 'GROUP'
         FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
 
         def _parse_filter(tokens):
@@ -930,7 +939,38 @@ class YoutubeDL(object):
                 else:
                     filter_parts.append(string)
 
-        def _parse_format_selection(tokens, endwith=[]):
+        def _remove_unused_ops(tokens):
+            # Remove operators that we don't use and join them with the surrounding strings
+            # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9'
+            ALLOWED_OPS = ('/', '+', ',', '(', ')')
+            last_string, last_start, last_end, last_line = None, None, None, None
+            for type, string, start, end, line in tokens:
+                if type == tokenize.OP and string == '[':
+                    if last_string:
+                        yield tokenize.NAME, last_string, last_start, last_end, last_line
+                        last_string = None
+                    yield type, string, start, end, line
+                    # everything inside brackets will be handled by _parse_filter
+                    for type, string, start, end, line in tokens:
+                        yield type, string, start, end, line
+                        if type == tokenize.OP and string == ']':
+                            break
+                elif type == tokenize.OP and string in ALLOWED_OPS:
+                    if last_string:
+                        yield tokenize.NAME, last_string, last_start, last_end, last_line
+                        last_string = None
+                    yield type, string, start, end, line
+                elif type in [tokenize.NAME, tokenize.NUMBER, tokenize.OP]:
+                    if not last_string:
+                        last_string = string
+                        last_start = start
+                        last_end = end
+                    else:
+                        last_string += string
+            if last_string:
+                yield tokenize.NAME, last_string, last_start, last_end, last_line
+
+        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
             selectors = []
             current_selector = None
             for type, string, start, _, _ in tokens:
@@ -940,26 +980,44 @@ class YoutubeDL(object):
                 elif type in [tokenize.NAME, tokenize.NUMBER]:
                     current_selector = FormatSelector(SINGLE, string, [])
                 elif type == tokenize.OP:
-                    if string in endwith:
+                    if string == ')':
+                        if not inside_group:
+                            # ')' will be handled by the parentheses group
+                            tokens.restore_last_token()
                         break
-                    if string == ',':
+                    elif inside_merge and string in ['/', ',']:
+                        tokens.restore_last_token()
+                        break
+                    elif inside_choice and string == ',':
+                        tokens.restore_last_token()
+                        break
+                    elif string == ',':
+                        if not current_selector:
+                            raise syntax_error('"," must follow a format selector', start)
                         selectors.append(current_selector)
                         current_selector = None
                     elif string == '/':
+                        if not current_selector:
+                            raise syntax_error('"/" must follow a format selector', start)
                         first_choice = current_selector
-                        second_choice = _parse_format_selection(tokens, [','])
-                        current_selector = None
-                        selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), []))
+                        second_choice = _parse_format_selection(tokens, inside_choice=True)
+                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                     elif string == '[':
                         if not current_selector:
                             current_selector = FormatSelector(SINGLE, 'best', [])
                         format_filter = _parse_filter(tokens)
                         current_selector.filters.append(format_filter)
+                    elif string == '(':
+                        if current_selector:
+                            raise syntax_error('Unexpected "("', start)
+                        group = _parse_format_selection(tokens, inside_group=True)
+                        current_selector = FormatSelector(GROUP, group, [])
                     elif string == '+':
                         video_selector = current_selector
-                        audio_selector = _parse_format_selection(tokens, [','])
-                        current_selector = None
-                        selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), []))
+                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
+                        if not video_selector or not audio_selector:
+                            raise syntax_error('"+" must be between two format selectors', start)
+                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                     else:
                         raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                 elif type == tokenize.ENDMARKER:
@@ -977,6 +1035,8 @@ class YoutubeDL(object):
                         for format in f(formats):
                             yield format
                 return selector_function
+            elif selector.type == GROUP:
+                selector_function = _build_selector_function(selector.selector)
             elif selector.type == PICKFIRST:
                 fs = [_build_selector_function(s) for s in selector.selector]
 
@@ -990,7 +1050,13 @@ class YoutubeDL(object):
                 format_spec = selector.selector
 
                 def selector_function(formats):
-                    if format_spec in ['best', 'worst', None]:
+                    formats = list(formats)
+                    if not formats:
+                        return
+                    if format_spec == 'all':
+                        for f in formats:
+                            yield f
+                    elif format_spec in ['best', 'worst', None]:
                         format_idx = 0 if format_spec == 'worst' else -1
                         audiovideo_formats = [
                             f for f in formats
@@ -1081,8 +1147,32 @@ class YoutubeDL(object):
             return final_selector
 
         stream = io.BytesIO(format_spec.encode('utf-8'))
-        tokens = compat_tokenize_tokenize(stream.readline)
-        parsed_selector = _parse_format_selection(tokens)
+        try:
+            tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline)))
+        except tokenize.TokenError:
+            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
+
+        class TokenIterator(object):
+            def __init__(self, tokens):
+                self.tokens = tokens
+                self.counter = 0
+
+            def __iter__(self):
+                return self
+
+            def __next__(self):
+                if self.counter >= len(self.tokens):
+                    raise StopIteration()
+                value = self.tokens[self.counter]
+                self.counter += 1
+                return value
+
+            next = __next__
+
+            def restore_last_token(self):
+                self.counter -= 1
+
+        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
         return _build_selector_function(parsed_selector)
 
     def _calc_headers(self, info_dict):
@@ -1099,7 +1189,7 @@ class YoutubeDL(object):
         return res
 
     def _calc_cookies(self, info_dict):
-        pr = compat_urllib_request.Request(info_dict['url'])
+        pr = sanitized_Request(info_dict['url'])
         self.cookiejar.add_cookie_header(pr)
         return pr.get_header('Cookie')
 
@@ -1126,7 +1216,7 @@ class YoutubeDL(object):
                 t.get('preference'), t.get('width'), t.get('height'),
                 t.get('id'), t.get('url')))
             for i, t in enumerate(thumbnails):
-                if 'width' in t and 'height' in t:
+                if t.get('width') and t.get('height'):
                     t['resolution'] = '%dx%d' % (t['width'], t['height'])
                 if t.get('id') is None:
                     t['id'] = '%d' % i
@@ -1146,13 +1236,20 @@ class YoutubeDL(object):
             except (ValueError, OverflowError, OSError):
                 pass
 
+        subtitles = info_dict.get('subtitles')
+        if subtitles:
+            for _, subtitle in subtitles.items():
+                for subtitle_format in subtitle:
+                    if 'ext' not in subtitle_format:
+                        subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
+
         if self.params.get('listsubtitles', False):
             if 'automatic_captions' in info_dict:
                 self.list_subtitles(info_dict['id'], info_dict.get('automatic_captions'), 'automatic captions')
-            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'), 'subtitles')
+            self.list_subtitles(info_dict['id'], subtitles, 'subtitles')
             return
         info_dict['requested_subtitles'] = self.process_subtitles(
-            info_dict['id'], info_dict.get('subtitles'),
+            info_dict['id'], subtitles,
             info_dict.get('automatic_captions'))
 
         # We now pick which formats have to be downloaded
@@ -1220,18 +1317,15 @@ class YoutubeDL(object):
         if req_format is None:
             req_format_list = []
             if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
-                    info_dict['extractor'] in ['youtube', 'ted']):
+                    info_dict['extractor'] in ['youtube', 'ted'] and
+                    not info_dict.get('is_live')):
                 merger = FFmpegMergerPP(self)
                 if merger.available and merger.can_merge():
                     req_format_list.append('bestvideo+bestaudio')
             req_format_list.append('best')
             req_format = '/'.join(req_format_list)
-        formats_to_download = []
-        if req_format == 'all':
-            formats_to_download = formats
-        else:
-            format_selector = self.build_format_selector(req_format)
-            formats_to_download = list(format_selector(formats))
+        format_selector = self.build_format_selector(req_format)
+        formats_to_download = list(format_selector(formats))
         if not formats_to_download:
             raise ExtractorError('requested format not available',
                                  expected=True)
@@ -1779,27 +1873,8 @@ class YoutubeDL(object):
 
     def urlopen(self, req):
         """ Start an HTTP download """
-
-        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
-        # always respected by websites, some tend to give out URLs with non percent-encoded
-        # non-ASCII characters (see telemb.py, ard.py [#3412])
-        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
-        # To work around aforementioned issue we will replace request's original URL with
-        # percent-encoded one
-        req_is_string = isinstance(req, compat_basestring)
-        url = req if req_is_string else req.get_full_url()
-        url_escaped = escape_url(url)
-
-        # Substitute URL if any change after escaping
-        if url != url_escaped:
-            if req_is_string:
-                req = url_escaped
-            else:
-                req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
-                req = req_type(
-                    url_escaped, data=req.data, headers=req.headers,
-                    origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
-
+        if isinstance(req, compat_basestring):
+            req = sanitized_Request(req)
         return self._opener.open(req, timeout=self._socket_timeout)
 
     def print_debug_header(self):
@@ -1882,8 +1957,7 @@ class YoutubeDL(object):
             if os.access(opts_cookiefile, os.R_OK):
                 self.cookiejar.load()
 
-        cookie_processor = compat_urllib_request.HTTPCookieProcessor(
-            self.cookiejar)
+        cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
         if opts_proxy is not None:
             if opts_proxy == '':
                 proxies = {}
@@ -1899,8 +1973,9 @@ class YoutubeDL(object):
         debuglevel = 1 if self.params.get('debug_printtraffic') else 0
         https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
         ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
+        data_handler = compat_urllib_request_DataHandler()
         opener = compat_urllib_request.build_opener(
-            proxy_handler, https_handler, cookie_processor, ydlh)
+            proxy_handler, https_handler, cookie_processor, ydlh, data_handler)
 
         # Delete the default user-agent header, which would otherwise apply in
         # cases where our custom HTTP handler doesn't come into play
@@ -1952,7 +2027,7 @@ class YoutubeDL(object):
                                (info_dict['extractor'], info_dict['id'], thumb_display_id))
                 try:
                     uf = self.urlopen(t['url'])
-                    with open(thumb_filename, 'wb') as thumbf:
+                    with open(encodeFilename(thumb_filename), 'wb') as thumbf:
                         shutil.copyfileobj(uf, thumbf)
                     self.to_screen('[%s] %s: Writing thumbnail %sto: %s' %
                                    (info_dict['extractor'], info_dict['id'], thumb_display_id, thumb_filename))