[YoutubeDL] format spec: add additional checks for invalid syntax

[youtube-dl] / youtube_dl / YoutubeDL.py
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 17a5407b983fbadc00453d081ccd567f10f3dade..da7c510083820353d64db31a84dec7ccfac85c3a 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -920,6 +920,7 @@ class YoutubeDL(object):
          PICKFIRST = 'PICKFIRST'
          MERGE = 'MERGE'
          SINGLE = 'SINGLE'
+        GROUP = 'GROUP'
          FormatSelector = collections.namedtuple('FormatSelector', ['type', 'selector', 'filters'])
  
          def _parse_filter(tokens):
@@ -930,7 +931,7 @@ class YoutubeDL(object):
                  else:
                      filter_parts.append(string)
  
-        def _parse_format_selection(tokens, endwith=[]):
+        def _parse_format_selection(tokens, inside_merge=False, inside_choice=False, inside_group=False):
              selectors = []
              current_selector = None
              for type, string, start, _, _ in tokens:
@@ -940,26 +941,42 @@ class YoutubeDL(object):
                  elif type in [tokenize.NAME, tokenize.NUMBER]:
                      current_selector = FormatSelector(SINGLE, string, [])
                  elif type == tokenize.OP:
-                    if string in endwith:
+                    if string == ')':
+                        if not inside_group:
+                            # not in a group at this level: push ')' back for the enclosing parentheses group
+                            tokens.restore_last_token()
                          break
-                    if string == ',':
+                    elif inside_merge and string in ['/', ',']:
+                        tokens.restore_last_token()
+                        break
+                    elif inside_choice and string == ',':
+                        tokens.restore_last_token()
+                        break
+                    elif string == ',':
+                        if not current_selector:
+                            raise syntax_error('"," must follow a format selector', start)
                          selectors.append(current_selector)
                          current_selector = None
                      elif string == '/':
                          first_choice = current_selector
-                        second_choice = _parse_format_selection(tokens, [','])
-                        current_selector = None
-                        selectors.append(FormatSelector(PICKFIRST, (first_choice, second_choice), []))
+                        second_choice = _parse_format_selection(tokens, inside_choice=True)
+                        current_selector = FormatSelector(PICKFIRST, (first_choice, second_choice), [])
                      elif string == '[':
                          if not current_selector:
                              current_selector = FormatSelector(SINGLE, 'best', [])
                          format_filter = _parse_filter(tokens)
                          current_selector.filters.append(format_filter)
+                    elif string == '(':
+                        if current_selector:
+                            raise syntax_error('Unexpected "("', start)
+                        group = _parse_format_selection(tokens, inside_group=True)
+                        current_selector = FormatSelector(GROUP, group, [])
                      elif string == '+':
                          video_selector = current_selector
-                        audio_selector = _parse_format_selection(tokens, [','])
-                        current_selector = None
-                        selectors.append(FormatSelector(MERGE, (video_selector, audio_selector), []))
+                        audio_selector = _parse_format_selection(tokens, inside_merge=True)
+                        if not video_selector or not audio_selector:
+                            raise syntax_error('"+" must be between two format selectors', start)
+                        current_selector = FormatSelector(MERGE, (video_selector, audio_selector), [])
                      else:
                          raise syntax_error('Operator not recognized: "{0}"'.format(string), start)
                  elif type == tokenize.ENDMARKER:
@@ -977,6 +994,8 @@ class YoutubeDL(object):
                          for format in f(formats):
                              yield format
                  return selector_function
+            elif selector.type == GROUP:
+                selector_function = _build_selector_function(selector.selector)
              elif selector.type == PICKFIRST:
                  fs = [_build_selector_function(s) for s in selector.selector]
  
@@ -990,7 +1009,13 @@ class YoutubeDL(object):
                  format_spec = selector.selector
  
                  def selector_function(formats):
-                    if format_spec in ['best', 'worst', None]:
+                    formats = list(formats)
+                    if not formats:
+                        return
+                    if format_spec == 'all':
+                        for f in formats:
+                            yield f
+                    elif format_spec in ['best', 'worst', None]:
                          format_idx = 0 if format_spec == 'worst' else -1
                          audiovideo_formats = [
                              f for f in formats
@@ -1081,8 +1106,32 @@ class YoutubeDL(object):
              return final_selector
  
          stream = io.BytesIO(format_spec.encode('utf-8'))
-        tokens = compat_tokenize_tokenize(stream.readline)
-        parsed_selector = _parse_format_selection(tokens)
+        try:
+            tokens = list(compat_tokenize_tokenize(stream.readline))
+        except tokenize.TokenError:
+            raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec)))
+
+        class TokenIterator(object):  # iterator over a token sequence that can step back one position
+            def __init__(self, tokens):  # tokens: indexable sequence (read via len() and [] below)
+                self.tokens = tokens
+                self.counter = 0  # index of the next token to hand out
+
+            def __iter__(self):  # the object is its own iterator
+                return self
+
+            def __next__(self):  # return the token at `counter` and advance; StopIteration past the end
+                if self.counter >= len(self.tokens):
+                    raise StopIteration()
+                value = self.tokens[self.counter]
+                self.counter += 1
+                return value
+
+            next = __next__  # alias: Python 2 spells the iterator method `next`
+
+            def restore_last_token(self):  # step back one position so the last token is yielded again
+                self.counter -= 1  # no lower-bound check: calling this before any token was read makes the index negative
+
+        parsed_selector = _parse_format_selection(iter(TokenIterator(tokens)))
          return _build_selector_function(parsed_selector)
  
      def _calc_headers(self, info_dict):
@@ -1226,12 +1275,8 @@ class YoutubeDL(object):
                      req_format_list.append('bestvideo+bestaudio')
              req_format_list.append('best')
              req_format = '/'.join(req_format_list)
-        formats_to_download = []
-        if req_format == 'all':
-            formats_to_download = formats
-        else:
-            format_selector = self.build_format_selector(req_format)
-            formats_to_download = list(format_selector(formats))
+        format_selector = self.build_format_selector(req_format)
+        formats_to_download = list(format_selector(formats))
          if not formats_to_download:
              raise ExtractorError('requested format not available',
                                   expected=True)