Merge pull request #12909 from remitamine/raw-sub

author Yen Chi Hsuan <yan12125@gmail.com>

Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)

committer GitHub <noreply@github.com>

Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)
author Yen Chi Hsuan <yan12125@gmail.com>
Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)
committer GitHub <noreply@github.com>
Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)
diff --combined youtube_dl/YoutubeDL.py

index 4f208f1e1364f7ae2a4347798c265f991d48b66f,c7100bb91efbddc52806ce723e386d1366fba666..5405a87c5f377d7673b08e5560546d639614a6ca
--- 1/youtube_dl/YoutubeDL.py
--- 2/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@@ -26,8 -26,6 +26,8 @@@ import tokeniz
   import traceback
   import random
   
+ +from string import ascii_letters
+ +
   from .compat import (
       compat_basestring,
       compat_cookiejar,
@@@ -60,7 -58,6 +60,7 @@@ from .utils import 
       format_bytes,
       formatSeconds,
       GeoRestrictedError,
+ +    int_or_none,
       ISO3166Utils,
       locked_file,
       make_HTTPS_handler,
@@@ -305,17 -302,6 +305,17 @@@ class YoutubeDL(object)
                           postprocessor.
       """
   
+ +    _NUMERIC_FIELDS = set((
+ +        'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
+ +        'timestamp', 'upload_year', 'upload_month', 'upload_day',
+ +        'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
+ +        'average_rating', 'comment_count', 'age_limit',
+ +        'start_time', 'end_time',
+ +        'chapter_number', 'season_number', 'episode_number',
+ +        'track_number', 'disc_number', 'release_year',
+ +        'playlist_index',
+ +    ))
+ +
       params = None
       _ies = []
       _pps = []
@@@ -384,10 -370,10 +384,10 @@@
                   else:
                       raise
   
- -        if (sys.version_info >= (3,) and sys.platform != 'win32' and
+ +        if (sys.platform != 'win32' and
                   sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
                   not params.get('restrictfilenames', False)):
- -            # On Python 3, the Unicode filesystem API will throw errors (#1474)
+ +            # Unicode filesystem API will throw errors (#1474, #13027)
               self.report_warning(
                   'Assuming --restrict-filenames since file system encoding '
                   'cannot encode all characters. '
@@@ -512,25 -498,24 +512,25 @@@
       def to_console_title(self, message):
           if not self.params.get('consoletitle', False):
               return
- -        if compat_os_name == 'nt' and ctypes.windll.kernel32.GetConsoleWindow():
- -            # c_wchar_p() might not be necessary if `message` is
- -            # already of type unicode()
- -            ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
+ +        if compat_os_name == 'nt':
+ +            if ctypes.windll.kernel32.GetConsoleWindow():
+ +                # c_wchar_p() might not be necessary if `message` is
+ +                # already of type unicode()
+ +                ctypes.windll.kernel32.SetConsoleTitleW(ctypes.c_wchar_p(message))
           elif 'TERM' in os.environ:
               self._write_string('\033]0;%s\007' % message, self._screen_file)
   
       def save_console_title(self):
           if not self.params.get('consoletitle', False):
               return
- -        if 'TERM' in os.environ:
+ +        if compat_os_name != 'nt' and 'TERM' in os.environ:
               # Save the title on stack
               self._write_string('\033[22;0t', self._screen_file)
   
       def restore_console_title(self):
           if not self.params.get('consoletitle', False):
               return
- -        if 'TERM' in os.environ:
+ +        if compat_os_name != 'nt' and 'TERM' in os.environ:
               # Restore the title from stack
               self._write_string('\033[23;0t', self._screen_file)
   
@@@ -653,11 -638,22 +653,11 @@@
                       r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
                       outtmpl)
   
- -            NUMERIC_FIELDS = set((
- -                'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
- -                'timestamp', 'upload_year', 'upload_month', 'upload_day',
- -                'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
- -                'average_rating', 'comment_count', 'age_limit',
- -                'start_time', 'end_time',
- -                'chapter_number', 'season_number', 'episode_number',
- -                'track_number', 'disc_number', 'release_year',
- -                'playlist_index',
- -            ))
- -
               # Missing numeric fields used together with integer presentation types
               # in format specification will break the argument substitution since
               # string 'NA' is returned for missing fields. We will patch output
               # template for missing fields to meet string presentation type.
- -            for numeric_field in NUMERIC_FIELDS:
+ +            for numeric_field in self._NUMERIC_FIELDS:
                   if numeric_field not in template_dict:
                       # As of [1] format syntax is:
                       #  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
@@@ -676,19 -672,7 +676,19 @@@
                           FORMAT_RE.format(numeric_field),
                           r'%({0})s'.format(numeric_field), outtmpl)
   
- -            filename = expand_path(outtmpl % template_dict)
+ +            # expand_path translates '%%' into '%' and '$$' into '$'
+ +            # correspondingly that is not what we want since we need to keep
+ +            # '%%' intact for template dict substitution step. Working around
+ +            # with boundary-alike separator hack.
+ +            sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+ +            outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+ +
+ +            # outtmpl should be expand_path'ed before template dict substitution
+ +            # because meta fields may contain env variables we don't want to
+ +            # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+ +            # title "Hello $PATH", we don't want `$PATH` to be expanded.
+ +            filename = expand_path(outtmpl).replace(sep, '') % template_dict
+ +
               # Temporary fix for #4787
               # 'Treat' all problem characters by passing filename through preferredencoding
               # to workaround encoding issues with subprocess on python2 @ Windows
@@@ -860,7 -844,7 +860,7 @@@
   
               force_properties = dict(
                   (k, v) for k, v in ie_result.items() if v is not None)
- -            for f in ('_type', 'url', 'ie_key'):
+ +            for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
                   if f in force_properties:
                       del force_properties[f]
               new_result = info.copy()
@@@ -1064,25 -1048,6 +1064,25 @@@
               return op(actual_value, comparison_value)
           return _filter
   
+ +    def _default_format_spec(self, info_dict, download=True):
+ +        req_format_list = []
+ +
+ +        def can_have_partial_formats():
+ +            if self.params.get('simulate', False):
+ +                return True
+ +            if not download:
+ +                return True
+ +            if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+ +                return False
+ +            if info_dict.get('is_live'):
+ +                return False
+ +            merger = FFmpegMergerPP(self)
+ +            return merger.available and merger.can_merge()
+ +        if can_have_partial_formats():
+ +            req_format_list.append('bestvideo+bestaudio')
+ +        req_format_list.append('best')
+ +        return '/'.join(req_format_list)
+ +
       def build_format_selector(self, format_spec):
           def syntax_error(note, start):
               message = (
@@@ -1379,28 -1344,9 +1379,28 @@@
           if 'title' not in info_dict:
               raise ExtractorError('Missing "title" field in extractor result')
   
- -        if not isinstance(info_dict['id'], compat_str):
- -            self.report_warning('"id" field is not a string - forcing string conversion')
- -            info_dict['id'] = compat_str(info_dict['id'])
+ +        def report_force_conversion(field, field_not, conversion):
+ +            self.report_warning(
+ +                '"%s" field is not %s - forcing %s conversion, there is an error in extractor'
+ +                % (field, field_not, conversion))
+ +
+ +        def sanitize_string_field(info, string_field):
+ +            field = info.get(string_field)
+ +            if field is None or isinstance(field, compat_str):
+ +                return
+ +            report_force_conversion(string_field, 'a string', 'string')
+ +            info[string_field] = compat_str(field)
+ +
+ +        def sanitize_numeric_fields(info):
+ +            for numeric_field in self._NUMERIC_FIELDS:
+ +                field = info.get(numeric_field)
+ +                if field is None or isinstance(field, compat_numeric_types):
+ +                    continue
+ +                report_force_conversion(numeric_field, 'numeric', 'int')
+ +                info[numeric_field] = int_or_none(field)
+ +
+ +        sanitize_string_field(info_dict, 'id')
+ +        sanitize_numeric_fields(info_dict)
   
           if 'playlist' not in info_dict:
               # It isn't part of a playlist
@@@ -1481,28 -1427,16 +1481,28 @@@
           if not formats:
               raise ExtractorError('No video formats found!')
   
+ +        def is_wellformed(f):
+ +            url = f.get('url')
+ +            if not url:
+ +                self.report_warning(
+ +                    '"url" field is missing or empty - skipping format, '
+ +                    'there is an error in extractor')
+ +                return False
+ +            if isinstance(url, bytes):
+ +                sanitize_string_field(f, 'url')
+ +            return True
+ +
+ +        # Filter out malformed formats for better extraction robustness
+ +        formats = list(filter(is_wellformed, formats))
+ +
           formats_dict = {}
   
           # We check that all the formats have the format and format_id fields
           for i, format in enumerate(formats):
- -            if 'url' not in format:
- -                raise ExtractorError('Missing "url" key in result (index %d)' % i)
- -
+ +            sanitize_string_field(format, 'format_id')
+ +            sanitize_numeric_fields(format)
               format['url'] = sanitize_url(format['url'])
- -
- -            if format.get('format_id') is None:
+ +            if not format.get('format_id'):
                   format['format_id'] = compat_str(i)
               else:
                   # Sanitize format_id from characters used in format selector expression
@@@ -1555,10 -1489,14 +1555,10 @@@
   
           req_format = self.params.get('format')
           if req_format is None:
- -            req_format_list = []
- -            if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
- -                    not info_dict.get('is_live')):
- -                merger = FFmpegMergerPP(self)
- -                if merger.available and merger.can_merge():
- -                    req_format_list.append('bestvideo+bestaudio')
- -            req_format_list.append('best')
- -            req_format = '/'.join(req_format_list)
+ +            req_format = self._default_format_spec(info_dict, download=download)
+ +            if self.params.get('verbose'):
+ +                self.to_stdout('[debug] Default format spec: %s' % req_format)
+ +
           format_selector = self.build_format_selector(req_format)
   
           # While in format selection we may need to have an access to the original
@@@ -1710,17 -1648,12 +1710,17 @@@
           if filename is None:
               return
   
- -        try:
- -            dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
- -            if dn and not os.path.exists(dn):
- -                os.makedirs(dn)
- -        except (OSError, IOError) as err:
- -            self.report_error('unable to create directory ' + error_to_compat_str(err))
+ +        def ensure_dir_exists(path):
+ +            try:
+ +                dn = os.path.dirname(path)
+ +                if dn and not os.path.exists(dn):
+ +                    os.makedirs(dn)
+ +                return True
+ +            except (OSError, IOError) as err:
+ +                self.report_error('unable to create directory ' + error_to_compat_str(err))
+ +                return False
+ +
+ +        if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
               return
   
           if self.params.get('writedescription', False):
@@@ -1763,29 -1696,30 +1763,30 @@@
               ie = self.get_info_extractor(info_dict['extractor_key'])
               for sub_lang, sub_info in subtitles.items():
                   sub_format = sub_info['ext']
-                 if sub_info.get('data') is not None:
-                     sub_data = sub_info['data']
+                 sub_filename = subtitles_filename(filename, sub_lang, sub_format)
+                 if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
+                     self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                   else:
-                     try:
-                         sub_data = ie._download_webpage(
-                             sub_info['url'], info_dict['id'], note=False)
-                     except ExtractorError as err:
-                         self.report_warning('Unable to download subtitle for "%s": %s' %
-                                             (sub_lang, error_to_compat_str(err.cause)))
-                         continue
-                 try:
-                     sub_filename = subtitles_filename(filename, sub_lang, sub_format)
-                     if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
-                         self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format))
+                     self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
+                     if sub_info.get('data') is not None:
+                         try:
+                             # Use newline='' to prevent conversion of newline characters
+                             # See https://github.com/rg3/youtube-dl/issues/10268
+                             with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
+                                 subfile.write(sub_info['data'])
+                         except (OSError, IOError):
+                             self.report_error('Cannot write subtitles file ' + sub_filename)
+                             return
                       else:
-                         self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
-                         # Use newline='' to prevent conversion of newline characters
-                         # See https://github.com/rg3/youtube-dl/issues/10268
-                         with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
-                             subfile.write(sub_data)
-                 except (OSError, IOError):
-                     self.report_error('Cannot write subtitles file ' + sub_filename)
-                     return
+                         try:
+                             sub_data = ie._request_webpage(
+                                 sub_info['url'], info_dict['id'], note=False).read()
+                             with io.open(encodeFilename(sub_filename), 'wb') as subfile:
+                                 subfile.write(sub_data)
+                         except (ExtractorError, IOError, OSError, ValueError) as err:
+                             self.report_warning('Unable to download subtitle for "%s": %s' %
+                                                 (sub_lang, error_to_compat_str(err)))
+                             continue
   
           if self.params.get('writeinfojson', False):
               infofn = replace_extension(filename, 'info.json', info_dict.get('ext'))
@@@ -1858,11 -1792,8 +1859,11 @@@
                           for f in requested_formats:
                               new_info = dict(info_dict)
                               new_info.update(f)
- -                            fname = self.prepare_filename(new_info)
- -                            fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+ +                            fname = prepend_extension(
+ +                                self.prepare_filename(new_info),
+ +                                'f%s' % f['format_id'], new_info['ext'])
+ +                            if not ensure_dir_exists(fname):
+ +                                return
                               downloaded.append(fname)
                               partial_success = dl(fname, new_info)
                               success = success and partial_success
@@@ -1929,7 -1860,7 +1930,7 @@@
                           info_dict.get('protocol') == 'm3u8' and
                           self.params.get('hls_prefer_native')):
                       if fixup_policy == 'warn':
- -                        self.report_warning('%s: malformated aac bitstream.' % (
+ +                        self.report_warning('%s: malformed AAC bitstream detected.' % (
                               info_dict['id']))
                       elif fixup_policy == 'detect_or_warn':
                           fixup_pp = FFmpegFixupM3u8PP(self)
@@@ -1938,7 -1869,7 +1939,7 @@@
                               info_dict['__postprocessors'].append(fixup_pp)
                           else:
                               self.report_warning(
- -                                '%s: malformated aac bitstream. %s'
+ +                                '%s: malformed AAC bitstream detected. %s'
                                   % (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
                       else:
                           assert fixup_policy in ('ignore', 'never')
author	Yen Chi Hsuan <yan12125@gmail.com>
	Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)
committer	GitHub <noreply@github.com>
	Wed, 13 Sep 2017 09:36:40 +0000 (17:36 +0800)