[youtube] Fix uploader id and uploader URL extraction

[youtube-dl] / youtube_dl / YoutubeDL.py
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 046e032478340030ecf1ccea4b859ed45457fa6e..19370f62b0d3ddb91c74ae4b6cf6c569341fbdc5 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -82,14 +82,17 @@ from .utils import (
      sanitize_url,
      sanitized_Request,
      std_headers,
+    str_or_none,
      subtitles_filename,
      UnavailableVideoError,
      url_basename,
      version_tuple,
      write_json_file,
      write_string,
+    YoutubeDLCookieJar,
      YoutubeDLCookieProcessor,
      YoutubeDLHandler,
+    YoutubeDLRedirectHandler,
  )
  from .cache import Cache
  from .extractor import get_info_extractor, gen_extractor_classes, _LAZY_LOADER
@@ -211,7 +214,7 @@ class YoutubeDL(object):
                         At the moment, this is only supported by YouTube.
      proxy:             URL of the proxy server to use
      geo_verification_proxy:  URL of the proxy to use for IP address verification
-                       on geo-restricted sites. (Experimental)
+                       on geo-restricted sites.
      socket_timeout:    Time to wait for unresponsive hosts, in seconds
      bidi_workaround:   Work around buggy terminals without bidirectional text
                         support, using fridibi
@@ -259,7 +262,7 @@ class YoutubeDL(object):
                         - "warn": only emit a warning
                         - "detect_or_warn": check whether we can do anything
                                             about it, warn otherwise (default)
-    source_address:    (Experimental) Client-side IP address to bind to.
+    source_address:    Client-side IP address to bind to.
      call_home:         Boolean, true iff we are allowed to contact the
                         youtube-dl servers for debugging.
      sleep_interval:    Number of seconds to sleep before each download when
@@ -281,14 +284,14 @@ class YoutubeDL(object):
                         match_filter_func in utils.py is one example for this.
      no_color:          Do not emit color codes in output.
      geo_bypass:        Bypass geographic restriction via faking X-Forwarded-For
-                       HTTP header (experimental)
+                       HTTP header
      geo_bypass_country:
                         Two-letter ISO 3166-2 country code that will be used for
                         explicit geographic restriction bypassing via faking
-                       X-Forwarded-For HTTP header (experimental)
+                       X-Forwarded-For HTTP header
      geo_bypass_ip_block:
                         IP range in CIDR notation that will be used similarly to
-                       geo_bypass_country (experimental)
+                       geo_bypass_country
  
      The following options determine which downloader is picked:
      external_downloader: Executable of the external downloader to call.
@@ -305,8 +308,10 @@ class YoutubeDL(object):
      http_chunk_size.
  
      The following options are used by the post processors:
-    prefer_ffmpeg:     If True, use ffmpeg instead of avconv if both are available,
-                       otherwise prefer avconv.
+    prefer_ffmpeg:     If False, use avconv instead of ffmpeg if both are available,
+                       otherwise prefer ffmpeg.
+    ffmpeg_location:   Location of the ffmpeg/avconv binary; either the path
+                       to the binary or its containing directory.
      postprocessor_args: A list of additional command-line arguments for the
                          postprocessor.
  
@@ -396,9 +401,9 @@ class YoutubeDL(object):
                  else:
                      raise
  
-        if (sys.platform != 'win32' and
-                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
-                not params.get('restrictfilenames', False)):
+        if (sys.platform != 'win32'
+                and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
+                and not params.get('restrictfilenames', False)):
              # Unicode filesystem API will throw errors (#1474, #13027)
              self.report_warning(
                  'Assuming --restrict-filenames since file system encoding '
@@ -436,9 +441,9 @@ class YoutubeDL(object):
              if re.match(r'^-[0-9A-Za-z_-]{10}$', a)]
          if idxs:
              correct_argv = (
-                ['youtube-dl'] +
-                [a for i, a in enumerate(argv) if i not in idxs] +
-                ['--'] + [argv[i] for i in idxs]
+                ['youtube-dl']
+                + [a for i, a in enumerate(argv) if i not in idxs]
+                + ['--'] + [argv[i] for i in idxs]
              )
              self.report_warning(
                  'Long argument string detected. '
@@ -558,7 +563,7 @@ class YoutubeDL(object):
          self.restore_console_title()
  
          if self.params.get('cookiefile') is not None:
-            self.cookiejar.save()
+            self.cookiejar.save(ignore_discard=True, ignore_expires=True)
  
      def trouble(self, message=None, tb=None):
          """Determine action to take when a download problem appears.
@@ -846,10 +851,11 @@ class YoutubeDL(object):
          if result_type in ('url', 'url_transparent'):
              ie_result['url'] = sanitize_url(ie_result['url'])
              extract_flat = self.params.get('extract_flat', False)
-            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info) or
-                    extract_flat is True):
-                if self.params.get('forcejson', False):
-                    self.to_stdout(json.dumps(ie_result))
+            if ((extract_flat == 'in_playlist' and 'playlist' in extra_info)
+                    or extract_flat is True):
+                self.__forced_printings(
+                    ie_result, self.prepare_filename(ie_result),
+                    incomplete=True)
                  return ie_result
  
          if result_type == 'video':
@@ -887,7 +893,7 @@ class YoutubeDL(object):
              # url_transparent. In such cases outer metadata (from ie_result)
              # should be propagated to inner one (info). For this to happen
              # _type of info should be overridden with url_transparent. This
-            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+            # fixes issue from https://github.com/ytdl-org/youtube-dl/pull/11163.
              if new_result.get('_type') == 'url':
                  new_result['_type'] = 'url_transparent'
  
@@ -985,7 +991,7 @@ class YoutubeDL(object):
                      'playlist_title': ie_result.get('title'),
                      'playlist_uploader': ie_result.get('uploader'),
                      'playlist_uploader_id': ie_result.get('uploader_id'),
-                    'playlist_index': i + playliststart,
+                    'playlist_index': playlistitems[i - 1] if playlistitems else i + playliststart,
                      'extractor': ie_result['extractor'],
                      'webpage_url': ie_result['webpage_url'],
                      'webpage_url_basename': url_basename(ie_result['webpage_url']),
@@ -1062,21 +1068,24 @@ class YoutubeDL(object):
          if not m:
              STR_OPERATORS = {
                  '=': operator.eq,
-                '!=': operator.ne,
                  '^=': lambda attr, value: attr.startswith(value),
                  '$=': lambda attr, value: attr.endswith(value),
                  '*=': lambda attr, value: value in attr,
              }
              str_operator_rex = re.compile(r'''(?x)
                  \s*(?P<key>ext|acodec|vcodec|container|protocol|format_id)
-                \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?
+                \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?
                  \s*(?P<value>[a-zA-Z0-9._-]+)
                  \s*$
                  ''' % '|'.join(map(re.escape, STR_OPERATORS.keys())))
              m = str_operator_rex.search(filter_spec)
              if m:
                  comparison_value = m.group('value')
-                op = STR_OPERATORS[m.group('op')]
+                str_op = STR_OPERATORS[m.group('op')]
+                if m.group('negation'):
+                    op = lambda attr, value: not str_op(attr, value)
+                else:
+                    op = str_op
  
          if not m:
              raise ValueError('Invalid filter specification %r' % filter_spec)
@@ -1601,7 +1610,7 @@ class YoutubeDL(object):
          # by extractor are incomplete or not (i.e. whether extractor provides only
          # video-only or audio-only formats) for proper formats selection for
          # extractors with such incomplete formats (see
-        # https://github.com/rg3/youtube-dl/pull/5556).
+        # https://github.com/ytdl-org/youtube-dl/pull/5556).
          # Since formats may be filtered during format selection and may not match
          # the original formats the results may be incorrect. Thus original formats
          # or pre-calculated metrics should be passed to format selection routines
@@ -1609,12 +1618,12 @@ class YoutubeDL(object):
          # We will pass a context object containing all necessary additional data
          # instead of just formats.
          # This fixes incorrect format selection issue (see
-        # https://github.com/rg3/youtube-dl/issues/10083).
+        # https://github.com/ytdl-org/youtube-dl/issues/10083).
          incomplete_formats = (
              # All formats are video-only or
-            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or
+            all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats)
              # all formats are audio-only
-            all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
+            or all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats))
  
          ctx = {
              'formats': formats,
@@ -1686,6 +1695,36 @@ class YoutubeDL(object):
              subs[lang] = f
          return subs
  
+    def __forced_printings(self, info_dict, filename, incomplete):
+        def print_mandatory(field):
+            if (self.params.get('force%s' % field, False)
+                    and (not incomplete or info_dict.get(field) is not None)):
+                self.to_stdout(info_dict[field])
+
+        def print_optional(field):
+            if (self.params.get('force%s' % field, False)
+                    and info_dict.get(field) is not None):
+                self.to_stdout(info_dict[field])
+
+        print_mandatory('title')
+        print_mandatory('id')
+        if self.params.get('forceurl', False) and not incomplete:
+            if info_dict.get('requested_formats') is not None:
+                for f in info_dict['requested_formats']:
+                    self.to_stdout(f['url'] + f.get('play_path', ''))
+            else:
+                # For RTMP URLs, also include the playpath
+                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
+        print_optional('thumbnail')
+        print_optional('description')
+        if self.params.get('forcefilename', False) and filename is not None:
+            self.to_stdout(filename)
+        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
+            self.to_stdout(formatSeconds(info_dict['duration']))
+        print_mandatory('format')
+        if self.params.get('forcejson', False):
+            self.to_stdout(json.dumps(info_dict))
+
      def process_info(self, info_dict):
          """Process a single resolved IE result."""
  
@@ -1696,9 +1735,8 @@ class YoutubeDL(object):
              if self._num_downloads >= int(max_downloads):
                  raise MaxDownloadsReached()
  
+        # TODO: backward compatibility, to be removed
          info_dict['fulltitle'] = info_dict['title']
-        if len(info_dict['title']) > 200:
-            info_dict['title'] = info_dict['title'][:197] + '...'
  
          if 'format' not in info_dict:
              info_dict['format'] = info_dict['ext']
@@ -1713,29 +1751,7 @@ class YoutubeDL(object):
          info_dict['_filename'] = filename = self.prepare_filename(info_dict)
  
          # Forced printings
-        if self.params.get('forcetitle', False):
-            self.to_stdout(info_dict['fulltitle'])
-        if self.params.get('forceid', False):
-            self.to_stdout(info_dict['id'])
-        if self.params.get('forceurl', False):
-            if info_dict.get('requested_formats') is not None:
-                for f in info_dict['requested_formats']:
-                    self.to_stdout(f['url'] + f.get('play_path', ''))
-            else:
-                # For RTMP URLs, also include the playpath
-                self.to_stdout(info_dict['url'] + info_dict.get('play_path', ''))
-        if self.params.get('forcethumbnail', False) and info_dict.get('thumbnail') is not None:
-            self.to_stdout(info_dict['thumbnail'])
-        if self.params.get('forcedescription', False) and info_dict.get('description') is not None:
-            self.to_stdout(info_dict['description'])
-        if self.params.get('forcefilename', False) and filename is not None:
-            self.to_stdout(filename)
-        if self.params.get('forceduration', False) and info_dict.get('duration') is not None:
-            self.to_stdout(formatSeconds(info_dict['duration']))
-        if self.params.get('forceformat', False):
-            self.to_stdout(info_dict['format'])
-        if self.params.get('forcejson', False):
-            self.to_stdout(json.dumps(info_dict))
+        self.__forced_printings(info_dict, filename, incomplete=False)
  
          # Do nothing else if in simulate mode
          if self.params.get('simulate', False):
@@ -1776,6 +1792,8 @@ class YoutubeDL(object):
              annofn = replace_extension(filename, 'annotations.xml', info_dict.get('ext'))
              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(annofn)):
                  self.to_screen('[info] Video annotations are already present')
+            elif not info_dict.get('annotations'):
+                self.report_warning('There are no annotations to write.')
              else:
                  try:
                      self.to_screen('[info] Writing video annotations to: ' + annofn)
@@ -1797,7 +1815,7 @@ class YoutubeDL(object):
              ie = self.get_info_extractor(info_dict['extractor_key'])
              for sub_lang, sub_info in subtitles.items():
                  sub_format = sub_info['ext']
-                sub_filename = subtitles_filename(filename, sub_lang, sub_format)
+                sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
                  if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
                      self.to_screen('[info] Video subtitle %s.%s is already present' % (sub_lang, sub_format))
                  else:
@@ -1805,7 +1823,7 @@ class YoutubeDL(object):
                      if sub_info.get('data') is not None:
                          try:
                              # Use newline='' to prevent conversion of newline characters
-                            # See https://github.com/rg3/youtube-dl/issues/10268
+                            # See https://github.com/ytdl-org/youtube-dl/issues/10268
                              with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile:
                                  subfile.write(sub_info['data'])
                          except (OSError, IOError):
@@ -1940,8 +1958,8 @@ class YoutubeDL(object):
                      else:
                          assert fixup_policy in ('ignore', 'never')
  
-                if (info_dict.get('requested_formats') is None and
-                        info_dict.get('container') == 'm4a_dash'):
+                if (info_dict.get('requested_formats') is None
+                        and info_dict.get('container') == 'm4a_dash'):
                      if fixup_policy == 'warn':
                          self.report_warning(
                              '%s: writing DASH m4a. '
@@ -1960,9 +1978,9 @@ class YoutubeDL(object):
                      else:
                          assert fixup_policy in ('ignore', 'never')
  
-                if (info_dict.get('protocol') == 'm3u8_native' or
-                        info_dict.get('protocol') == 'm3u8' and
-                        self.params.get('hls_prefer_native')):
+                if (info_dict.get('protocol') == 'm3u8_native'
+                        or info_dict.get('protocol') == 'm3u8'
+                        and self.params.get('hls_prefer_native')):
                      if fixup_policy == 'warn':
                          self.report_warning('%s: malformed AAC bitstream detected.' % (
                              info_dict['id']))
@@ -1988,10 +2006,10 @@ class YoutubeDL(object):
      def download(self, url_list):
          """Download a given list of URLs."""
          outtmpl = self.params.get('outtmpl', DEFAULT_OUTTMPL)
-        if (len(url_list) > 1 and
-                outtmpl != '-' and
-                '%' not in outtmpl and
-                self.params.get('max_downloads') != 1):
+        if (len(url_list) > 1
+                and outtmpl != '-'
+                and '%' not in outtmpl
+                and self.params.get('max_downloads') != 1):
              raise SameFileError(outtmpl)
  
          for url in url_list:
@@ -2056,15 +2074,24 @@ class YoutubeDL(object):
                          self.report_warning('Unable to remove downloaded original file')
  
      def _make_archive_id(self, info_dict):
+        video_id = info_dict.get('id')
+        if not video_id:
+            return
          # Future-proof against any change in case
          # and backwards compatibility with prior versions
-        extractor = info_dict.get('extractor_key')
+        extractor = info_dict.get('extractor_key') or info_dict.get('ie_key')  # key in a playlist
          if extractor is None:
-            if 'id' in info_dict:
-                extractor = info_dict.get('ie_key')  # key in a playlist
-        if extractor is None:
-            return None  # Incomplete video information
-        return extractor.lower() + ' ' + info_dict['id']
+            url = str_or_none(info_dict.get('url'))
+            if not url:
+                return
+            # Try to find matching extractor for the URL and take its ie_key
+            for ie in self._ies:
+                if ie.suitable(url):
+                    extractor = ie.ie_key()
+                    break
+            else:
+                return
+        return extractor.lower() + ' ' + video_id
  
      def in_download_archive(self, info_dict):
          fn = self.params.get('download_archive')
@@ -2072,7 +2099,7 @@ class YoutubeDL(object):
              return False
  
          vid_id = self._make_archive_id(info_dict)
-        if vid_id is None:
+        if not vid_id:
              return False  # Incomplete video information
  
          try:
@@ -2127,8 +2154,8 @@ class YoutubeDL(object):
              if res:
                  res += ', '
              res += '%s container' % fdict['container']
-        if (fdict.get('vcodec') is not None and
-                fdict.get('vcodec') != 'none'):
+        if (fdict.get('vcodec') is not None
+                and fdict.get('vcodec') != 'none'):
              if res:
                  res += ', '
              res += fdict['vcodec']
@@ -2215,7 +2242,7 @@ class YoutubeDL(object):
              return
  
          if type('') is not compat_str:
-            # Python 2.6 on SLES11 SP1 (https://github.com/rg3/youtube-dl/issues/3326)
+            # Python 2.6 on SLES11 SP1 (https://github.com/ytdl-org/youtube-dl/issues/3326)
              self.report_warning(
                  'Your Python is broken! Update to a newer and supported version')
  
@@ -2297,10 +2324,9 @@ class YoutubeDL(object):
              self.cookiejar = compat_cookiejar.CookieJar()
          else:
              opts_cookiefile = expand_path(opts_cookiefile)
-            self.cookiejar = compat_cookiejar.MozillaCookieJar(
-                opts_cookiefile)
+            self.cookiejar = YoutubeDLCookieJar(opts_cookiefile)
              if os.access(opts_cookiefile, os.R_OK):
-                self.cookiejar.load()
+                self.cookiejar.load(ignore_discard=True, ignore_expires=True)
  
          cookie_processor = YoutubeDLCookieProcessor(self.cookiejar)
          if opts_proxy is not None:
@@ -2310,7 +2336,7 @@ class YoutubeDL(object):
                  proxies = {'http': opts_proxy, 'https': opts_proxy}
          else:
              proxies = compat_urllib_request.getproxies()
-            # Set HTTPS proxy to HTTP one if given (https://github.com/rg3/youtube-dl/issues/805)
+            # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805)
              if 'http' in proxies and 'https' not in proxies:
                  proxies['https'] = proxies['http']
          proxy_handler = PerRequestProxyHandler(proxies)
@@ -2318,12 +2344,13 @@ class YoutubeDL(object):
          debuglevel = 1 if self.params.get('debug_printtraffic') else 0
          https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel)
          ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel)
+        redirect_handler = YoutubeDLRedirectHandler()
          data_handler = compat_urllib_request_DataHandler()
  
          # When passing our own FileHandler instance, build_opener won't add the
          # default FileHandler and allows us to disable the file protocol, which
          # can be used for malicious purposes (see
-        # https://github.com/rg3/youtube-dl/issues/8227)
+        # https://github.com/ytdl-org/youtube-dl/issues/8227)
          file_handler = compat_urllib_request.FileHandler()
  
          def file_open(*args, **kwargs):
@@ -2331,11 +2358,11 @@ class YoutubeDL(object):
          file_handler.file_open = file_open
  
          opener = compat_urllib_request.build_opener(
-            proxy_handler, https_handler, cookie_processor, ydlh, data_handler, file_handler)
+            proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler)
  
          # Delete the default user-agent header, which would otherwise apply in
          # cases where our custom HTTP handler doesn't come into play
-        # (See https://github.com/rg3/youtube-dl/issues/1309 for details)
+        # (See https://github.com/ytdl-org/youtube-dl/issues/1309 for details)
          opener.addheaders = []
          self._opener = opener