Merge remote-tracking branch 'jaimeMF/format_selection'

[youtube-dl] / youtube_dl / YoutubeDL.py
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 3fc4ec378d2334f35a74949e21cfde592a2e64c7..f22a8bd0e044b9c10ecad56187a15a310a4c1d7d 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -3,6 +3,7 @@
  
  from __future__ import absolute_import
  
+import errno
  import io
  import os
  import re
@@ -70,16 +71,26 @@ class YoutubeDL(object):
      logtostderr:       Log messages to stderr instead of stdout.
      writedescription:  Write the video description to a .description file
      writeinfojson:     Write the video description to a .info.json file
+    writeannotations:  Write the video annotations to a .annotations.xml file
      writethumbnail:    Write the thumbnail image to a file
      writesubtitles:    Write the video subtitles to a file
      writeautomaticsub: Write the automatic subtitles to a file
      allsubtitles:      Downloads all the subtitles of the video
+                       (requires writesubtitles or writeautomaticsub)
      listsubtitles:     Lists all available subtitles for the video
      subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
      subtitleslangs:    List of languages of the subtitles to download
      keepvideo:         Keep the video file after post-processing
      daterange:         A DateRange object, download only if the upload_date is in the range.
      skip_download:     Skip the actual download of the video file
+    cachedir:          Location of the cache files in the filesystem.
+                       None to disable filesystem cache.
+    noplaylist:        Download single video instead of a playlist if in doubt.
+    age_limit:         An integer representing the user's age in years.
+                       Unsuitable videos for the given age are skipped.
+    download_archive:  File name of a file where all downloads are recorded.
+                       Videos already present in the file are not downloaded
+                       again.
      
      The following parameters are not used by YoutubeDL itself, they are used by
      the FileDownloader:
@@ -97,11 +108,23 @@ class YoutubeDL(object):
      def __init__(self, params):
          """Create a FileDownloader object with the given options."""
          self._ies = []
+        self._ies_instances = {}
          self._pps = []
          self._progress_hooks = []
          self._download_retcode = 0
          self._num_downloads = 0
          self._screen_file = [sys.stdout, sys.stderr][params.get('logtostderr', False)]
+
+        if (sys.version_info >= (3,) and sys.platform != 'win32' and
+                sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968']
+                and not params['restrictfilenames']):
+            # On Python 3, the Unicode filesystem API will throw errors (#1474)
+            self.report_warning(
+                u'Assuming --restrict-filenames since file system encoding '
+                u'cannot encode all charactes. '
+                u'Set the LC_ALL environment variable to fix this.')
+            params['restrictfilenames'] = True
+
          self.params = params
          self.fd = FileDownloader(self, self.params)
  
@@ -111,8 +134,21 @@ class YoutubeDL(object):
      def add_info_extractor(self, ie):
          """Add an InfoExtractor object to the end of the list."""
          self._ies.append(ie)
+        self._ies_instances[ie.ie_key()] = ie
          ie.set_downloader(self)
  
+    def get_info_extractor(self, ie_key):
+        """
+        Return the extractor instance registered under ie_key, creating one
+        and adding it to this downloader when none is registered yet.
+        """
+        cached = self._ies_instances.get(ie_key)
+        if cached is not None:
+            return cached
+        new_ie = get_info_extractor(ie_key)()
+        self.add_info_extractor(new_ie)
+        return new_ie
+
      def add_default_info_extractors(self):
          """
          Add the InfoExtractors returned by gen_extractors to the end of the list
@@ -127,14 +163,10 @@ class YoutubeDL(object):
  
      def to_screen(self, message, skip_eol=False):
          """Print message to stdout if not in quiet mode."""
-        assert type(message) == type(u'')
          if not self.params.get('quiet', False):
              terminator = [u'\n', u''][skip_eol]
              output = message + terminator
-            if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr
-                output = output.encode(preferredencoding(), 'ignore')
-            self._screen_file.write(output)
-            self._screen_file.flush()
+            write_string(output, self._screen_file)
  
      def to_stderr(self, message):
          """Print message to stderr."""
@@ -227,6 +259,10 @@ class YoutubeDL(object):
          """ Report that the metadata file has been written """
          self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn)
  
+    def report_writeannotations(self, annofn):
+        """ Report that the annotations file is about to be written. """
+        self.to_screen(u'[info] Writing video annotations to: ' + annofn)
+
      def report_file_already_downloaded(self, file_name):
          """Report file has already been fully downloaded."""
          try:
@@ -284,6 +320,13 @@ class YoutubeDL(object):
              dateRange = self.params.get('daterange', DateRange())
              if date not in dateRange:
                  return u'[download] %s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange)
+        age_limit = self.params.get('age_limit')
+        if age_limit is not None:
+            if age_limit < info_dict.get('age_limit', 0):
+                return u'Skipping "' + title + '" because it is age restricted'
+        if self.in_download_archive(info_dict):
+            return (u'%(title)s has already been recorded in archive'
+                    % info_dict)
          return None
          
      def extract_info(self, url, download=True, ie_key=None, extra_info={}):
@@ -294,9 +337,7 @@ class YoutubeDL(object):
           '''
          
          if ie_key:
-            ie = get_info_extractor(ie_key)()
-            ie.set_downloader(self)
-            ies = [ie]
+            ies = [self.get_info_extractor(ie_key)]
          else:
              ies = self._ies
  
@@ -349,13 +390,7 @@ class YoutubeDL(object):
          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
          if result_type == 'video':
              ie_result.update(extra_info)
-            if 'playlist' not in ie_result:
-                # It isn't part of a playlist
-                ie_result['playlist'] = None
-                ie_result['playlist_index'] = None
-            if download:
-                self.process_info(ie_result)
-            return ie_result
+            return self.process_video_result(ie_result)
          elif result_type == 'url':
              # We have to add extra_info to the results because it may be
              # contained in a playlist
@@ -413,6 +448,89 @@ class YoutubeDL(object):
          else:
              raise Exception('Invalid result type: %s' % result_type)
  
+    def process_video_result(self, info_dict, download=True):
+        """
+        Select and process the format(s) of a video result. Returns the updated
+        info_dict (None when listing formats); raises ExtractorError if none match.
+        """
+        assert info_dict.get('_type', 'video') == 'video'
+
+        if 'playlist' not in info_dict:
+            # It isn't part of a playlist
+            info_dict['playlist'] = None
+            info_dict['playlist_index'] = None
+
+        # These extractors handle format selection themselves
+        if info_dict['extractor'] in [u'youtube', u'Youku', u'YouPorn', u'mixcloud']:
+            if download:
+                self.process_info(info_dict)
+            return info_dict
+
+        # Without a 'formats' list, the info dict itself is the only format
+        formats = info_dict.get('formats')
+        if formats is None:
+            formats = [info_dict]
+
+        # We check that all the formats have the format and format_id fields
+        for (i, fmt) in enumerate(formats):
+            if fmt.get('format') is None:
+                if fmt.get('height') is None:
+                    fmt['format'] = '???'
+                elif fmt.get('width') is None:
+                    fmt['format'] = u'%sp' % fmt['height']
+                else:
+                    fmt['format'] = u'%sx%s' % (fmt['width'], fmt['height'])
+            if fmt.get('format_id') is None:
+                fmt['format_id'] = compat_str(i)
+
+        if self.params.get('listformats', None):
+            self.list_formats(info_dict)
+            return
+
+        format_limit = self.params.get('format_limit', None)
+        if format_limit:
+            formats = [f for f in formats if f['format_id'] <= format_limit]
+        if self.params.get('prefer_free_formats'):
+            def _free_formats_key(f):
+                try:
+                    ext_ord = [u'flv', u'mp4', u'webm'].index(f['ext'])
+                except ValueError:
+                    ext_ord = -1
+                # We only compare the extension if they have the same height and width
+                return (f.get('height'), f.get('width'), ext_ord)
+            formats = sorted(formats, key=_free_formats_key)
+
+        req_format = self.params.get('format', 'best')
+        formats_to_download = []
+        if req_format == 'best' or req_format is None:
+            formats_to_download = formats[-1:]  # slice: an empty list selects nothing
+        elif req_format == 'worst':
+            formats_to_download = formats[:1]  # slice: an empty list selects nothing
+        # The -1 is for supporting YoutubeIE
+        elif req_format in ('-1', 'all'):
+            formats_to_download = formats
+        else:
+            # Requested formats may be given as 34/10/5; we pick the first one
+            # that is available, starting from the left
+            for rf in req_format.split('/'):
+                matches = [f for f in formats if f['format_id'] == rf]  # a list: filter() is lazy on Python 3
+                if matches:
+                    formats_to_download = [matches[0]]
+                    break
+        if not formats_to_download:
+            raise ExtractorError(u'requested format not available')
+
+        if download:
+            if len(formats_to_download) > 1:
+                self.to_screen(u'[info] %s: downloading video in %s formats' % (info_dict['id'], len(formats_to_download)))
+            for fmt in formats_to_download:
+                new_info = dict(info_dict)
+                new_info.update(fmt)
+                self.process_info(new_info)
+        # We update the info dict with the best quality format (backwards compatibility)
+        info_dict.update(formats_to_download[-1])
+        return info_dict
+
      def process_info(self, info_dict):
          """Process a single resolved IE result."""
  
@@ -448,7 +566,8 @@ class YoutubeDL(object):
          if self.params.get('forceid', False):
              compat_print(info_dict['id'])
          if self.params.get('forceurl', False):
-            compat_print(info_dict['url'])
+            # For RTMP URLs, also include the playpath
+            compat_print(info_dict['url'] + info_dict.get('play_path', u''))
          if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
              compat_print(info_dict['thumbnail'])
          if self.params.get('forcedescription', False) and 'description' in info_dict:
@@ -479,13 +598,26 @@ class YoutubeDL(object):
                  self.report_writedescription(descfn)
                  with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile:
                      descfile.write(info_dict['description'])
+            except (KeyError, TypeError):
+                self.report_warning(u'There\'s no description to write.')
              except (OSError, IOError):
                  self.report_error(u'Cannot write description file ' + descfn)
                  return
  
+        if self.params.get('writeannotations', False):
+            try:
+               annofn = filename + u'.annotations.xml'
+               self.report_writeannotations(annofn)
+               with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile:
+                   annofile.write(info_dict['annotations'])
+            except (KeyError, TypeError):
+                self.report_warning(u'There are no annotations to write.')
+            except (OSError, IOError):
+                 self.report_error(u'Cannot write annotations file: ' + annofn)
+                 return
+
          subtitles_are_requested = any([self.params.get('writesubtitles', False),
-                                       self.params.get('writeautomaticsub'),
-                                       self.params.get('allsubtitles', False)])
+                                       self.params.get('writeautomaticsub')])
  
          if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
              # subtitles download errors are already managed as troubles in relevant IE
@@ -521,11 +653,15 @@ class YoutubeDL(object):
                  thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
                  self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                                 (info_dict['extractor'], info_dict['id']))
-                uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
-                with open(thumb_filename, 'wb') as thumbf:
-                    shutil.copyfileobj(uf, thumbf)
-                self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
-                               (info_dict['extractor'], info_dict['id'], thumb_filename))
+                try:
+                    uf = compat_urllib_request.urlopen(info_dict['thumbnail'])
+                    with open(thumb_filename, 'wb') as thumbf:
+                        shutil.copyfileobj(uf, thumbf)
+                    self.to_screen(u'[%s] %s: Writing thumbnail to: %s' %
+                        (info_dict['extractor'], info_dict['id'], thumb_filename))
+                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                    self.report_warning(u'Unable to download thumbnail "%s": %s' %
+                        (info_dict['thumbnail'], compat_str(err)))
  
          if not self.params.get('skip_download', False):
              if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(filename)):
@@ -533,11 +669,11 @@ class YoutubeDL(object):
              else:
                  try:
                      success = self.fd._do_download(filename, info_dict)
-                except (OSError, IOError) as err:
-                    raise UnavailableVideoError(err)
                  except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                      self.report_error(u'unable to download video data: %s' % str(err))
                      return
+                except (OSError, IOError) as err:
+                    raise UnavailableVideoError(err)
                  except (ContentTooShortError, ) as err:
                      self.report_error(u'content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
                      return
@@ -549,6 +685,8 @@ class YoutubeDL(object):
                      self.report_error(u'postprocessing: %s' % str(err))
                      return
  
+        self.record_download_archive(info_dict)
+
      def download(self, url_list):
          """Download a given list of URLs."""
          if len(url_list) > 1 and self.fixed_template():
@@ -588,3 +726,40 @@ class YoutubeDL(object):
                  os.remove(encodeFilename(filename))
              except (IOError, OSError):
                  self.report_warning(u'Unable to remove downloaded video file')
+
+    def in_download_archive(self, info_dict):
+        """Return True if the video is listed in the 'download_archive' file."""
+        archive_fn = self.params.get('download_archive')
+        if archive_fn is None:
+            return False
+        entry = info_dict['extractor'] + u' ' + info_dict['id']
+        try:
+            with locked_file(archive_fn, 'r', encoding='utf-8') as archive_file:
+                return any(line.strip() == entry for line in archive_file)
+        except IOError as ioe:
+            # A missing archive file is not an error; any other failure is re-raised
+            if ioe.errno != errno.ENOENT:
+                raise
+        return False
+
+    def record_download_archive(self, info_dict):
+        """Append the video's '<extractor> <id>' line to the 'download_archive' file."""
+        archive_fn = self.params.get('download_archive')
+        if archive_fn is not None:
+            entry = info_dict['extractor'] + u' ' + info_dict['id'] + u'\n'
+            with locked_file(archive_fn, 'a', encoding='utf-8') as archive_file:
+                archive_file.write(entry)
+
+    def list_formats(self, info_dict):
+        """Print the video's formats, labelling the first as worst and the last as best."""
+        formats = info_dict.get('formats')
+        if formats is None:
+            formats = [info_dict]  # no list: the info dict itself is the only format
+        formats_s = ["%s\t:\t%s\t[%s]" % (f['format_id'], f['ext'], f.get('format', '???'))
+                     for f in formats]
+        # An empty or single-entry list gets no worst/best labels
+        if len(formats_s) > 1:
+            formats_s[0]  += ' (worst)'
+            formats_s[-1] += ' (best)'
+        formats_s = "\n".join(formats_s)
+        self.to_screen(u"[info] Available formats for %s:\nformat code\textension\n%s" % (info_dict['id'], formats_s))