Merge remote-tracking branch 'origin/reuse_ies'
author Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Aug 2013 11:05:21 +0000 (13:05 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Aug 2013 11:05:21 +0000 (13:05 +0200)
1  2 
youtube_dl/YoutubeDL.py
youtube_dl/extractor/common.py

diff --combined youtube_dl/YoutubeDL.py
index d5f7c81eb4189ae6eb975abaaef860f68910b4ff,cd3d6ea7b5b196abc4fc4ba4ed2b4077cbe899b6..b289bd9e26bbc9993e6f1295a31d20b3275f5f48
@@@ -76,7 -76,7 +76,7 @@@ class YoutubeDL(object)
      allsubtitles:      Downloads all the subtitles of the video
      listsubtitles:     Lists all available subtitles for the video
      subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
 -    subtitleslang:     Language of the subtitles to download
 +    subtitleslangs:    List of languages of the subtitles to download
      keepvideo:         Keep the video file after post-processing
      daterange:         A DateRange object, download only if the upload_date is in the range.
      skip_download:     Skip the actual download of the video file
@@@ -97,6 -97,7 +97,7 @@@
      def __init__(self, params):
          """Create a FileDownloader object with the given options."""
          self._ies = []
+         self._ies_instances = {}
          self._pps = []
          self._progress_hooks = []
          self._download_retcode = 0
      def add_info_extractor(self, ie):
          """Add an InfoExtractor object to the end of the list."""
          self._ies.append(ie)
+         self._ies_instances[ie.ie_key()] = ie
          ie.set_downloader(self)
  
+     def get_info_extractor(self, ie_key):
+         """
+         Return the InfoExtractor instance registered under ie_key.
+         The instance is looked up in the _ies_instances dictionary. If none
+         is registered yet, the class returned by the module-level
+         get_info_extractor(ie_key) is instantiated, registered through
+         add_info_extractor() and then returned.
+         """
+         ie = self._ies_instances.get(ie_key)
+         if ie is None:
+             # Not cached yet: build the extractor and register it for reuse
+             ie = get_info_extractor(ie_key)()
+             self.add_info_extractor(ie)
+         return ie
      def add_default_info_extractors(self):
          """
          Add the InfoExtractors returned by gen_extractors to the end of the list
              self.report_error(u'Erroneous output template')
              return None
          except ValueError as err:
 -            self.report_error(u'Insufficient system charset ' + repr(preferredencoding()))
 +            self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
              return None
  
      def _match_entry(self, info_dict):
           '''
          
          if ie_key:
-             ie = get_info_extractor(ie_key)()
-             ie.set_downloader(self)
-             ies = [ie]
+             ies = [self.get_info_extractor(ie_key)]
          else:
              ies = self._ies
  
  
          result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
          if result_type == 'video':
 +            ie_result.update(extra_info)
              if 'playlist' not in ie_result:
                  # It isn't part of a playlist
                  ie_result['playlist'] = None
          if self.params.get('forceid', False):
              compat_print(info_dict['id'])
          if self.params.get('forceurl', False):
 -            compat_print(info_dict['url'])
 +            # For RTMP URLs, also include the playpath
 +            compat_print(info_dict['url'] + info_dict.get('play_path', u''))
          if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
              compat_print(info_dict['thumbnail'])
          if self.params.get('forcedescription', False) and 'description' in info_dict:
                  self.report_error(u'Cannot write description file ' + descfn)
                  return
  
 -        if (self.params.get('writesubtitles', False) or self.params.get('writeautomaticsub')) and 'subtitles' in info_dict and info_dict['subtitles']:
 +        subtitles_are_requested = any([self.params.get('writesubtitles', False),
 +                                       self.params.get('writeautomaticsub'),
 +                                       self.params.get('allsubtitles', False)])
 +
 +        if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
              # subtitles download errors are already managed as troubles in relevant IE
              # that way it will silently go on when used with unsupporting IE
 -            subtitle = info_dict['subtitles'][0]
 -            (sub_error, sub_lang, sub) = subtitle
 +            subtitles = info_dict['subtitles']
              sub_format = self.params.get('subtitlesformat')
 -            if sub_error:
 -                self.report_warning("Some error while getting the subtitles")
 -            else:
 +            for sub_lang in subtitles.keys():
 +                sub = subtitles[sub_lang]
 +                if sub is None:
 +                    continue
                  try:
 -                    sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 +                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                      self.report_writesubtitles(sub_filename)
                      with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 -                        subfile.write(sub)
 +                            subfile.write(sub)
                  except (OSError, IOError):
                      self.report_error(u'Cannot write subtitles file ' + descfn)
                      return
  
 -        if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
 -            subtitles = info_dict['subtitles']
 -            sub_format = self.params.get('subtitlesformat')
 -            for subtitle in subtitles:
 -                (sub_error, sub_lang, sub) = subtitle
 -                if sub_error:
 -                    self.report_warning("Some error while getting the subtitles")
 -                else:
 -                    try:
 -                        sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
 -                        self.report_writesubtitles(sub_filename)
 -                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
 -                                subfile.write(sub)
 -                    except (OSError, IOError):
 -                        self.report_error(u'Cannot write subtitles file ' + descfn)
 -                        return
 -
          if self.params.get('writeinfojson', False):
              infofn = filename + u'.info.json'
              self.report_writeinfojson(infofn)
                  return
  
          if self.params.get('writethumbnail', False):
 -            if 'thumbnail' in info_dict:
 -                thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
 -                if not thumb_format:
 -                    thumb_format = 'jpg'
 +            if info_dict.get('thumbnail') is not None:
 +                thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
                  thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
                  self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                                 (info_dict['extractor'], info_dict['id']))
                  try:
                      success = self.fd._do_download(filename, info_dict)
                  except (OSError, IOError) as err:
 -                    raise UnavailableVideoError()
 +                    raise UnavailableVideoError(err)
                  except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                      self.report_error(u'unable to download video data: %s' % str(err))
                      return
                          # No clear decision yet, let IE decide
                          keep_video = keep_video_wish
              except PostProcessingError as e:
 -                self.to_stderr(u'ERROR: ' + e.msg)
 +                self.report_error(e.msg)
          if keep_video is False and not self.params.get('keepvideo', False):
              try:
                  self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
diff --combined youtube_dl/extractor/common.py
index 12169b2bb9209dd901d280e02ce10349e1cad6b4,236c7b12c939743e2a1db4d1155222b43464f673..77a13aea533d17aa57f17e01929ca3a276787844
@@@ -14,7 -14,6 +14,7 @@@ from ..utils import 
      clean_html,
      compiled_regex_type,
      ExtractorError,
 +    unescapeHTML,
  )
  
  class InfoExtractor(object):
@@@ -47,8 -46,7 +47,8 @@@
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      player_url:     SWF Player URL (used for rtmpdump).
 -    subtitles:      The subtitle file contents.
 +    subtitles:      The subtitle file contents as a dictionary in the format
 +                    {language: subtitles}.
      view_count:     How many users have watched the video on the platform.
      urlhandle:      [internal] The urlHandle to be used to download the file,
                      like returned by urllib.request.urlopen
      @classmethod
      def suitable(cls, url):
          """Receives a URL and returns True if suitable for this IE."""
 -        return re.match(cls._VALID_URL, url) is not None
 +
 +        # This does not use has/getattr intentionally - we want to know whether
 +        # we have cached the regexp for *this* class, whereas getattr would also
 +        # match the superclass
 +        if '_VALID_URL_RE' not in cls.__dict__:
 +            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 +        return cls._VALID_URL_RE.match(url) is not None
  
      @classmethod
      def working(cls):
          """Real extraction process. Redefine in subclasses."""
          pass
  
+     @classmethod
+     def ie_key(cls):
+         """A string for getting the InfoExtractor with get_info_extractor"""
+         # The class name minus its last two characters (presumably the
+         # "IE" suffix of extractor class names - confirm against subclasses)
+         return cls.__name__[:-2]
      @property
      def IE_NAME(self):
          return type(self).__name__[:-2]
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              if errnote is None:
                  errnote = u'Unable to download webpage'
 -            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
 +            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
  
      def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
          """ Returns a tuple (page content as string, URL handle) """
 +
 +        # Strip hashes from the URL (#1038)
 +        if isinstance(url_or_request, (compat_str, str)):
 +            url_or_request = url_or_request.partition('#')[0]
 +
          urlh = self._request_webpage(url_or_request, video_id, note, errnote)
          content_type = urlh.headers.get('Content-Type', '')
          m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
          self.to_screen(u'Logging in')
  
      #Methods for following #608
 -    #They set the correct value of the '_type' key
 -    def video_result(self, video_info):
 -        """Returns a video"""
 -        video_info['_type'] = 'video'
 -        return video_info
      def url_result(self, url, ie=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
          
          return (username, password)
  
 +    # Helper functions for extracting OpenGraph info
 +    @staticmethod
 +    def _og_regex(prop):
 +        """
 +        Return a regular expression (as a string) matching a <meta> tag whose
 +        property attribute is og:<prop>; prop is escaped with re.escape.
 +        The content value is captured in group 1 when it is double-quoted
 +        and in group 2 when it is single-quoted. The pattern expects the
 +        property attribute to appear before the content attribute.
 +        """
 +        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
 +
 +    def _og_search_property(self, prop, html, name=None, **kargs):
 +        """
 +        Search html for the OpenGraph property prop and return its value
 +        passed through unescapeHTML.
 +        name is the label handed to _search_regex; it defaults to
 +        'OpenGraph <prop>'. Any extra keyword arguments are forwarded to
 +        _search_regex, which is always called with flags=re.DOTALL.
 +        """
 +        if name is None:
 +            name = 'OpenGraph %s' % prop
 +        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
 +        # NOTE(review): if _search_regex can return None (e.g. for a
 +        # non-fatal miss), None reaches unescapeHTML - confirm it copes
 +        return unescapeHTML(escaped)
 +
 +    def _og_search_thumbnail(self, html, **kargs):
 +        """
 +        Look up the og:image property in html, labelled 'thumbnail url'.
 +        fatal=False is always passed, so callers must not supply fatal
 +        themselves (it would be a duplicate keyword argument).
 +        """
 +        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
 +
 +    def _og_search_description(self, html, **kargs):
 +        """
 +        Look up the og:description property in html.
 +        fatal=False is always passed, so callers must not supply fatal
 +        themselves (it would be a duplicate keyword argument).
 +        """
 +        return self._og_search_property('description', html, fatal=False, **kargs)
 +
 +    def _og_search_title(self, html, **kargs):
 +        """
 +        Look up the og:title property in html; extra keyword arguments are
 +        forwarded unchanged to _og_search_property.
 +        """
 +        return self._og_search_property('title', html, **kargs)
 +
 +    def _og_search_video_url(self, html, name='video url', **kargs):
 +        """
 +        Search html for the OpenGraph video URL via _html_search_regex.
 +        Two patterns are supplied, og:video:secure_url first and og:video
 +        second (presumably tried in that order - confirm in
 +        _html_search_regex). name and any extra keyword arguments are
 +        forwarded to _html_search_regex.
 +        """
 +        return self._html_search_regex([self._og_regex('video:secure_url'),
 +                                        self._og_regex('video')],
 +                                       html, name, **kargs)
 +
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.