Merge remote-tracking branch 'origin/reuse_ies'

author Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Aug 2013 11:05:21 +0000 (13:05 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Wed, 28 Aug 2013 11:05:21 +0000 (13:05 +0200)
diff --combined youtube_dl/YoutubeDL.py

index d5f7c81eb4189ae6eb975abaaef860f68910b4ff,cd3d6ea7b5b196abc4fc4ba4ed2b4077cbe899b6..b289bd9e26bbc9993e6f1295a31d20b3275f5f48
--- 1/youtube_dl/YoutubeDL.py
--- 2/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@@ -76,7 -76,7 +76,7 @@@ class YoutubeDL(object)
       allsubtitles:      Downloads all the subtitles of the video
       listsubtitles:     Lists all available subtitles for the video
       subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
- -    subtitleslang:     Language of the subtitles to download
+ +    subtitleslangs:    List of languages of the subtitles to download
       keepvideo:         Keep the video file after post-processing
       daterange:         A DateRange object, download only if the upload_date is in the range.
       skip_download:     Skip the actual download of the video file
@@@ -97,6 -97,7 +97,7 @@@
       def __init__(self, params):
           """Create a FileDownloader object with the given options."""
           self._ies = []
+         self._ies_instances = {}
           self._pps = []
           self._progress_hooks = []
           self._download_retcode = 0
@@@ -111,8 -112,21 +112,21 @@@
       def add_info_extractor(self, ie):
           """Add an InfoExtractor object to the end of the list."""
           self._ies.append(ie)
+         self._ies_instances[ie.ie_key()] = ie
           ie.set_downloader(self)
   
+     def get_info_extractor(self, ie_key):
+         """
+         Get an instance of an IE with name ie_key, it will try to get one from
+         the _ies list, if there's no instance it will create a new one and add
+         it to the extractor list.
+         """
+         ie = self._ies_instances.get(ie_key)
+         if ie is None:
+             ie = get_info_extractor(ie_key)()
+             self.add_info_extractor(ie)
+         return ie
+ 
       def add_default_info_extractors(self):
           """
           Add the InfoExtractors returned by gen_extractors to the end of the list
@@@ -264,7 -278,7 +278,7 @@@
               self.report_error(u'Erroneous output template')
               return None
           except ValueError as err:
- -            self.report_error(u'Insufficient system charset ' + repr(preferredencoding()))
+ +            self.report_error(u'Error in output template: ' + str(err) + u' (encoding: ' + repr(preferredencoding()) + ')')
               return None
   
       def _match_entry(self, info_dict):
@@@ -294,9 -308,7 +308,7 @@@
            '''
           
           if ie_key:
-             ie = get_info_extractor(ie_key)()
-             ie.set_downloader(self)
-             ies = [ie]
+             ies = [self.get_info_extractor(ie_key)]
           else:
               ies = self._ies
   
@@@ -348,7 -360,6 +360,7 @@@
   
           result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
           if result_type == 'video':
+ +            ie_result.update(extra_info)
               if 'playlist' not in ie_result:
                   # It isn't part of a playlist
                   ie_result['playlist'] = None
@@@ -448,8 -459,7 +460,8 @@@
           if self.params.get('forceid', False):
               compat_print(info_dict['id'])
           if self.params.get('forceurl', False):
- -            compat_print(info_dict['url'])
+ +            # For RTMP URLs, also include the playpath
+ +            compat_print(info_dict['url'] + info_dict.get('play_path', u''))
           if self.params.get('forcethumbnail', False) and 'thumbnail' in info_dict:
               compat_print(info_dict['thumbnail'])
           if self.params.get('forcedescription', False) and 'description' in info_dict:
@@@ -484,28 -494,41 +496,28 @@@
                   self.report_error(u'Cannot write description file ' + descfn)
                   return
   
- -        if (self.params.get('writesubtitles', False) or self.params.get('writeautomaticsub')) and 'subtitles' in info_dict and info_dict['subtitles']:
+ +        subtitles_are_requested = any([self.params.get('writesubtitles', False),
+ +                                       self.params.get('writeautomaticsub'),
+ +                                       self.params.get('allsubtitles', False)])
+ +
+ +        if  subtitles_are_requested and 'subtitles' in info_dict and info_dict['subtitles']:
               # subtitles download errors are already managed as troubles in relevant IE
               # that way it will silently go on when used with unsupporting IE
- -            subtitle = info_dict['subtitles'][0]
- -            (sub_error, sub_lang, sub) = subtitle
+ +            subtitles = info_dict['subtitles']
               sub_format = self.params.get('subtitlesformat')
- -            if sub_error:
- -                self.report_warning("Some error while getting the subtitles")
- -            else:
+ +            for sub_lang in subtitles.keys():
+ +                sub = subtitles[sub_lang]
+ +                if sub is None:
+ +                    continue
                   try:
- -                    sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
+ +                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                       self.report_writesubtitles(sub_filename)
                       with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
- -                        subfile.write(sub)
+ +                            subfile.write(sub)
                   except (OSError, IOError):
                       self.report_error(u'Cannot write subtitles file ' + descfn)
                       return
   
- -        if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
- -            subtitles = info_dict['subtitles']
- -            sub_format = self.params.get('subtitlesformat')
- -            for subtitle in subtitles:
- -                (sub_error, sub_lang, sub) = subtitle
- -                if sub_error:
- -                    self.report_warning("Some error while getting the subtitles")
- -                else:
- -                    try:
- -                        sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
- -                        self.report_writesubtitles(sub_filename)
- -                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
- -                                subfile.write(sub)
- -                    except (OSError, IOError):
- -                        self.report_error(u'Cannot write subtitles file ' + descfn)
- -                        return
- -
           if self.params.get('writeinfojson', False):
               infofn = filename + u'.info.json'
               self.report_writeinfojson(infofn)
@@@ -517,8 -540,10 +529,8 @@@
                   return
   
           if self.params.get('writethumbnail', False):
- -            if 'thumbnail' in info_dict:
- -                thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
- -                if not thumb_format:
- -                    thumb_format = 'jpg'
+ +            if info_dict.get('thumbnail') is not None:
+ +                thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
                   thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
                   self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
                                  (info_dict['extractor'], info_dict['id']))
@@@ -535,7 -560,7 +547,7 @@@
                   try:
                       success = self.fd._do_download(filename, info_dict)
                   except (OSError, IOError) as err:
- -                    raise UnavailableVideoError()
+ +                    raise UnavailableVideoError(err)
                   except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                       self.report_error(u'unable to download video data: %s' % str(err))
                       return
@@@ -582,7 -607,7 +594,7 @@@
                           # No clear decision yet, let IE decide
                           keep_video = keep_video_wish
               except PostProcessingError as e:
- -                self.to_stderr(u'ERROR: ' + e.msg)
+ +                self.report_error(e.msg)
           if keep_video is False and not self.params.get('keepvideo', False):
               try:
                   self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
diff --combined youtube_dl/extractor/common.py

index 12169b2bb9209dd901d280e02ce10349e1cad6b4,236c7b12c939743e2a1db4d1155222b43464f673..77a13aea533d17aa57f17e01929ca3a276787844
--- 1/youtube_dl/extractor/common.py
--- 2/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@@ -14,7 -14,6 +14,7 @@@ from ..utils import 
       clean_html,
       compiled_regex_type,
       ExtractorError,
+ +    unescapeHTML,
   )
   
   class InfoExtractor(object):
@@@ -47,8 -46,7 +47,8 @@@
       uploader_id:    Nickname or id of the video uploader.
       location:       Physical location of the video.
       player_url:     SWF Player URL (used for rtmpdump).
- -    subtitles:      The subtitle file contents.
+ +    subtitles:      The subtitle file contents as a dictionary in the format
+ +                    {language: subtitles}.
       view_count:     How many users have watched the video on the platform.
       urlhandle:      [internal] The urlHandle to be used to download the file,
                       like returned by urllib.request.urlopen
@@@ -78,13 -76,7 +78,13 @@@
       @classmethod
       def suitable(cls, url):
           """Receives a URL and returns True if suitable for this IE."""
- -        return re.match(cls._VALID_URL, url) is not None
+ +
+ +        # This does not use has/getattr intentionally - we want to know whether
+ +        # we have cached the regexp for *this* class, whereas getattr would also
+ +        # match the superclass
+ +        if '_VALID_URL_RE' not in cls.__dict__:
+ +            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
+ +        return cls._VALID_URL_RE.match(url) is not None
   
       @classmethod
       def working(cls):
@@@ -114,6 -106,11 +114,11 @@@
           """Real extraction process. Redefine in subclasses."""
           pass
   
+     @classmethod
+     def ie_key(cls):
+         """A string for getting the InfoExtractor with get_info_extractor"""
+         return cls.__name__[:-2]
+ 
       @property
       def IE_NAME(self):
           return type(self).__name__[:-2]
@@@ -129,15 -126,10 +134,15 @@@
           except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
               if errnote is None:
                   errnote = u'Unable to download webpage'
- -            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+ +            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
   
       def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
           """ Returns a tuple (page content as string, URL handle) """
+ +
+ +        # Strip hashes from the URL (#1038)
+ +        if isinstance(url_or_request, (compat_str, str)):
+ +            url_or_request = url_or_request.partition('#')[0]
+ +
           urlh = self._request_webpage(url_or_request, video_id, note, errnote)
           content_type = urlh.headers.get('Content-Type', '')
           m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
@@@ -182,6 -174,11 +187,6 @@@
           self.to_screen(u'Logging in')
   
       #Methods for following #608
- -    #They set the correct value of the '_type' key
- -    def video_result(self, video_info):
- -        """Returns a video"""
- -        video_info['_type'] = 'video'
- -        return video_info
       def url_result(self, url, ie=None):
           """Returns a url that points to a page that should be processed"""
           #TODO: ie should be the class used for getting the info
@@@ -270,31 -267,6 +275,31 @@@
           
           return (username, password)
   
+ +    # Helper functions for extracting OpenGraph info
+ +    @staticmethod
+ +    def _og_regex(prop):
+ +        return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+ +
+ +    def _og_search_property(self, prop, html, name=None, **kargs):
+ +        if name is None:
+ +            name = 'OpenGraph %s' % prop
+ +        escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+ +        return unescapeHTML(escaped)
+ +
+ +    def _og_search_thumbnail(self, html, **kargs):
+ +        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+ +
+ +    def _og_search_description(self, html, **kargs):
+ +        return self._og_search_property('description', html, fatal=False, **kargs)
+ +
+ +    def _og_search_title(self, html, **kargs):
+ +        return self._og_search_property('title', html, **kargs)
+ +
+ +    def _og_search_video_url(self, html, name='video url', **kargs):
+ +        return self._html_search_regex([self._og_regex('video:secure_url'),
+ +                                        self._og_regex('video')],
+ +                                       html, name, **kargs)
+ +
   class SearchInfoExtractor(InfoExtractor):
       """
       Base class for paged search queries extractors.
author	Philipp Hagemeister <phihag@phihag.de>
	Wed, 28 Aug 2013 11:05:21 +0000 (13:05 +0200)
committer	Philipp Hagemeister <phihag@phihag.de>
	Wed, 28 Aug 2013 11:05:21 +0000 (13:05 +0200)
		1	2
youtube_dl/YoutubeDL.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/extractor/common.py	patch \|	diff1 \|	diff2 \|	blob \| history