Merge branch 'master' into extract_info_rewrite
authorJaime Marquínez Ferrándiz <jaimemf93@gmail.com>
Thu, 28 Mar 2013 12:02:04 +0000 (13:02 +0100)
committerJaime Marquínez Ferrándiz <jaimemf93@gmail.com>
Thu, 28 Mar 2013 12:20:33 +0000 (13:20 +0100)
1  2 
youtube_dl/FileDownloader.py
youtube_dl/InfoExtractors.py

index 68fad11bc72bf4f84b5d36266afbef403facb30e,725d4a0160388b3faa8c7a5b09cc83a8726170f8..6af2acbeee73b1258c32134100eb3a6e4abac38a
@@@ -78,7 -78,11 +78,11 @@@ class FileDownloader(object)
      updatetime:        Use the Last-modified header to set output file timestamps.
      writedescription:  Write the video description to a .description file
      writeinfojson:     Write the video description to a .info.json file
-     writesubtitles:    Write the video subtitles to a .srt file
+     writesubtitles:    Write the video subtitles to a file
+     onlysubtitles:     Downloads only the subtitles of the video
+     allsubtitles:      Downloads all the subtitles of the video
+     listsubtitles:     Lists all available subtitles for the video
+     subtitlesformat:   Subtitle format [sbv/srt] (default=srt)
      subtitleslang:     Language of the subtitles to download
      test:              Download only first bytes to test the downloader.
      keepvideo:         Keep the video file after post-processing
          """ Report that the description file is being written """
          self.to_screen(u'[info] Writing video description to: ' + descfn)
  
-     def report_writesubtitles(self, srtfn):
+     def report_writesubtitles(self, sub_filename):
          """ Report that the subtitles file is being written """
-         self.to_screen(u'[info] Writing video subtitles to: ' + srtfn)
+         self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename)
  
      def report_writeinfojson(self, infofn):
          """ Report that the metadata file has been written """
  
              filename = self.params['outtmpl'] % template_dict
              return filename
-         except (ValueError, KeyError) as err:
-             self.trouble(u'ERROR: invalid system charset or erroneous output template')
+         except KeyError as err:
+             self.trouble(u'ERROR: Erroneous output template')
+             return None
+         except ValueError as err:
+             self.trouble(u'ERROR: Insufficient system charset ' + repr(preferredencoding()))
              return None
  
      def _match_entry(self, info_dict):
              if re.search(rejecttitle, title, re.IGNORECASE):
                  return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
          return None
 +        
 +    def extract_info(self, url):
 +        '''
 +        Returns a list with a dictionary for each video we find.
 +         '''
 +        suitable_found = False
 +        for ie in self._ies:
 +            # Go to next InfoExtractor if not suitable
 +            if not ie.suitable(url):
 +                continue
 +
 +            # Warn if the _WORKING attribute is False
 +            if not ie.working():
 +                self.to_stderr(u'WARNING: the program functionality for this site has been marked as broken, '
 +                               u'and will probably not work. If you want to go on, use the -i option.')
 +
 +            # Suitable InfoExtractor found
 +            suitable_found = True
 +
 +            # Extract information from URL and process it
 +            try:
 +                ie_results = ie.extract(url)
 +                results = self.process_ie_results(ie_results, ie)
 +                return results
 +            except ExtractorError as de: # An error we somewhat expected
 +                self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback())
 +                break
 +            except Exception as e:
 +                if self.params.get('ignoreerrors', False):
 +                    self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc()))
 +                    break
 +                else:
 +                    raise
 +        if not suitable_found:
 +                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 +    def extract_info_iterable(self, urls):
 +        '''
 +            Return the videos found for the urls
 +        '''
 +        results = []
 +        for url in urls:
 +            results.extend(self.extract_info(url))
 +        return results
 +        
 +    def process_ie_results(self, ie_results, ie):
 +        """
 +        Take the results of the ie and return a list of videos.
 +        For url elements it will search the suitable ie and get the videos
 +        For playlist elements it will process each of the elements of the 'entries' key
 +        """
 +        results = [] 
 +        for result in ie_results or []:
 +            result_type = result.get('_type', 'video') #If not given we suppose it's a video, support the default old system
 +            if result_type == 'video':
 +                if not 'extractor' in result:
 +                    #Set the extractor only if it hasn't already been set somewhere else
 +                    result['extractor'] = ie.IE_NAME
 +                results.append(result)
 +            elif result_type == 'url':
 +                #We get the videos pointed by the url
 +                results.extend(self.extract_info(result['url']))
 +            elif result_type == 'playlist':
 +                #We process each entry in the playlist
 +                entries_result = self.process_ie_results(result['entries'], ie)
 +                results.extend(entries_result)
 +        return results
  
      def process_info(self, info_dict):
          """Process a single dictionary returned by an InfoExtractor."""
          if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
              # subtitles download errors are already managed as troubles in relevant IE
              # that way it will silently go on when used with unsupporting IE
+             subtitle = info_dict['subtitles'][0]
+             (sub_error, sub_lang, sub) = subtitle
+             sub_format = self.params.get('subtitlesformat')
              try:
-                 srtfn = filename.rsplit('.', 1)[0] + u'.srt'
-                 self.report_writesubtitles(srtfn)
-                 with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile:
-                     srtfile.write(info_dict['subtitles'])
+                 sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
+                 self.report_writesubtitles(sub_filename)
+                 with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
+                     subfile.write(sub)
              except (OSError, IOError):
                  self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
                  return
+             if self.params.get('onlysubtitles', False):
+                 return 
+         if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']:
+             subtitles = info_dict['subtitles']
+             sub_format = self.params.get('subtitlesformat')
+             for subtitle in subtitles:
+                 (sub_error, sub_lang, sub) = subtitle
+                 try:
+                     sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
+                     self.report_writesubtitles(sub_filename)
+                     with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
+                             subfile.write(sub)
+                 except (OSError, IOError):
+                     self.trouble(u'ERROR: Cannot write subtitles file ' + descfn)
+                     return
+             if self.params.get('onlysubtitles', False):
+                 return 
  
          if self.params.get('writeinfojson', False):
              infofn = filename + u'.info.json'
              raise SameFileError(self.params['outtmpl'])
  
          for url in url_list:
 -            suitable_found = False
 -            for ie in self._ies:
 -                # Go to next InfoExtractor if not suitable
 -                if not ie.suitable(url):
 -                    continue
 -
 -                # Warn if the _WORKING attribute is False
 -                if not ie.working():
 -                    self.report_warning(u'the program functionality for this site has been marked as broken, '
 -                                        u'and will probably not work. If you want to go on, use the -i option.')
 +            videos = self.extract_info(url)
  
 -                # Suitable InfoExtractor found
 -                suitable_found = True
 -
 -                # Extract information from URL and process it
 +            for video in videos or []:
                  try:
 -                    videos = ie.extract(url)
 -                except ExtractorError as de: # An error we somewhat expected
 -                    self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback())
 -                    break
 -                except Exception as e:
 -                    if self.params.get('ignoreerrors', False):
 -                        self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc()))
 -                        break
 -                    else:
 -                        raise
 -
 -                if len(videos or []) > 1 and self.fixed_template():
 -                    raise SameFileError(self.params['outtmpl'])
 -
 -                for video in videos or []:
 -                    video['extractor'] = ie.IE_NAME
 -                    try:
 -                        self.increment_downloads()
 -                        self.process_info(video)
 -                    except UnavailableVideoError:
 -                        self.trouble(u'\nERROR: unable to download video')
 -
 -                # Suitable InfoExtractor had been found; go to next URL
 -                break
 -
 -            if not suitable_found:
 -                self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url)
 +                    self.increment_downloads()
 +                    self.process_info(video)
 +                except UnavailableVideoError:
 +                    self.trouble(u'\nERROR: unable to download video')
  
          return self._download_retcode
  
index e714fa6b078a87f0520c661d9a73db71bf78df6b,835428f3232afb8a6aaeca5c72b64bf1c3cd11b7..dd4a776e4a50adf7e71ab4580bbbc2fcac81e65d
@@@ -48,7 -48,7 +48,7 @@@ class InfoExtractor(object)
      uploader_id:    Nickname or id of the video uploader.
      location:       Physical location of the video.
      player_url:     SWF Player URL (used for rtmpdump).
-     subtitles:      The .srt file contents.
+     subtitles:      The subtitle file contents.
      urlhandle:      [internal] The urlHandle to be used to download the file,
                      like returned by urllib.request.urlopen
  
      def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
          """ Returns the data of the page as a string """
          urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+         content_type = urlh.headers.get('Content-Type', '')
+         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
+         if m:
+             encoding = m.group(1)
+         else:
+             encoding = 'utf-8'
          webpage_bytes = urlh.read()
-         return webpage_bytes.decode('utf-8', 'replace')
+         return webpage_bytes.decode(encoding, 'replace')
 +        
 +    #Methods for following #608
 +    #They set the correct value of the '_type' key
 +    def video_result(self, video_info):
 +        """Returns a video"""
 +        video_info['_type'] = 'video'
 +        return video_info
 +    def url_result(self, url, ie=None):
 +        """Returns a url that points to a page that should be processed"""
 +        #TODO: ie should be the class used for getting the info
 +        video_info = {'_type': 'url',
 +                      'url': url}
 +        return video_info
 +    def playlist_result(self, entries):
 +        """Returns a playlist"""
 +        video_info = {'_type': 'playlist',
 +                      'entries': entries}
 +        return video_info
  
  
  class YoutubeIE(InfoExtractor):
  
      def report_video_subtitles_download(self, video_id):
          """Report attempt to download video info webpage."""
-         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
+         self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
+     def report_video_subtitles_request(self, video_id, sub_lang, format):
+         """Report attempt to download video info webpage."""
+         self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
+     def report_video_subtitles_available(self, video_id, sub_lang_list):
+         """Report available subtitles."""
+         sub_lang = ",".join(list(sub_lang_list.keys()))
+         self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
  
      def report_information_extraction(self, video_id):
          """Report attempt to extract video information."""
          """Indicate the download will use the RTMP protocol."""
          self._downloader.to_screen(u'[youtube] RTMP download detected')
  
-     def _closed_captions_xml_to_srt(self, xml_string):
-         srt = ''
-         texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
-         # TODO parse xml instead of regex
-         for n, (start, dur_tag, dur, caption) in enumerate(texts):
-             if not dur: dur = '4'
-             start = float(start)
-             end = start + float(dur)
-             start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
-             end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-             caption = unescapeHTML(caption)
-             caption = unescapeHTML(caption) # double cycle, intentional
-             srt += str(n+1) + '\n'
-             srt += start + ' --> ' + end + '\n'
-             srt += caption + '\n\n'
-         return srt
-     def _extract_subtitles(self, video_id):
+     def _get_available_subtitles(self, video_id):
          self.report_video_subtitles_download(video_id)
          request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
          try:
-             srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
+             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
-         srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
-         srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
-         if not srt_lang_list:
-             return (u'WARNING: video has no closed captions', None)
-         if self._downloader.params.get('subtitleslang', False):
-             srt_lang = self._downloader.params.get('subtitleslang')
-         elif 'en' in srt_lang_list:
-             srt_lang = 'en'
-         else:
-             srt_lang = list(srt_lang_list.keys())[0]
-         if not srt_lang in srt_lang_list:
-             return (u'WARNING: no closed captions found in the specified language', None)
+         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
+         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
+         if not sub_lang_list:
+             return (u'WARNING: video doesn\'t have subtitles', None)
+         return sub_lang_list
+     def _list_available_subtitles(self, video_id):
+         sub_lang_list = self._get_available_subtitles(video_id)
+         self.report_video_subtitles_available(video_id, sub_lang_list)
+     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
+         self.report_video_subtitles_request(video_id, sub_lang, format)
          params = compat_urllib_parse.urlencode({
-             'lang': srt_lang,
-             'name': srt_lang_list[srt_lang].encode('utf-8'),
+             'lang': sub_lang,
+             'name': sub_name,
              'v': video_id,
+             'fmt': format,
          })
          url = 'http://www.youtube.com/api/timedtext?' + params
          try:
-             srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
+             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
-         if not srt_xml:
+         if not sub:
              return (u'WARNING: Did not fetch video subtitles', None)
-         return (None, self._closed_captions_xml_to_srt(srt_xml))
+         return (None, sub_lang, sub)
+     def _extract_subtitle(self, video_id):
+         sub_lang_list = self._get_available_subtitles(video_id)
+         sub_format = self._downloader.params.get('subtitlesformat')
+         if self._downloader.params.get('subtitleslang', False):
+             sub_lang = self._downloader.params.get('subtitleslang')
+         elif 'en' in sub_lang_list:
+             sub_lang = 'en'
+         else:
+             sub_lang = list(sub_lang_list.keys())[0]
+         if not sub_lang in sub_lang_list:
+             return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
+         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
+         return [subtitle]
+     def _extract_all_subtitles(self, video_id):
+         sub_lang_list = self._get_available_subtitles(video_id)
+         sub_format = self._downloader.params.get('subtitlesformat')
+         subtitles = []
+         for sub_lang in sub_lang_list:
+             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
+             subtitles.append(subtitle)
+         return subtitles
  
      def _print_formats(self, formats):
          print('Available formats:')
          else:
              video_description = ''
  
-         # closed captions
+         # subtitles
          video_subtitles = None
          if self._downloader.params.get('writesubtitles', False):
-             (srt_error, video_subtitles) = self._extract_subtitles(video_id)
-             if srt_error:
-                 self._downloader.trouble(srt_error)
+             video_subtitles = self._extract_subtitle(video_id)
+             if video_subtitles:
+                 (sub_error, sub_lang, sub) = video_subtitles[0]
+                 if sub_error:
+                     self._downloader.trouble(sub_error)
+         if self._downloader.params.get('allsubtitles', False):
+             video_subtitles = self._extract_all_subtitles(video_id)
+             for video_subtitle in video_subtitles:
+                 (sub_error, sub_lang, sub) = video_subtitle
+                 if sub_error:
+                     self._downloader.trouble(sub_error)
+         if self._downloader.params.get('listsubtitles', False):
+             sub_lang_list = self._list_available_subtitles(video_id)
+             return
  
          if 'length_seconds' not in video_info:
              self._downloader.trouble(u'WARNING: unable to extract video duration')
@@@ -1299,7 -1318,8 +1336,8 @@@ class GenericIE(InfoExtractor)
  
      def report_download_webpage(self, video_id):
          """Report webpage download."""
-         self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
+         if not self._downloader.params.get('test', False):
+             self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
          self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
  
      def report_extraction(self, video_id):
          self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
  
      def _test_redirect(self, url):
 -        """Check if it is a redirect, like url shorteners, in case restart chain."""
 +        """Check if it is a redirect, like url shorteners, in case return the new url."""
          class HeadRequest(compat_urllib_request.Request):
              def get_method(self):
                  return "HEAD"
              return False
  
          self.report_following_redirect(new_url)
 -        self._downloader.download([new_url])
 -        return True
 +        return new_url
  
      def _real_extract(self, url):
 -        if self._test_redirect(url): return
 +        new_url = self._test_redirect(url)
 +        if new_url: return [self.url_result(new_url)]
  
          video_id = url.split('/')[-1]
-         request = compat_urllib_request.Request(url)
          try:
-             self.report_download_webpage(video_id)
-             webpage = compat_urllib_request.urlopen(request).read()
-         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-             self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
-             return
+             webpage = self._download_webpage(url, video_id)
          except ValueError as err:
              # since this is the last-resort InfoExtractor, if
              # this error is thrown, it'll be thrown here
@@@ -1774,8 -1789,9 +1807,8 @@@ class YoutubePlaylistIE(InfoExtractor)
          else:
              self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
  
 -        for video in videos:
 -            self._downloader.download([video])
 -        return
 +        url_results = [self.url_result(url) for url in videos]
 +        return [self.playlist_result(url_results)]
  
  
  class YoutubeChannelIE(InfoExtractor):
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
 -        for id in video_ids:
 -            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
 -        return
 +        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
 +        url_entries = [self.url_result(url) for url in urls]
 +        return [self.playlist_result(url_entries)]
  
  
  class YoutubeUserIE(InfoExtractor):
          self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                  (username, all_ids_count, len(video_ids)))
  
 -        for video_id in video_ids:
 -            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
 +        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
 +        url_results = [self.url_result(url) for url in urls]
 +        return [self.playlist_result(url_results)]
  
  
  class BlipTVUserIE(InfoExtractor):
          self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                  (self.IE_NAME, username, all_ids_count, len(video_ids)))
  
 -        for video_id in video_ids:
 -            self._downloader.download([u'http://blip.tv/'+video_id])
 +        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
 +        url_entries = [self.url_result(url) for url in urls]
 +        return [self.playlist_result(url_entries)]
  
  
  class DepositFilesIE(InfoExtractor):
@@@ -2576,7 -2590,7 +2609,7 @@@ class EscapistIE(InfoExtractor)
              'uploader': showName,
              'upload_date': None,
              'title': showName,
-             'ext': 'flv',
+             'ext': 'mp4',
              'thumbnail': imgUrl,
              'description': description,
              'player_url': playerUrl,
@@@ -3972,11 -3986,11 +4005,11 @@@ class KeekIE(InfoExtractor)
          webpage = self._download_webpage(url, video_id)
          m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
          title = unescapeHTML(m.group('title'))
-         m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
-         uploader = unescapeHTML(m.group('uploader'))
+         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
+         uploader = clean_html(m.group('uploader'))
          info = {
-                 'id':video_id,
-                 'url':video_url,
+                 'id': video_id,
+                 'url': video_url,
                  'ext': 'mp4',
                  'title': title,
                  'thumbnail': thumbnail,
@@@ -4113,6 -4127,40 +4146,40 @@@ class MySpassIE(InfoExtractor)
          }
          return [info]
  
+ class SpiegelIE(InfoExtractor):
+     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?$'
+     def _real_extract(self, url):
+         m = re.match(self._VALID_URL, url)
+         video_id = m.group('videoID')
+         webpage = self._download_webpage(url, video_id)
+         m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
+         if not m:
+             raise ExtractorError(u'Cannot find title')
+         video_title = unescapeHTML(m.group(1))
+         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
+         xml_code = self._download_webpage(xml_url, video_id,
+                     note=u'Downloading XML', errnote=u'Failed to download XML')
+         idoc = xml.etree.ElementTree.fromstring(xml_code)
+         last_type = idoc[-1]
+         filename = last_type.findall('./filename')[0].text
+         duration = float(last_type.findall('./duration')[0].text)
+         video_url = 'http://video2.spiegel.de/flash/' + filename
+         video_ext = filename.rpartition('.')[2]
+         info = {
+             'id': video_id,
+             'url': video_url,
+             'ext': video_ext,
+             'title': video_title,
+             'duration': duration,
+         }
+         return [info]
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
      The order does matter; the first extractor matched is the one handling the URL.
          KeekIE(),
          TEDIE(),
          MySpassIE(),
+         SpiegelIE(),
          GenericIE()
      ]