Merge pull request #854 from rg3/youtube_automatic_captions
author: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)
committer: Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)
YoutubeIE: fallback to automatic captions when subtitles aren't found

1  2 
youtube_dl/InfoExtractors.py

index 7a882b4ae46ae06508a93a0fd926bf7350a1a5b8,937cf94477c75bd9b8c07455c19fdb9e77234c2f..9fbe6d62771d9289bc2f105678c9fa1c081f3cba
@@@ -376,6 -376,34 +376,34 @@@ class YoutubeIE(InfoExtractor)
              return (u'Did not fetch video subtitles', None, None)
          return (None, sub_lang, sub)
  
+     def _request_automatic_caption(self, video_id, webpage):
+         """We need the webpage for getting the captions url, pass it as an
+            argument to speed up the process."""
+         sub_lang = self._downloader.params.get('subtitleslang')
+         sub_format = self._downloader.params.get('subtitlesformat')
+         self.to_screen(u'%s: Looking for automatic captions' % video_id)
+         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
+         if mobj is None:
+             return [(err_msg, None, None)]
+         player_config = json.loads(mobj.group(1))
+         try:
+             args = player_config[u'args']
+             caption_url = args[u'ttsurl']
+             timestamp = args[u'timestamp']
+             params = compat_urllib_parse.urlencode({
+                 'lang': 'en',
+                 'tlang': sub_lang,
+                 'fmt': sub_format,
+                 'ts': timestamp,
+                 'kind': 'asr',
+             })
+             subtitles_url = caption_url + '&' + params
+             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
+             return [(None, sub_lang, sub)]
+         except KeyError:
+             return [(err_msg, None, None)]
      def _extract_subtitle(self, video_id):
          """
          Return a list with a tuple:
              if video_subtitles:
                  (sub_error, sub_lang, sub) = video_subtitles[0]
                  if sub_error:
-                     self._downloader.report_error(sub_error)
+                     # We try with the automatic captions
+                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
+                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
+                     if sub is not None:
+                         pass
+                     else:
+                         # We report the original error
+                         self._downloader.report_error(sub_error)
  
          if self._downloader.params.get('allsubtitles', False):
              video_subtitles = self._extract_all_subtitles(video_id)
@@@ -1884,7 -1919,7 +1919,7 @@@ class FacebookIE(InfoExtractor)
  class BlipTVIE(InfoExtractor):
      """Information extractor for blip.tv"""
  
 -    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
 +    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
      _URL_EXT = r'^.*\.([a-z0-9]+)$'
      IE_NAME = u'blip.tv'
  
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
  
 +        # See https://github.com/rg3/youtube-dl/issues/857
 +        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
 +        if api_mobj is not None:
 +            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
          urlp = compat_urllib_parse_urlparse(url)
          if urlp.path.startswith('/play/'):
              request = compat_urllib_request.Request(url)
@@@ -4409,8 -4440,8 +4444,8 @@@ def gen_extractors()
          YahooSearchIE(),
          DepositFilesIE(),
          FacebookIE(),
 -        BlipTVUserIE(),
          BlipTVIE(),
 +        BlipTVUserIE(),
          VimeoIE(),
          MyVideoIE(),
          ComedyCentralIE(),