Merge pull request #854 from rg3/youtube_automatic_captions

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)
diff --combined youtube_dl/InfoExtractors.py

index 7a882b4ae46ae06508a93a0fd926bf7350a1a5b8,937cf94477c75bd9b8c07455c19fdb9e77234c2f..9fbe6d62771d9289bc2f105678c9fa1c081f3cba
--- 1/youtube_dl/InfoExtractors.py
--- 2/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@@ -376,6 -376,34 +376,34 @@@ class YoutubeIE(InfoExtractor)
               return (u'Did not fetch video subtitles', None, None)
           return (None, sub_lang, sub)
   
+     def _request_automatic_caption(self, video_id, webpage):
+         """We need the webpage for getting the captions url, pass it as an
+            argument to speed up the process."""
+         sub_lang = self._downloader.params.get('subtitleslang')
+         sub_format = self._downloader.params.get('subtitlesformat')
+         self.to_screen(u'%s: Looking for automatic captions' % video_id)
+         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+         err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
+         if mobj is None:
+             return [(err_msg, None, None)]
+         player_config = json.loads(mobj.group(1))
+         try:
+             args = player_config[u'args']
+             caption_url = args[u'ttsurl']
+             timestamp = args[u'timestamp']
+             params = compat_urllib_parse.urlencode({
+                 'lang': 'en',
+                 'tlang': sub_lang,
+                 'fmt': sub_format,
+                 'ts': timestamp,
+                 'kind': 'asr',
+             })
+             subtitles_url = caption_url + '&' + params
+             sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
+             return [(None, sub_lang, sub)]
+         except KeyError:
+             return [(err_msg, None, None)]
+ 
       def _extract_subtitle(self, video_id):
           """
           Return a list with a tuple:
@@@ -623,7 -651,14 +651,14 @@@
               if video_subtitles:
                   (sub_error, sub_lang, sub) = video_subtitles[0]
                   if sub_error:
-                     self._downloader.report_error(sub_error)
+                     # We try with the automatic captions
+                     video_subtitles = self._request_automatic_caption(video_id, video_webpage)
+                     (sub_error_auto, sub_lang, sub) = video_subtitles[0]
+                     if sub is not None:
+                         pass
+                     else:
+                         # We report the original error
+                         self._downloader.report_error(sub_error)
   
           if self._downloader.params.get('allsubtitles', False):
               video_subtitles = self._extract_all_subtitles(video_id)
@@@ -1884,7 -1919,7 +1919,7 @@@ class FacebookIE(InfoExtractor)
   class BlipTVIE(InfoExtractor):
       """Information extractor for blip.tv"""
   
- -    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
+ +    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
       _URL_EXT = r'^.*\.([a-z0-9]+)$'
       IE_NAME = u'blip.tv'
   
@@@ -1897,10 -1932,6 +1932,10 @@@
           if mobj is None:
               raise ExtractorError(u'Invalid URL: %s' % url)
   
+ +        # See https://github.com/rg3/youtube-dl/issues/857
+ +        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
+ +        if api_mobj is not None:
+ +            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
           urlp = compat_urllib_parse_urlparse(url)
           if urlp.path.startswith('/play/'):
               request = compat_urllib_request.Request(url)
@@@ -4409,8 -4440,8 +4444,8 @@@ def gen_extractors()
           YahooSearchIE(),
           DepositFilesIE(),
           FacebookIE(),
- -        BlipTVUserIE(),
           BlipTVIE(),
+ +        BlipTVUserIE(),
           VimeoIE(),
           MyVideoIE(),
           ComedyCentralIE(),
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Sat, 1 Jun 2013 21:18:27 +0000 (14:18 -0700)