[youtube] Support automatic captions with original language different from English...
authorJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Wed, 11 Sep 2013 17:02:01 +0000 (19:02 +0200)
committerJaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Wed, 11 Sep 2013 17:08:43 +0000 (19:08 +0200)
youtube_dl/extractor/subtitles.py
youtube_dl/extractor/youtube.py

index 9a3c54b65a271bba680397a950b0ad06701f7fed..a6780f1763440fe04a8ee135a07975da90bd6649 100644 (file)
@@ -15,28 +15,33 @@ class SubtitlesInfoExtractor(InfoExtractor):
         self.to_screen(u'%s: Available subtitles for video: %s' %
                        (video_id, sub_lang))
 
         self.to_screen(u'%s: Available subtitles for video: %s' %
                        (video_id, sub_lang))
 
-    def _extract_subtitles(self, video_id):
+    def extract_subtitles(self, video_id, video_webpage=None):
         """ returns {sub_lang: sub} or {} if subtitles not found """
         """ returns {sub_lang: sub} or {} if subtitles not found """
-        available_subs_list = self._get_available_subtitles(video_id)
+        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
+            available_subs_list = self._get_available_subtitles(video_id)
+        elif self._downloader.params.get('writeautomaticsub', False):
+            available_subs_list = self._get_available_automatic_caption(video_id, video_webpage)
+        else:
+            return None
+
         if not available_subs_list:  # error, it didn't get the available subtitles
             return {}
         if self._downloader.params.get('allsubtitles', False):
             sub_lang_list = available_subs_list
         else:
         if not available_subs_list:  # error, it didn't get the available subtitles
             return {}
         if self._downloader.params.get('allsubtitles', False):
             sub_lang_list = available_subs_list
         else:
-            if self._downloader.params.get('writesubtitles', False):
-                if self._downloader.params.get('subtitleslangs', False):
-                    requested_langs = self._downloader.params.get('subtitleslangs')
-                elif 'en' in available_subs_list:
-                    requested_langs = ['en']
-                else:
-                    requested_langs = [list(available_subs_list.keys())[0]]
+            if self._downloader.params.get('subtitleslangs', False):
+                requested_langs = self._downloader.params.get('subtitleslangs')
+            elif 'en' in available_subs_list:
+                requested_langs = ['en']
+            else:
+                requested_langs = [list(available_subs_list.keys())[0]]
 
 
-                sub_lang_list = {}
-                for sub_lang in requested_langs:
-                    if not sub_lang in available_subs_list:
-                        self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
-                        continue
-                    sub_lang_list[sub_lang] = available_subs_list[sub_lang]
+            sub_lang_list = {}
+            for sub_lang in requested_langs:
+                if not sub_lang in available_subs_list:
+                    self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
+                    continue
+                sub_lang_list[sub_lang] = available_subs_list[sub_lang]
 
         subtitles = {}
         for sub_lang, url in sub_lang_list.items():
 
         subtitles = {}
         for sub_lang, url in sub_lang_list.items():
@@ -64,23 +69,11 @@ class SubtitlesInfoExtractor(InfoExtractor):
         """
         pass
 
         """
         pass
 
-    def _request_automatic_caption(self, video_id, webpage):
+    def _get_available_automatic_caption(self, video_id, webpage):
         """
         """
-        returns {sub_lang: sub} or {} if not available
+        returns {sub_lang: url} or {} if not available
         Must be redefined by the subclasses that support automatic captions,
         otherwise it will return {}
         """
         self._downloader.report_warning(u'Automatic Captions not supported by this server')
         return {}
         Must be redefined by the subclasses that support automatic captions,
         otherwise it will return {}
         """
         self._downloader.report_warning(u'Automatic Captions not supported by this server')
         return {}
-
-    def extract_subtitles(self, video_id, video_webpage=None):
-        """
-        Extract the subtitles and/or the automatic captions if requested.
-        Returns None or a dictionary in the format {sub_lang: sub}
-        """
-        video_subtitles = None
-        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
-            video_subtitles = self._extract_subtitles(video_id)
-        elif self._downloader.params.get('writeautomaticsub', False):
-            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
-        return video_subtitles
index d06cc49c45fdc8254a15ed9197c45733ad51dee9..46f977ce7e659d9f4f22e2c1b2dff81fcc3b22ab 100644 (file)
@@ -5,6 +5,7 @@ import netrc
 import re
 import socket
 import itertools
 import re
 import socket
 import itertools
+import xml.etree.ElementTree
 
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
 
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
@@ -478,14 +479,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             return {}
         return sub_lang_list
 
             return {}
         return sub_lang_list
 
-    def _request_automatic_caption(self, video_id, webpage):
+    def _get_available_automatic_caption(self, video_id, webpage):
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
         """We need the webpage for getting the captions url, pass it as an
            argument to speed up the process."""
-        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
         sub_format = self._downloader.params.get('subtitlesformat')
         self.to_screen(u'%s: Looking for automatic captions' % video_id)
         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
         sub_format = self._downloader.params.get('subtitlesformat')
         self.to_screen(u'%s: Looking for automatic captions' % video_id)
         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
-        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
+        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
         if mobj is None:
             self._downloader.report_warning(err_msg)
             return {}
         if mobj is None:
             self._downloader.report_warning(err_msg)
             return {}
@@ -494,16 +494,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             args = player_config[u'args']
             caption_url = args[u'ttsurl']
             timestamp = args[u'timestamp']
             args = player_config[u'args']
             caption_url = args[u'ttsurl']
             timestamp = args[u'timestamp']
-            params = compat_urllib_parse.urlencode({
-                'lang': 'en',
-                'tlang': sub_lang,
-                'fmt': sub_format,
-                'ts': timestamp,
-                'kind': 'asr',
+            # We get the available subtitles
+            list_params = compat_urllib_parse.urlencode({
+                'type': 'list',
+                'tlangs': 1,
+                'asrs': 1,
             })
             })
-            subtitles_url = caption_url + '&' + params
-            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
-            return {sub_lang: sub}
+            list_url = caption_url + '&' + list_params
+            list_page = self._download_webpage(list_url, video_id)
+            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+            original_lang = caption_list.find('track').attrib['lang_code']
+
+            sub_lang_list = {}
+            for lang_node in caption_list.findall('target'):
+                sub_lang = lang_node.attrib['lang_code']
+                params = compat_urllib_parse.urlencode({
+                    'lang': original_lang,
+                    'tlang': sub_lang,
+                    'fmt': sub_format,
+                    'ts': timestamp,
+                    'kind': 'asr',
+                })
+                sub_lang_list[sub_lang] = caption_url + '&' + params
+            return sub_lang_list
         # An extractor error can be raised by the download process if there are
         # no automatic captions but there are subtitles
         except (KeyError, ExtractorError):
         # An extractor error can be raised by the download process if there are
         # no automatic captions but there are subtitles
         except (KeyError, ExtractorError):