[youtube] add algo for length 91
[youtube-dl] / youtube_dl / extractor / youtube.py
index e71cd62ec3d43e2b541a7e024c6a02adbe16a355..47d5cb7ff8d63331656aa64a1d6134636a2b8b26 100644 (file)
@@ -5,9 +5,10 @@ import netrc
 import re
 import socket
 import itertools
+import xml.etree.ElementTree
 
 from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesIE
+from .subtitles import SubtitlesInfoExtractor
 from ..utils import (
     compat_http_client,
     compat_parse_qs,
@@ -131,71 +132,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             return
         self._confirm_age()
 
-class YoutubeSubtitlesIE(SubtitlesIE):
 
-    def _get_available_subtitles(self, video_id):
-        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
-        try:
-            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
-            return {}
-        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
-
-        sub_lang_list = {}
-        for l in lang_list:
-            lang = l[1]
-            params = compat_urllib_parse.urlencode({
-                'lang': lang,
-                'v': video_id,
-                'fmt': self._downloader.params.get('subtitlesformat'),
-            })
-            url = u'http://www.youtube.com/api/timedtext?' + params
-            sub_lang_list[lang] = url
-        if not sub_lang_list:
-            self._downloader.report_warning(u'video doesn\'t have subtitles')
-            return {}
-        return sub_lang_list
-
-    def _request_automatic_caption(self, video_id, webpage):
-        """We need the webpage for getting the captions url, pass it as an
-           argument to speed up the process."""
-        sub_lang = (self._downloader.params.get('subtitleslangs') or ['en'])[0]
-        sub_format = self._downloader.params.get('subtitlesformat')
-        self.to_screen(u'%s: Looking for automatic captions' % video_id)
-        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
-        err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang
-        if mobj is None:
-            self._downloader.report_warning(err_msg)
-            return {}
-        player_config = json.loads(mobj.group(1))
-        try:
-            args = player_config[u'args']
-            caption_url = args[u'ttsurl']
-            timestamp = args[u'timestamp']
-            params = compat_urllib_parse.urlencode({
-                'lang': 'en',
-                'tlang': sub_lang,
-                'fmt': sub_format,
-                'ts': timestamp,
-                'kind': 'asr',
-            })
-            subtitles_url = caption_url + '&' + params
-            sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions')
-            return {sub_lang: sub}
-        # An extractor error can be raise by the download process if there are
-        # no automatic captions but there are subtitles
-        except (KeyError, ExtractorError):
-            self._downloader.report_warning(err_msg)
-            return {}
-
-class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
+class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
     IE_DESC = u'YouTube.com'
     _VALID_URL = r"""^
                      (
                          (?:https?://)?                                       # http(s):// (optional)
                          (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
-                            tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
+                            tube\.majestyc\.net/|
+                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                          (?:                                                  # the various things that can precede the ID:
                              (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
@@ -471,8 +416,12 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
     def _decrypt_signature(self, s):
         """Turn the encrypted s field into a working signature"""
 
-        if len(s) == 92:
+        if len(s) == 93:
+            return s[86:29:-1] + s[88] + s[28:5:-1]
+        elif len(s) == 92:
             return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
+        elif len(s) == 91:
+            return s[84:27:-1] + s[86] + s[26:5:-1]
         elif len(s) == 90:
             return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
         elif len(s) == 89:
@@ -484,13 +433,13 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
         elif len(s) == 86:
             return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
         elif len(s) == 85:
-            return s[40] + s[82:43:-1] + s[22] + s[42:40:-1] + s[83] + s[39:22:-1] + s[0] + s[21:2:-1]
+            return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
         elif len(s) == 84:
             return s[81:36:-1] + s[0] + s[35:2:-1]
         elif len(s) == 83:
             return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
         elif len(s) == 82:
-            return s[1:19] + s[0] + s[20:68] + s[19] + s[69:82]
+            return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
         elif len(s) == 81:
             return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
         elif len(s) == 80:
@@ -510,6 +459,79 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
             # Fallback to the other algortihms
             return self._decrypt_signature(s)
 
+    def _get_available_subtitles(self, video_id):
+        try:
+            sub_list = self._download_webpage(
+                'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
+                video_id, note=False)
+        except ExtractorError as err:
+            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
+            return {}
+        lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
+
+        sub_lang_list = {}
+        for l in lang_list:
+            lang = l[1]
+            params = compat_urllib_parse.urlencode({
+                'lang': lang,
+                'v': video_id,
+                'fmt': self._downloader.params.get('subtitlesformat'),
+            })
+            url = u'http://www.youtube.com/api/timedtext?' + params
+            sub_lang_list[lang] = url
+        if not sub_lang_list:
+            self._downloader.report_warning(u'video doesn\'t have subtitles')
+            return {}
+        return sub_lang_list
+
+    def _get_available_automatic_caption(self, video_id, webpage):
+        """We need the webpage for getting the captions url, pass it as an
+           argument to speed up the process."""
+        sub_format = self._downloader.params.get('subtitlesformat')
+        self.to_screen(u'%s: Looking for automatic captions' % video_id)
+        mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
+        err_msg = u'Couldn\'t find automatic captions for %s' % video_id
+        if mobj is None:
+            self._downloader.report_warning(err_msg)
+            return {}
+        player_config = json.loads(mobj.group(1))
+        try:
+            args = player_config[u'args']
+            caption_url = args[u'ttsurl']
+            timestamp = args[u'timestamp']
+            # We get the available subtitles
+            list_params = compat_urllib_parse.urlencode({
+                'type': 'list',
+                'tlangs': 1,
+                'asrs': 1,
+            })
+            list_url = caption_url + '&' + list_params
+            list_page = self._download_webpage(list_url, video_id)
+            caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
+            original_lang_node = caption_list.find('track')
+            if original_lang_node.attrib.get('kind') != 'asr' :
+                self._downloader.report_warning(u'Video doesn\'t have automatic captions')
+                return {}
+            original_lang = original_lang_node.attrib['lang_code']
+
+            sub_lang_list = {}
+            for lang_node in caption_list.findall('target'):
+                sub_lang = lang_node.attrib['lang_code']
+                params = compat_urllib_parse.urlencode({
+                    'lang': original_lang,
+                    'tlang': sub_lang,
+                    'fmt': sub_format,
+                    'ts': timestamp,
+                    'kind': 'asr',
+                })
+                sub_lang_list[sub_lang] = caption_url + '&' + params
+            return sub_lang_list
+        # An extractor error can be raise by the download process if there are
+        # no automatic captions but there are subtitles
+        except (KeyError, ExtractorError):
+            self._downloader.report_warning(err_msg)
+            return {}
+
     def _print_formats(self, formats):
         print('Available formats:')
         for x in formats:
@@ -708,15 +730,10 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
                 video_description = u''
 
         # subtitles
-        video_subtitles = None
-
-        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
-            video_subtitles = self._extract_subtitles(video_id)
-        elif self._downloader.params.get('writeautomaticsub', False):
-            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
+        video_subtitles = self.extract_subtitles(video_id, video_webpage)
 
         if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id)
+            self._list_available_subtitles(video_id, video_webpage)
             return
 
         if 'length_seconds' not in video_info:
@@ -770,10 +787,7 @@ class YoutubeIE(YoutubeSubtitlesIE, YoutubeBaseInfoExtractor):
                         if self._downloader.params.get('verbose'):
                             s = url_data['s'][0]
                             if age_gate:
-                                player_version = self._search_regex(r'ad3-(.+?)\.swf',
-                                    video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND',
-                                    'flash player', fatal=False)
-                                player = 'flash player %s' % player_version
+                                player = 'flash player'
                             else:
                                 player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
                                     'html5 player', fatal=False)
@@ -995,6 +1009,9 @@ class YoutubeUserIE(InfoExtractor):
                 response = json.loads(page)
             except ValueError as err:
                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
+            if 'entry' not in response['feed']:
+                # Number of videos is a multiple of self._MAX_RESULTS
+                break
 
             # Extract video identifiers
             ids_in_page = []