Merge remote-tracking branch 'liudongmiao/patch-subtitle'
author Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Aug 2014 00:45:21 +0000 (02:45 +0200)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Aug 2014 00:45:21 +0000 (02:45 +0200)
1  2 
youtube_dl/extractor/youtube.py

index 2c44f36a514ea1fc0cfaffaa34b82573a6c8eace,73a01107d722437ebb62d1a82ec72a33e49124cf..225e2b7f4681e8cce471a8a80af0f64eb14e071e
@@@ -225,7 -225,7 +225,7 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
  
          # Dash webm audio
 -        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
 +        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
          '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
  
          # RTMP (unnamed)
          """Indicate the download will use the RTMP protocol."""
          self.to_screen(u'RTMP download detected')
  
 -    def _extract_signature_function(self, video_id, player_url, slen):
 +    def _signature_cache_id(self, example_sig):
 +        """ Return a string representation of a signature """
 +        return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 +
 +    def _extract_signature_function(self, video_id, player_url, example_sig):
          id_m = re.match(
              r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
              player_url)
          player_id = id_m.group('id')
  
          # Read from filesystem cache
 -        func_id = '%s_%s_%d' % (player_type, player_id, slen)
 +        func_id = '%s_%s_%s' % (
 +            player_type, player_id, self._signature_cache_id(example_sig))
          assert os.path.basename(func_id) == func_id
          cache_dir = get_cachedir(self._downloader.params)
  
                  return lambda s: u''.join(s[i] for i in cache_spec)
              except IOError:
                  pass  # No cache available
 +            except ValueError:
 +                try:
 +                    file_size = os.path.getsize(cache_fn)
 +                except (OSError, IOError) as oe:
 +                    file_size = str(oe)
 +                self._downloader.report_warning(
 +                    u'Cache %s failed (%s)' % (cache_fn, file_size))
  
          if player_type == 'js':
              code = self._download_webpage(
  
          if cache_enabled:
              try:
 -                test_string = u''.join(map(compat_chr, range(slen)))
 +                test_string = u''.join(map(compat_chr, range(len(example_sig))))
                  cache_res = res(test_string)
                  cache_spec = [ord(c) for c in cache_res]
                  try:
  
          return res
  
 -    def _print_sig_code(self, func, slen):
 +    def _print_sig_code(self, func, example_sig):
          def gen_sig_code(idxs):
              def _genslice(start, end, step):
                  starts = u'' if start == 0 else str(start)
              else:
                  yield _genslice(start, i, step)
  
 -        test_string = u''.join(map(compat_chr, range(slen)))
 +        test_string = u''.join(map(compat_chr, range(len(example_sig))))
          cache_res = func(test_string)
          cache_spec = [ord(c) for c in cache_res]
          expr_code = u' + '.join(gen_sig_code(cache_spec))
 -        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
 +        signature_id_tuple = '(%s)' % (
 +            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 +        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 +                u'    return %s\n') % (signature_id_tuple, expr_code)
          self.to_screen(u'Extracted signature function:\n' + code)
  
      def _parse_sig_js(self, jscode):
          if player_url.startswith(u'//'):
              player_url = u'https:' + player_url
          try:
 -            player_id = (player_url, len(s))
 +            player_id = (player_url, self._signature_cache_id(s))
              if player_id not in self._player_cache:
                  func = self._extract_signature_function(
 -                    video_id, player_url, len(s)
 +                    video_id, player_url, s
                  )
                  self._player_cache[player_id] = func
              func = self._player_cache[player_id]
              if self._downloader.params.get('youtube_print_sig_code'):
 -                self._print_sig_code(func, len(s))
 +                self._print_sig_code(func, s)
              return func(s)
          except Exception as e:
              tb = traceback.format_exc()
              raise ExtractorError(
 -                u'Automatic signature extraction failed: ' + tb, cause=e)
 +                u'Signature extraction failed: ' + tb, cause=e)
  
      def _get_available_subtitles(self, video_id, webpage):
          try:
          sub_lang_list = {}
          for l in lang_list:
              lang = l[1]
+             if lang in sub_lang_list:
+                 continue
              params = compat_urllib_parse.urlencode({
                  'lang': lang,
                  'v': video_id,
              data = compat_urllib_parse.urlencode({
                  'video_id': video_id,
                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 -                'sts':'16268',
 +                'sts': self._search_regex(
 +                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
              })
              video_info_url = proto + '://www.youtube.com/get_video_info?' + data
              video_info_webpage = self._download_webpage(video_info_url, video_id,
              url_map = {}
              for url_data_str in encoded_url_map.split(','):
                  url_data = compat_parse_qs(url_data_str)
 -                if 'itag' in url_data and 'url' in url_data:
 -                    url = url_data['url'][0]
 -                    if 'sig' in url_data:
 -                        url += '&signature=' + url_data['sig'][0]
 -                    elif 's' in url_data:
 -                        encrypted_sig = url_data['s'][0]
 -
 -                        if not age_gate:
 -                            jsplayer_url_json = self._search_regex(
 -                                r'"assets":.+?"js":\s*("[^"]+")',
 -                                video_webpage, u'JS player URL')
 -                            player_url = json.loads(jsplayer_url_json)
 +                if 'itag' not in url_data or 'url' not in url_data:
 +                    continue
 +                format_id = url_data['itag'][0]
 +                url = url_data['url'][0]
 +
 +                if 'sig' in url_data:
 +                    url += '&signature=' + url_data['sig'][0]
 +                elif 's' in url_data:
 +                    encrypted_sig = url_data['s'][0]
 +
 +                    if not age_gate:
 +                        jsplayer_url_json = self._search_regex(
 +                            r'"assets":.+?"js":\s*("[^"]+")',
 +                            video_webpage, u'JS player URL')
 +                        player_url = json.loads(jsplayer_url_json)
 +                    if player_url is None:
 +                        player_url_json = self._search_regex(
 +                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 +                            video_webpage, u'age gate player URL')
 +                        player_url = json.loads(player_url_json)
 +
 +                    if self._downloader.params.get('verbose'):
                          if player_url is None:
 -                            player_url_json = self._search_regex(
 -                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 -                                video_webpage, u'age gate player URL')
 -                            player_url = json.loads(player_url_json)
 -
 -                        if self._downloader.params.get('verbose'):
 -                            if player_url is None:
 -                                player_version = 'unknown'
 -                                player_desc = 'unknown'
 +                            player_version = 'unknown'
 +                            player_desc = 'unknown'
 +                        else:
 +                            if player_url.endswith('swf'):
 +                                player_version = self._search_regex(
 +                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 +                                    u'flash player', fatal=False)
 +                                player_desc = 'flash player %s' % player_version
                              else:
 -                                if player_url.endswith('swf'):
 -                                    player_version = self._search_regex(
 -                                        r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 -                                        u'flash player', fatal=False)
 -                                    player_desc = 'flash player %s' % player_version
 -                                else:
 -                                    player_version = self._search_regex(
 -                                        r'html5player-(.+?)\.js', video_webpage,
 -                                        'html5 player', fatal=False)
 -                                    player_desc = u'html5 player %s' % player_version
 -
 -                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
 -                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
 -                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
 -
 -                        signature = self._decrypt_signature(
 -                            encrypted_sig, video_id, player_url, age_gate)
 -                        url += '&signature=' + signature
 -                    if 'ratebypass' not in url:
 -                        url += '&ratebypass=yes'
 -                    url_map[url_data['itag'][0]] = url
 +                                player_version = self._search_regex(
 +                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
 +                                    player_url,
 +                                    'html5 player', fatal=False)
 +                                player_desc = u'html5 player %s' % player_version
 +
 +                        parts_sizes = self._signature_cache_id(encrypted_sig)
 +                        self.to_screen(u'{%s} signature length %s, %s' %
 +                            (format_id, parts_sizes, player_desc))
 +
 +                    signature = self._decrypt_signature(
 +                        encrypted_sig, video_id, player_url, age_gate)
 +                    url += '&signature=' + signature
 +                if 'ratebypass' not in url:
 +                    url += '&ratebypass=yes'
 +                url_map[format_id] = url
              formats = _map_to_format_list(url_map)
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]