[extractor/common] Improve name extraction for m3u8 formats

[youtube-dl] / youtube_dl / extractor / iqiyi.py
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py

index 541fe7ae3053aa28bf3e3f4176ef5ff31db12e55..ddcb3c916e6a0610484dc5ceddbd84b507e761fd 100644 (file)
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -2,6 +2,7 @@
  from __future__ import unicode_literals
  
  import hashlib
+import itertools
  import math
  import os
  import random
@@ -13,12 +14,14 @@ from .common import InfoExtractor
  from ..compat import (
      compat_parse_qs,
      compat_str,
-    compat_urllib_parse,
+    compat_urllib_parse_urlencode,
      compat_urllib_parse_urlparse,
  )
  from ..utils import (
+    decode_packed_codes,
      ExtractorError,
      ohdave_rsa_encrypt,
+    remove_start,
      sanitized_Request,
      urlencode_postdata,
      url_basename,
@@ -124,43 +127,11 @@ class IqiyiSDK(object):
  
  
  class IqiyiSDKInterpreter(object):
-    BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
      def __init__(self, sdk_code):
          self.sdk_code = sdk_code
  
-    @classmethod
-    def base62(cls, num):
-        if num == 0:
-            return '0'
-        ret = ''
-        while num:
-            ret = cls.BASE62_TABLE[num % 62] + ret
-            num = num // 62
-        return ret
-
-    def decode_eval_codes(self):
-        self.sdk_code = self.sdk_code[5:-3]
-
-        mobj = re.search(
-            r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}",
-            self.sdk_code)
-        obfucasted_code, count, symbols = mobj.groups()
-        count = int(count)
-        symbols = symbols.split('|')
-        symbol_table = {}
-
-        while count:
-            count -= 1
-            b62count = self.base62(count)
-            symbol_table[b62count] = symbols[count] or b62count
-
-        self.sdk_code = re.sub(
-            r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
-            obfucasted_code)
-
      def run(self, target, ip, timestamp):
-        self.decode_eval_codes()
+        self.sdk_code = decode_packed_codes(self.sdk_code)
  
          functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
  
@@ -194,7 +165,7 @@ class IqiyiIE(InfoExtractor):
      IE_NAME = 'iqiyi'
      IE_DESC = '爱奇艺'
  
-    _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
+    _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html'
  
      _NETRC_MACHINE = 'iqiyi'
  
@@ -295,6 +266,16 @@ class IqiyiIE(InfoExtractor):
              },
          }],
          'expected_warnings': ['Needs a VIP account for full video'],
+    }, {
+        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+        'info_dict': {
+            'id': '202918101',
+            'title': '灌篮高手 国语版',
+        },
+        'playlist_count': 101,
+    }, {
+        'url': 'http://www.pps.tv/w_19rrbav0ph.html',
+        'only_matching': True,
      }]
  
      _FORMATS_MAP = [
@@ -306,6 +287,13 @@ class IqiyiIE(InfoExtractor):
          ('10', 'h1'),
      ]
  
+    AUTH_API_ERRORS = {
+        # No preview available (不允许试看鉴权失败)
+        'Q00505': 'This video requires a VIP account',
+        # End of preview time (试看结束鉴权失败)
+        'Q00506': 'Needs a VIP account for full video',
+    }
+
      def _real_initialize(self):
          self._login()
  
@@ -344,7 +332,7 @@ class IqiyiIE(InfoExtractor):
              'bird_t': timestamp,
          }
          validation_result = self._download_json(
-            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None,
              note='Validate credentials', errnote='Unable to validate credentials')
  
          MSG_MAP = {
@@ -390,12 +378,19 @@ class IqiyiIE(InfoExtractor):
              auth_req, video_id,
              note='Downloading video authentication JSON',
              errnote='Unable to download video authentication JSON')
-        if auth_result['code'] == 'Q00506':  # requires a VIP account
+
+        code = auth_result.get('code')
+        msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
+        if code == 'Q00506':
              if do_report_warning:
-                self.report_warning('Needs a VIP account for full video')
+                self.report_warning(msg)
              return False
+        if 'data' not in auth_result:
+            if msg is not None:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
+            raise ExtractorError('Unexpected error from Iqiyi auth API')
  
-        return auth_result
+        return auth_result['data']
  
      def construct_video_urls(self, data, video_id, _uuid, tvid):
          def do_xor(x, y):
@@ -471,14 +466,14 @@ class IqiyiIE(InfoExtractor):
                          need_vip_warning_report = False
                          break
                      param.update({
-                        't': auth_result['data']['t'],
+                        't': auth_result['t'],
                          # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
                          'cid': 'afbe8fd3d73448c9',
                          'vid': video_id,
-                        'QY00001': auth_result['data']['u'],
+                        'QY00001': auth_result['u'],
                      })
                  api_video_url += '?' if '?' not in api_video_url else '&'
-                api_video_url += compat_urllib_parse.urlencode(param)
+                api_video_url += compat_urllib_parse_urlencode(param)
                  js = self._download_json(
                      api_video_url, video_id,
                      note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
@@ -510,34 +505,75 @@ class IqiyiIE(InfoExtractor):
              'enc': md5_text(enc_key + tail),
              'qyid': _uuid,
              'tn': random.random(),
-            'um': 0,
+            # In iQiyi's flash player, um is set to 1 when a user is logged in.
+            # Some 1080P formats are only available to logged-in users, so
+            # force um=1 here to trick the iQiyi server.
+            'um': 1,
              'authkey': md5_text(md5_text('') + tail),
              'k_tag': 1,
          }
  
          api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
-            compat_urllib_parse.urlencode(param)
+            compat_urllib_parse_urlencode(param)
          raw_data = self._download_json(api_url, video_id)
          return raw_data
  
-    def get_enc_key(self, swf_url, video_id):
+    def get_enc_key(self, video_id):
          # TODO: automatic key extraction
          # last update at 2016-01-22 for Zombie::bite
-        enc_key = '6ab6d0280511493ba85594779759d4ed'
+        enc_key = '4a1caba4b4465345366f28da7c117d20'
          return enc_key
  
+    def _extract_playlist(self, webpage):
+        PAGE_SIZE = 50
+
+        links = re.findall(
+            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
+            webpage)
+        if not links:
+            return
+
+        album_id = self._search_regex(
+            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+        album_title = self._search_regex(
+            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+        entries = list(map(self.url_result, links))
+
+        # Start from page 2 because the links on the first page are already in the webpage
+        for page_num in itertools.count(2):
+            pagelist_page = self._download_webpage(
+                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+                album_id,
+                note='Download playlist page %d' % page_num,
+                errnote='Failed to download playlist page %d' % page_num)
+            pagelist = self._parse_json(
+                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+            vlist = pagelist['data']['vlist']
+            for item in vlist:
+                entries.append(self.url_result(item['vurl']))
+            if len(vlist) < PAGE_SIZE:
+                break
+
+        return self.playlist_result(entries, album_id, album_title)
+
      def _real_extract(self, url):
          webpage = self._download_webpage(
              url, 'temp_id', note='download video page')
+
+        # There's no simple way to tell from the URL alone whether it is a
+        # playlist, so try playlist extraction first and fall back to a single video
+        playlist_result = self._extract_playlist(webpage)
+        if playlist_result:
+            return playlist_result
+
          tvid = self._search_regex(
              r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
          video_id = self._search_regex(
              r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
-        swf_url = self._search_regex(
-            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
          _uuid = uuid.uuid4().hex
  
-        enc_key = self.get_enc_key(swf_url, video_id)
+        enc_key = self.get_enc_key(video_id)
  
          raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)