[extractor/common] Improve name extraction for m3u8 formats
[youtube-dl] / youtube_dl / extractor / iqiyi.py
index 541fe7ae3053aa28bf3e3f4176ef5ff31db12e55..ddcb3c916e6a0610484dc5ceddbd84b507e761fd 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import hashlib
+import itertools
 import math
 import os
 import random
@@ -13,12 +14,14 @@ from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_str,
-    compat_urllib_parse,
+    compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    decode_packed_codes,
     ExtractorError,
     ohdave_rsa_encrypt,
+    remove_start,
     sanitized_Request,
     urlencode_postdata,
     url_basename,
@@ -124,43 +127,11 @@ class IqiyiSDK(object):
 
 
 class IqiyiSDKInterpreter(object):
-    BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
     def __init__(self, sdk_code):
         self.sdk_code = sdk_code
 
-    @classmethod
-    def base62(cls, num):
-        if num == 0:
-            return '0'
-        ret = ''
-        while num:
-            ret = cls.BASE62_TABLE[num % 62] + ret
-            num = num // 62
-        return ret
-
-    def decode_eval_codes(self):
-        self.sdk_code = self.sdk_code[5:-3]
-
-        mobj = re.search(
-            r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}",
-            self.sdk_code)
-        obfucasted_code, count, symbols = mobj.groups()
-        count = int(count)
-        symbols = symbols.split('|')
-        symbol_table = {}
-
-        while count:
-            count -= 1
-            b62count = self.base62(count)
-            symbol_table[b62count] = symbols[count] or b62count
-
-        self.sdk_code = re.sub(
-            r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
-            obfucasted_code)
-
     def run(self, target, ip, timestamp):
-        self.decode_eval_codes()
+        self.sdk_code = decode_packed_codes(self.sdk_code)
 
         functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
 
@@ -194,7 +165,7 @@ class IqiyiIE(InfoExtractor):
     IE_NAME = 'iqiyi'
     IE_DESC = '爱奇艺'
 
-    _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
+    _VALID_URL = r'https?://(?:(?:[^.]+\.)?iqiyi\.com|www\.pps\.tv)/.+\.html'
 
     _NETRC_MACHINE = 'iqiyi'
 
@@ -295,6 +266,16 @@ class IqiyiIE(InfoExtractor):
             },
         }],
         'expected_warnings': ['Needs a VIP account for full video'],
+    }, {
+        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+        'info_dict': {
+            'id': '202918101',
+            'title': '灌篮高手 国语版',
+        },
+        'playlist_count': 101,
+    }, {
+        'url': 'http://www.pps.tv/w_19rrbav0ph.html',
+        'only_matching': True,
     }]
 
     _FORMATS_MAP = [
@@ -306,6 +287,13 @@ class IqiyiIE(InfoExtractor):
         ('10', 'h1'),
     ]
 
+    AUTH_API_ERRORS = {
+        # No preview available (不允许试看鉴权失败)
+        'Q00505': 'This video requires a VIP account',
+        # End of preview time (试看结束鉴权失败)
+        'Q00506': 'Needs a VIP account for full video',
+    }
+
     def _real_initialize(self):
         self._login()
 
@@ -344,7 +332,7 @@ class IqiyiIE(InfoExtractor):
             'bird_t': timestamp,
         }
         validation_result = self._download_json(
-            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None,
             note='Validate credentials', errnote='Unable to validate credentials')
 
         MSG_MAP = {
@@ -390,12 +378,19 @@ class IqiyiIE(InfoExtractor):
             auth_req, video_id,
             note='Downloading video authentication JSON',
             errnote='Unable to download video authentication JSON')
-        if auth_result['code'] == 'Q00506':  # requires a VIP account
+
+        code = auth_result.get('code')
+        msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code
+        if code == 'Q00506':
             if do_report_warning:
-                self.report_warning('Needs a VIP account for full video')
+                self.report_warning(msg)
             return False
+        if 'data' not in auth_result:
+            if msg is not None:
+                raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True)
+            raise ExtractorError('Unexpected error from Iqiyi auth API')
 
-        return auth_result
+        return auth_result['data']
 
     def construct_video_urls(self, data, video_id, _uuid, tvid):
         def do_xor(x, y):
@@ -471,14 +466,14 @@ class IqiyiIE(InfoExtractor):
                         need_vip_warning_report = False
                         break
                     param.update({
-                        't': auth_result['data']['t'],
+                        't': auth_result['t'],
                         # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as
                         'cid': 'afbe8fd3d73448c9',
                         'vid': video_id,
-                        'QY00001': auth_result['data']['u'],
+                        'QY00001': auth_result['u'],
                     })
                 api_video_url += '?' if '?' not in api_video_url else '&'
-                api_video_url += compat_urllib_parse.urlencode(param)
+                api_video_url += compat_urllib_parse_urlencode(param)
                 js = self._download_json(
                     api_video_url, video_id,
                     note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
@@ -510,34 +505,75 @@ class IqiyiIE(InfoExtractor):
             'enc': md5_text(enc_key + tail),
             'qyid': _uuid,
             'tn': random.random(),
-            'um': 0,
+            # In iQiyi's flash player, um is set to 1 if there's a logged user
+            # Some 1080P formats are only available with a logged user.
+            # Here force um=1 to trick the iQiyi server
+            'um': 1,
             'authkey': md5_text(md5_text('') + tail),
             'k_tag': 1,
         }
 
         api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
-            compat_urllib_parse.urlencode(param)
+            compat_urllib_parse_urlencode(param)
         raw_data = self._download_json(api_url, video_id)
         return raw_data
 
-    def get_enc_key(self, swf_url, video_id):
+    def get_enc_key(self, video_id):
         # TODO: automatic key extraction
         # last update at 2016-01-22 for Zombie::bite
-        enc_key = '6ab6d0280511493ba85594779759d4ed'
+        enc_key = '4a1caba4b4465345366f28da7c117d20'
         return enc_key
 
+    def _extract_playlist(self, webpage):
+        PAGE_SIZE = 50
+
+        links = re.findall(
+            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
+            webpage)
+        if not links:
+            return
+
+        album_id = self._search_regex(
+            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+        album_title = self._search_regex(
+            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+        entries = list(map(self.url_result, links))
+
+        # Start from 2 because links in the first page are already on webpage
+        for page_num in itertools.count(2):
+            pagelist_page = self._download_webpage(
+                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+                album_id,
+                note='Download playlist page %d' % page_num,
+                errnote='Failed to download playlist page %d' % page_num)
+            pagelist = self._parse_json(
+                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+            vlist = pagelist['data']['vlist']
+            for item in vlist:
+                entries.append(self.url_result(item['vurl']))
+            if len(vlist) < PAGE_SIZE:
+                break
+
+        return self.playlist_result(entries, album_id, album_title)
+
     def _real_extract(self, url):
         webpage = self._download_webpage(
             url, 'temp_id', note='download video page')
+
+        # There's no simple way to determine whether an URL is a playlist or not
+        # So detect it
+        playlist_result = self._extract_playlist(webpage)
+        if playlist_result:
+            return playlist_result
+
         tvid = self._search_regex(
             r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
         video_id = self._search_regex(
             r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
-        swf_url = self._search_regex(
-            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
         _uuid = uuid.uuid4().hex
 
-        enc_key = self.get_enc_key(swf_url, video_id)
+        enc_key = self.get_enc_key(video_id)
 
         raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)