[compat] Add compat_urllib_parse_urlencode and eliminate encode_dict
[youtube-dl] / youtube_dl / extractor / iqiyi.py
index 541fe7ae3053aa28bf3e3f4176ef5ff31db12e55..9e8c9432a6947ad2ad1866e257e577c98c3ac38b 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 import hashlib
+import itertools
 import math
 import os
 import random
@@ -13,12 +14,14 @@ from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
     compat_str,
-    compat_urllib_parse,
+    compat_urllib_parse_urlencode,
     compat_urllib_parse_urlparse,
 )
 from ..utils import (
+    decode_packed_codes,
     ExtractorError,
     ohdave_rsa_encrypt,
+    remove_start,
     sanitized_Request,
     urlencode_postdata,
     url_basename,
@@ -124,43 +127,11 @@ class IqiyiSDK(object):
 
 
 class IqiyiSDKInterpreter(object):
-    BASE62_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-
     def __init__(self, sdk_code):
         self.sdk_code = sdk_code
 
-    @classmethod
-    def base62(cls, num):
-        if num == 0:
-            return '0'
-        ret = ''
-        while num:
-            ret = cls.BASE62_TABLE[num % 62] + ret
-            num = num // 62
-        return ret
-
-    def decode_eval_codes(self):
-        self.sdk_code = self.sdk_code[5:-3]
-
-        mobj = re.search(
-            r"'([^']+)',62,(\d+),'([^']+)'\.split\('\|'\),[^,]+,{}",
-            self.sdk_code)
-        obfucasted_code, count, symbols = mobj.groups()
-        count = int(count)
-        symbols = symbols.split('|')
-        symbol_table = {}
-
-        while count:
-            count -= 1
-            b62count = self.base62(count)
-            symbol_table[b62count] = symbols[count] or b62count
-
-        self.sdk_code = re.sub(
-            r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
-            obfucasted_code)
-
     def run(self, target, ip, timestamp):
-        self.decode_eval_codes()
+        self.sdk_code = decode_packed_codes(self.sdk_code)
 
         functions = re.findall(r'input=([a-zA-Z0-9]+)\(input', self.sdk_code)
 
@@ -194,7 +165,7 @@ class IqiyiIE(InfoExtractor):
     IE_NAME = 'iqiyi'
     IE_DESC = '爱奇艺'
 
-    _VALID_URL = r'http://(?:[^.]+\.)?iqiyi\.com/.+\.html'
+    _VALID_URL = r'https?://(?:[^.]+\.)?iqiyi\.com/.+\.html'
 
     _NETRC_MACHINE = 'iqiyi'
 
@@ -295,6 +266,13 @@ class IqiyiIE(InfoExtractor):
             },
         }],
         'expected_warnings': ['Needs a VIP account for full video'],
+    }, {
+        'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html',
+        'info_dict': {
+            'id': '202918101',
+            'title': '灌篮高手 国语版',
+        },
+        'playlist_count': 101,
     }]
 
     _FORMATS_MAP = [
@@ -344,7 +322,7 @@ class IqiyiIE(InfoExtractor):
             'bird_t': timestamp,
         }
         validation_result = self._download_json(
-            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse.urlencode(validation_params), None,
+            'http://kylin.iqiyi.com/validate?' + compat_urllib_parse_urlencode(validation_params), None,
             note='Validate credentials', errnote='Unable to validate credentials')
 
         MSG_MAP = {
@@ -478,7 +456,7 @@ class IqiyiIE(InfoExtractor):
                         'QY00001': auth_result['data']['u'],
                     })
                 api_video_url += '?' if '?' not in api_video_url else '&'
-                api_video_url += compat_urllib_parse.urlencode(param)
+                api_video_url += compat_urllib_parse_urlencode(param)
                 js = self._download_json(
                     api_video_url, video_id,
                     note='Download video info of segment %d for format %s' % (segment_index + 1, format_id))
@@ -516,28 +494,66 @@ class IqiyiIE(InfoExtractor):
         }
 
         api_url = 'http://cache.video.qiyi.com/vms' + '?' + \
-            compat_urllib_parse.urlencode(param)
+            compat_urllib_parse_urlencode(param)
         raw_data = self._download_json(api_url, video_id)
         return raw_data
 
-    def get_enc_key(self, swf_url, video_id):
+    def get_enc_key(self, video_id):
         # TODO: automatic key extraction
         # last update at 2016-01-22 for Zombie::bite
-        enc_key = '6ab6d0280511493ba85594779759d4ed'
+        enc_key = '4a1caba4b4465345366f28da7c117d20'
         return enc_key
 
+    def _extract_playlist(self, webpage):
+        PAGE_SIZE = 50
+
+        links = re.findall(
+            r'<a[^>]+class="site-piclist_pic_link"[^>]+href="(http://www\.iqiyi\.com/.+\.html)"',
+            webpage)
+        if not links:
+            return
+
+        album_id = self._search_regex(
+            r'albumId\s*:\s*(\d+),', webpage, 'album ID')
+        album_title = self._search_regex(
+            r'data-share-title="([^"]+)"', webpage, 'album title', fatal=False)
+
+        entries = list(map(self.url_result, links))
+
+        # Start from 2 because links in the first page are already on webpage
+        for page_num in itertools.count(2):
+            pagelist_page = self._download_webpage(
+                'http://cache.video.qiyi.com/jp/avlist/%s/%d/%d/' % (album_id, page_num, PAGE_SIZE),
+                album_id,
+                note='Download playlist page %d' % page_num,
+                errnote='Failed to download playlist page %d' % page_num)
+            pagelist = self._parse_json(
+                remove_start(pagelist_page, 'var tvInfoJs='), album_id)
+            vlist = pagelist['data']['vlist']
+            for item in vlist:
+                entries.append(self.url_result(item['vurl']))
+            if len(vlist) < PAGE_SIZE:
+                break
+
+        return self.playlist_result(entries, album_id, album_title)
+
     def _real_extract(self, url):
         webpage = self._download_webpage(
             url, 'temp_id', note='download video page')
+
+        # There's no simple way to determine whether an URL is a playlist or not
+        # So detect it
+        playlist_result = self._extract_playlist(webpage)
+        if playlist_result:
+            return playlist_result
+
         tvid = self._search_regex(
             r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
         video_id = self._search_regex(
             r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
-        swf_url = self._search_regex(
-            r'(http://[^\'"]+MainPlayer[^.]+\.swf)', webpage, 'swf player URL')
         _uuid = uuid.uuid4().hex
 
-        enc_key = self.get_enc_key(swf_url, video_id)
+        enc_key = self.get_enc_key(video_id)
 
         raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid)