[youku] compatible with Python > 3.3 or > 2.7
[youtube-dl] / youtube_dl / extractor / youku.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import sys
5 pyvs = sys.version_info[0]  # major version of the running interpreter; YoukuIE checks `pyvs == 3` to pick str vs. bytes handling
6 import re
7 import base64
8
9 from .common import InfoExtractor
10 from ..utils import ExtractorError
11
class YoukuIE(InfoExtractor):
    """Extractor for youku.com video pages, player URLs and ``youku:<id>``.

    A video is served as one or more segments per stream type.  The
    extractor downloads two ``getPlayList`` JSON documents, derives the
    per-segment download URLs from them (see ``construct_video_urls``) and
    returns either a single-video info dict or a ``multi_video`` playlist
    with one entry per segment.
    """

    IE_NAME = 'youku'
    _VALID_URL = r'''(?x)
        (?:
            http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
            youku:)
        (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
    '''

    _TEST = {
        'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
        'md5': '5f3af4192eabacc4501508d54a8cabd7',
        'info_dict': {
            'id': 'XMTc1ODE5Njcy',
            'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
            'ext': 'flv'
        }
    }

    # Stream type -> value sent as the ``hd`` query parameter.
    _HD_IDS = {
        'flv': '0',
        'mp4': '1',
        'hd2': '2',
        'hd3': '3',
        '3gp': '0',
        '3gphd': '1',
    }

    # Stream type -> file extension (also used as the ``st`` URL component).
    _EXTENSIONS = {
        'flv': 'flv',
        'mp4': 'mp4',
        'hd2': 'flv',
        'hd3': 'flv',
        '3gp': 'flv',
        '3gphd': 'mp4',
    }

    def construct_video_urls(self, data1, data2):
        """Build the download URLs of every segment of every stream type.

        data1 -- playlist dict; the keys read here are 'seed',
                 'streamtypes', 'streamfileids' and 'segs'.
        data2 -- playlist dict; the keys read here are 'ep' and 'ip'.

        Returns a dict mapping each stream type in data1['streamtypes'] to
        a list of URLs, ordered like data1['segs'][stream_type].
        """

        def yk_t(key, data):
            """Apply an RC4 keystream derived from `key` to `data`.

            `data` may be bytes (iterating yields ints on Python 3) or a
            text/byte string whose items are single characters.  The result
            is a native `str` built with chr(), one character per input
            item.
            """
            # Key scheduling: permute 0..255 according to the key.
            state = list(range(256))
            t = 0
            for i in range(256):
                t = (t + state[i] + ord(key[i % len(key)])) % 256
                state[i], state[t] = state[t], state[i]

            # Keystream generation, XORed onto each input item.
            out = []
            x, y = 0, 0
            for item in data:
                y = (y + 1) % 256
                x = (x + state[y]) % 256
                state[x], state[y] = state[y], state[x]
                value = item if isinstance(item, int) else ord(item)
                out.append(chr(value ^ state[(state[x] + state[y]) % 256]))
            # chr() yields byte strings on Python 2 and text on Python 3, so
            # the joiner has to be of the matching type.
            return ('' if pyvs == 3 else b'').join(out)

        # get sid, token: data2['ep'] is base64 text that decrypts to
        # '<sid>_<token>'.
        encrypted_ep = data2['ep']
        if pyvs == 3:
            encrypted_ep = bytes(encrypted_ep, 'ascii')
        sid, token = yk_t(
            'becaf9be', base64.b64decode(encrypted_ep)).split('_')

        # get oip
        oip = data2['ip']

        # get fileid: shuffle the alphabet with a linear congruential
        # generator seeded by data1['seed']; 'streamfileids' then holds
        # '*'-separated indices into the shuffled alphabet.
        alphabet = list(
            'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890')
        shuffled = []
        seed = data1['seed']
        while alphabet:
            seed = (seed * 0xd3 + 0x754f) % 0x10000
            shuffled.append(alphabet.pop(seed * len(alphabet) // 0x10000))

        fileids = {}
        for stream_type in data1['streamtypes']:
            indices = data1['streamfileids'][stream_type].strip('*').split('*')
            fileids[stream_type] = ''.join(shuffled[int(i)] for i in indices)

        def get_fileid(stream_type, n):
            """Return the file id of segment `n`.

            Characters 8-9 of the stream's file id are replaced by the
            segment number as two upper-case hex digits.
            """
            fileid = fileids[stream_type]
            return fileid[:8] + ('%02X' % int(n)) + fileid[10:]

        # get ep
        def generate_ep(stream_type, n):
            """Return the percent-encoded ``ep`` parameter of segment `n`."""
            plain = '%s_%s_%s' % (sid, get_fileid(stream_type, n), token)
            if pyvs == 3:
                plain = bytes(plain, 'ascii')
            ep_t = yk_t('bf7e5f01', plain)
            if pyvs == 3:
                # chr() output covers 0..255, which latin-1 maps 1:1 to bytes.
                ep_t = bytes(ep_t, 'latin')
            ep = base64.b64encode(ep_t).decode()
            # Escape the base64 characters that are special in a query
            # string; '=' is %3D (%2D would be '-').
            ep = ep.replace('+', '%2B')
            ep = ep.replace('/', '%2F')
            ep = ep.replace('=', '%3D')
            return ep

        # generate video_urls
        video_urls_dict = {}
        for stream_type in data1['streamtypes']:
            video_urls = []
            for seg in data1['segs'][stream_type]:
                n = int(seg['no'])
                video_urls.append(
                    'http://k.youku.com/player/getFlvPath/'
                    'sid/%s_%02d/st/%s/fileid/%s?K=%s&hd=%s&myp=0&ts=%s'
                    '&ypp=0&ctype=12&ev=1&token=%s&oip=%s&ep=%s' % (
                        sid,
                        n + 1,
                        self.parse_ext_l(stream_type),
                        get_fileid(stream_type, n),
                        seg['k'],
                        self.get_hd(stream_type),
                        seg['seconds'],
                        token,
                        oip,
                        generate_ep(stream_type, n),
                    ))
            video_urls_dict[stream_type] = video_urls

        return video_urls_dict

    def get_hd(self, fm):
        """Return the ``hd`` query value for stream type `fm`.

        Raises KeyError for an unknown stream type.
        """
        return self._HD_IDS[fm]

    def parse_ext_l(self, fm):
        """Return the file extension for stream type `fm`.

        Raises KeyError for an unknown stream type.
        """
        return self._EXTENSIONS[fm]

    def _real_extract(self, url):
        """Extract the info dict for `url`.

        Returns a ``multi_video`` result when the video has more than one
        segment, otherwise the single segment's info dict.  Raises
        ExtractorError when the server reports an error or no segment is
        listed for any stream type.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')

        # request basic data
        data1_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s' % video_id
        data2_url = 'http://v.youku.com/player/getPlayList/VideoIDS/%s/Pf/4/ctype/12/ev/1' % video_id

        raw_data1 = self._download_json(data1_url, video_id)
        raw_data2 = self._download_json(data2_url, video_id)
        data1 = raw_data1['data'][0]
        data2 = raw_data2['data'][0]

        error_code = data1.get('error_code')
        if error_code:
            # -8 means blocked outside China.
            # Chinese and English, separated by newline.
            error = data1.get('error')
            # %s rather than %i: the code is formatted correctly whether the
            # JSON carries it as a number or as a string.
            raise ExtractorError(
                error or 'Server reported error %s' % error_code,
                expected=True)

        title = data1['title']

        # generate video_urls_dict
        video_urls_dict = self.construct_video_urls(data1, data2)

        # construct info: entries[i] collects segment i of every stream type
        # as alternative formats of the same part.
        entries = []
        for fm in data1['streamtypes']:
            segs = data1['segs'][fm]
            for i, video_url in enumerate(video_urls_dict[fm]):
                if len(entries) < i + 1:
                    entries.append({'formats': []})
                entries[i]['formats'].append({
                    'url': video_url,
                    'format_id': fm,
                    'ext': self.parse_ext_l(fm),
                    'filesize': int(segs[i]['size']),
                })

        if not entries:
            raise ExtractorError('No video segments found for %s' % video_id)

        for i, entry in enumerate(entries):
            entry.update({
                # Prefix with the video id so parts of different videos do
                # not share the same id.
                'id': '%s_part%d' % (video_id, i + 1),
                'title': title,
            })

        if len(entries) > 1:
            info = {
                '_type': 'multi_video',
                'id': video_id,
                'title': title,
                'entries': entries,
            }
        else:
            info = entries[0]
            info['id'] = video_id

        return info
214
215         return info