[youtube] Move swfinterp into its own file
[youtube-dl] / youtube_dl / swfinterp.py
from __future__ import unicode_literals

import collections
import io
import struct
import zlib

from .utils import (
    ExtractorError,
    compat_str,
)
9
10
11 def _extract_tags(content):
12     pos = 0
13     while pos < len(content):
14         header16 = struct.unpack('<H', content[pos:pos + 2])[0]
15         pos += 2
16         tag_code = header16 >> 6
17         tag_len = header16 & 0x3f
18         if tag_len == 0x3f:
19             tag_len = struct.unpack('<I', content[pos:pos + 4])[0]
20             pos += 4
21         assert pos + tag_len <= len(content)
22         yield (tag_code, content[pos:pos + tag_len])
23         pos += tag_len
24
25
26 class _AVMClass_Object(object):
27     def __init__(self, avm_class):
28         self.avm_class = avm_class
29
30     def __repr__(self):
31         return '%s#%x' % (self.avm_class.name, id(self))
32
33
34 class _AVMClass(object):
35     def __init__(self, name_idx, name):
36         self.name_idx = name_idx
37         self.name = name
38         self.method_names = {}
39         self.method_idxs = {}
40         self.methods = {}
41         self.method_pyfunctions = {}
42         self.variables = {}
43
44     def make_object(self):
45         return _AVMClass_Object(self)
46
47
48 def _read_int(reader):
49     res = 0
50     shift = 0
51     for _ in range(5):
52         buf = reader.read(1)
53         assert len(buf) == 1
54         b = struct.unpack('<B', buf)[0]
55         res = res | ((b & 0x7f) << shift)
56         if b & 0x80 == 0:
57             break
58         shift += 7
59     return res
60
61
def _u30(reader):
    """Read a variable-length unsigned integer and check its range.

    The value is decoded with ``_read_int``; the assertion rejects values
    that have any of bits 28-31 set.
    """
    res = _read_int(reader)
    assert res & 0xf0000000 == 0
    return res


# Unsigned 32-bit values use the plain variable-length decoding.
# SWFInterpreter refers to this helper as ``_u32``; the alias used to be
# spelled ``u32`` only, which left ``_u32`` undefined.
_u32 = _read_int
# Original spelling, kept so that existing references keep working.
u32 = _read_int
67
68
def _s32(reader):
    """Read a variable-length integer and interpret it as signed.

    Values with bit 31 clear are returned unchanged; otherwise the
    two's-complement negation over 32 bits is returned.
    """
    raw = _read_int(reader)
    if raw & 0x80000000 == 0:
        return raw
    # Bit 31 set: flip the low 32 bits, add one and negate
    return -((raw ^ 0xffffffff) + 1)
74
75
76 def _s24(reader):
77     bs = reader.read(3)
78     assert len(bs) == 3
79     first_byte = b'\xff' if (ord(bs[0:1]) >= 0x80) else b'\x00'
80     return struct.unpack('!i', first_byte + bs)
81
82
def _read_string(reader):
    """Read a length-prefixed UTF-8 string from ``reader``.

    The byte length comes first (read with ``_u30``), followed by that
    many bytes of UTF-8 data.
    """
    length = _u30(reader)
    raw = reader.read(length)
    # A short read means the data is truncated
    assert len(raw) == length
    text = raw.decode('utf-8')
    return text
88
89
90 def _read_bytes(count, reader):
91     if reader is None:
92         reader = code_reader
93     resb = reader.read(count)
94     assert len(resb) == count
95     return resb
96
97
def _read_byte(reader):
    """Read one byte from ``reader`` and return it as an unsigned int."""
    (value,) = struct.unpack('<B', _read_bytes(1, reader=reader))
    return value
102
103
class SWFInterpreter(object):
    """Parse the bytecode embedded in an SWF file and run methods from it.

    The constructor decompresses the file, locates the first tag with code
    82 (DoABC) and walks the ABC data stored in it.  Afterwards:

    * ``constant_strings`` is the string constant pool (index 0 is '').
    * ``multinames`` maps multiname indices to names.  Only kind 0x07
      (QName) entries are resolved; every other kind gets a placeholder
      string.
    * ``extract_class`` looks up a parsed class by name and
      ``extract_function`` turns one of its methods into a Python callable.

    Only the opcodes handled in ``extract_function`` are supported.
    """

    @staticmethod
    def _frame_header_length(content):
        """Return how many bytes precede the first tag in ``content``.

        The decompressed body starts with the frame size rectangle (a
        5-bit field width followed by four fields of that width, padded to
        a whole byte), the 16-bit frame rate and the 16-bit frame count.
        Returns 0 for empty ``content``.
        """
        if not content:
            return 0
        nbits = struct.unpack('!B', content[:1])[0] >> 3
        rect_len = (5 + 4 * nbits + 7) // 8
        return rect_len + 2 + 2

    def __init__(self, file_contents):
        """Parse ``file_contents``, the raw bytes of a compressed SWF file.

        Raises ``ExtractorError`` when the signature is not an SWF one or
        when the file has no tag with code 82, and ``NotImplementedError``
        for anything but zlib-compressed ('CWS') files.
        """
        if file_contents[1:3] != b'WS':
            raise ExtractorError(
                'Not an SWF file; header is %r' % file_contents[:3])
        if file_contents[:1] == b'C':
            content = zlib.decompress(file_contents[8:])
        else:
            raise NotImplementedError(
                'Unsupported compression format %r' %
                file_contents[:1])

        # The tag records only start after the frame size / rate / count
        # fields, so skip those before scanning for the code tag.
        tags_start = self._frame_header_length(content)
        code_tag = next(
            (tag
             for tag_code, tag in _extract_tags(content[tags_start:])
             if tag_code == 82),
            None)
        if code_tag is None:
            # Do not leak a bare StopIteration to the caller
            raise ExtractorError(
                'No tag with code 82 (DoABC) found in SWF file')
        # Skip the 4 flag bytes and the NUL-terminated name
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)

        # Convenience readers bound to the ABC data
        def u30():
            return _u30(code_reader)

        def s32():
            return _s32(code_reader)

        def u32():
            # Plain variable-length decoding, no range check
            return _read_int(code_reader)

        def read_bytes(count):
            return _read_bytes(count, code_reader)

        def read_byte():
            return _read_byte(code_reader)

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool.  Every count includes the implicit entry 0, which
        # is not stored in the file; hence the ranges starting at 1.
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        # A count of 0 must not turn into a negative read size
        read_bytes(max(0, double_count - 1) * 8)
        string_count = u30()
        # Kept on the instance: the interpreter needs it for pushstring
        self.constant_strings = ['']
        for _c in range(1, string_count):
            self.constant_strings.append(_read_string(code_reader))
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields that follow each multiname kind
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        self.multinames = ['']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                self.multinames.append(self.constant_strings[name_idx])
            else:
                self.multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods (signatures only; nothing from them is kept)
        method_count = u30()
        for _c in range(method_count):
            param_count = u30()
            u30()  # return type
            for _c2 in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for _c2 in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _c2 in range(param_count):
                    u30()  # param name

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            """Consume one trait entry and return the methods it declares."""
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[self.multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                # NOTE(review): this maps index -> name, the reverse of
                # the Method / Getter / Setter branch above -- confirm
                # whether that is intended.
                methods[function_idx] = self.multinames[trait_name_idx]
            else:
                raise ExtractorError('Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes (instance side; the traits are consumed and discarded)
        class_count = u30()
        classes = []
        for _c in range(class_count):
            name_idx = u30()
            classes.append(_AVMClass(name_idx, self.multinames[name_idx]))
            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()
        assert len(classes) == class_count
        self._classes_by_name = dict((c.name, c) for c in classes)

        # Classes (static side; these traits provide the method tables)
        for avm_class in classes:
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                avm_class.method_names.update(trait_methods.items())
                avm_class.method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            # Attach the body to every class that declared this method
            for avm_class in classes:
                if method_idx in avm_class.method_idxs:
                    m = Method(code, local_count)
                    avm_class.methods[avm_class.method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Everything in the tag must have been consumed
        assert p + code_reader.tell() == len(code_tag)

    def extract_class(self, class_name):
        """Return the parsed class called ``class_name``.

        Raises ``ExtractorError`` when the file declares no such class.
        """
        try:
            return self._classes_by_name[class_name]
        except KeyError:
            raise ExtractorError('Class %r not found' % class_name)

    def extract_function(self, avm_class, func_name):
        """Resolve ``func_name`` in the context of ``avm_class``.

        Returns, in this order of precedence: the already built Python
        function for that method; a new object if ``func_name`` is the name
        of a parsed class; otherwise a newly built (and cached) Python
        function that interprets the method's bytecode.  The function takes
        a single list of arguments.

        Raises ``ExtractorError`` when ``avm_class`` has no such method.
        Running the function raises ``NotImplementedError`` on opcodes or
        properties that are not supported.
        """
        if func_name in avm_class.method_pyfunctions:
            return avm_class.method_pyfunctions[func_name]
        if func_name in self._classes_by_name:
            return self._classes_by_name[func_name].make_object()
        if func_name not in avm_class.methods:
            raise ExtractorError('Cannot find function %r' % func_name)
        m = avm_class.methods[func_name]

        def resfunc(args):
            # Readers bound to this method's bytecode
            coder = io.BytesIO(m.code)

            def s24():
                return _s24(coder)

            def u30():
                return _u30(coder)

            # Register 0 stands for "this"; the arguments follow, then the
            # method's locals.
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            while True:
                opcode = _read_byte(coder)
                if opcode == 17:  # iftrue
                    offset = s24()
                    value = stack.pop()
                    if value:
                        # Offsets are relative to the next instruction
                        coder.seek(coder.tell() + offset)
                elif opcode == 36:  # pushbyte
                    v = _read_byte(coder)
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30()
                    stack.append(self.constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30()
                    mname = self.multinames[index]
                    arg_count = u30()
                    # Arguments were pushed left to right
                    call_args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == 'split':
                        assert len(call_args) == 1
                        assert isinstance(call_args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if call_args[0] == '':
                            # Empty separator: split into characters
                            res = list(obj)
                        else:
                            res = obj.split(call_args[0])
                        stack.append(res)
                    elif mname == 'slice':
                        assert len(call_args) == 1
                        assert isinstance(call_args[0], int)
                        assert isinstance(obj, list)
                        res = obj[call_args[0]:]
                        stack.append(res)
                    elif mname == 'join':
                        assert len(call_args) == 1
                        assert isinstance(call_args[0], compat_str)
                        assert isinstance(obj, list)
                        res = call_args[0].join(obj)
                        stack.append(res)
                    elif mname in avm_class.method_pyfunctions:
                        stack.append(
                            avm_class.method_pyfunctions[mname](call_args))
                    else:
                        raise NotImplementedError(
                            'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 74:  # constructproperty
                    index = u30()
                    arg_count = u30()
                    # The constructor arguments are popped and dropped
                    for _ in range(arg_count):
                        stack.pop()
                    obj = stack.pop()

                    mname = self.multinames[index]
                    # Resolve the name (raises if it is unknown), but
                    # we do not actually call the constructor for now;
                    # we just pretend it does nothing
                    self.extract_function(obj.avm_class, mname)
                    stack.append(obj)
                elif opcode == 79:  # callpropvoid
                    index = u30()
                    mname = self.multinames[index]
                    arg_count = u30()
                    # No supported void property takes arguments
                    for _ in range(arg_count):
                        stack.pop()
                    obj = stack.pop()
                    if mname == 'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 86:  # newarray
                    arg_count = u30()
                    arr = [stack.pop() for _ in range(arg_count)]
                    # Elements were pushed first to last
                    arr.reverse()
                    stack.append(arr)
                elif opcode == 93:  # findpropstrict
                    index = u30()
                    mname = self.multinames[index]
                    res = self.extract_function(avm_class, mname)
                    stack.append(res)
                elif opcode == 94:  # findproperty
                    index = u30()
                    mname = self.multinames[index]
                    res = avm_class.variables.get(mname)
                    stack.append(res)
                elif opcode == 96:  # getlex
                    index = u30()
                    mname = self.multinames[index]
                    res = avm_class.variables.get(mname, None)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30()
                    value = stack.pop()
                    idx = self.multinames[index]
                    obj = stack.pop()
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30()
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30()
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30()
                    pname = self.multinames[index]
                    if pname == 'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30()
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 175:  # greaterequals
                    value2 = stack.pop()
                    value1 = stack.pop()
                    result = value1 >= value2
                    stack.append(result)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        'Unsupported opcode %d' % opcode)

        # Cache the function so that later lookups (including calls made
        # from other interpreted methods) can find it
        avm_class.method_pyfunctions[func_name] = resfunc
        return resfunc
503