[swfinterp] Add support for calling methods on objects
[youtube-dl] / youtube_dl / swfinterp.py
1 from __future__ import unicode_literals
2
3 import collections
4 import io
5 import struct
6 import zlib
7
8 from .utils import (
9     compat_str,
10     ExtractorError,
11 )
12
13
14 def _extract_tags(file_contents):
15     if file_contents[1:3] != b'WS':
16         raise ExtractorError(
17             'Not an SWF file; header is %r' % file_contents[:3])
18     if file_contents[:1] == b'C':
19         content = zlib.decompress(file_contents[8:])
20     else:
21         raise NotImplementedError(
22             'Unsupported compression format %r' %
23             file_contents[:1])
24
25     # Determine number of bits in framesize rectangle
26     framesize_nbits = struct.unpack('!B', content[:1])[0] >> 3
27     framesize_len = (5 + 4 * framesize_nbits + 7) // 8
28
29     pos = framesize_len + 2 + 2
30     while pos < len(content):
31         header16 = struct.unpack('<H', content[pos:pos + 2])[0]
32         pos += 2
33         tag_code = header16 >> 6
34         tag_len = header16 & 0x3f
35         if tag_len == 0x3f:
36             tag_len = struct.unpack('<I', content[pos:pos + 4])[0]
37             pos += 4
38         assert pos + tag_len <= len(content), \
39             ('Tag %d ends at %d+%d - that\'s longer than the file (%d)'
40                 % (tag_code, pos, tag_len, len(content)))
41         yield (tag_code, content[pos:pos + tag_len])
42         pos += tag_len
43
44
45 class _AVMClass_Object(object):
46     def __init__(self, avm_class):
47         self.avm_class = avm_class
48
49     def __repr__(self):
50         return '%s#%x' % (self.avm_class.name, id(self))
51
52
53 class _ScopeDict(dict):
54     def __init__(self, avm_class):
55         super(_ScopeDict, self).__init__()
56         self.avm_class = avm_class
57
58     def __repr__(self):
59         return '%s__Scope(%s)' % (
60             self.avm_class.name,
61             super(_ScopeDict, self).__repr__())
62
63
64 class _AVMClass(object):
65     def __init__(self, name_idx, name):
66         self.name_idx = name_idx
67         self.name = name
68         self.method_names = {}
69         self.method_idxs = {}
70         self.methods = {}
71         self.method_pyfunctions = {}
72
73         self.variables = _ScopeDict(self)
74
75     def make_object(self):
76         return _AVMClass_Object(self)
77
78     def __repr__(self):
79         return '_AVMClass(%s)' % (self.name)
80
81     def register_methods(self, methods):
82         self.method_names.update(methods.items())
83         self.method_idxs.update(dict(
84             (idx, name)
85             for name, idx in methods.items()))
86
87
88 def _read_int(reader):
89     res = 0
90     shift = 0
91     for _ in range(5):
92         buf = reader.read(1)
93         assert len(buf) == 1
94         b = struct.unpack('<B', buf)[0]
95         res = res | ((b & 0x7f) << shift)
96         if b & 0x80 == 0:
97             break
98         shift += 7
99     return res
100
101
def _u30(reader):
    """Read a variable-length integer and assert that bits 28-31 are clear."""
    res = _read_int(reader)
    assert res & 0xf0000000 == 0
    return res


# Unsigned 32 bit values use the plain variable-length encoding.
# SWFInterpreter refers to this reader as _u32 (matching _u30 and _s32); the
# unprefixed u32 alias is kept so existing users of that name keep working.
_u32 = _read_int
u32 = _read_int
107
108
def _s32(reader):
    """Read a variable-length integer and interpret it as signed 32 bit."""
    raw = _read_int(reader)
    if not raw & 0x80000000:
        # Sign bit clear: the value is already correct
        return raw
    # Sign bit set: take the two's complement
    return -((raw ^ 0xffffffff) + 1)
114
115
116 def _s24(reader):
117     bs = reader.read(3)
118     assert len(bs) == 3
119     last_byte = b'\xff' if (ord(bs[2:3]) >= 0x80) else b'\x00'
120     return struct.unpack('<i', bs + last_byte)[0]
121
122
def _read_string(reader):
    """Read a string stored as a _u30 byte count followed by UTF-8 data."""
    byte_count = _u30(reader)
    encoded = reader.read(byte_count)
    # A short read means the data is truncated
    assert len(encoded) == byte_count
    text = encoded.decode('utf-8')
    return text
128
129
130 def _read_bytes(count, reader):
131     assert count >= 0
132     resb = reader.read(count)
133     assert len(resb) == count
134     return resb
135
136
def _read_byte(reader):
    """Read one byte from reader and return it as an unsigned integer."""
    (value,) = struct.unpack('<B', _read_bytes(1, reader=reader))
    return value
141
142
class SWFInterpreter(object):
    """Small interpreter for the ActionScript bytecode embedded in an SWF file.

    The constructor parses the first tag with code 82 and records the classes
    declared there together with the bytecode of their methods.
    extract_class() looks up one of those classes by name and
    extract_function() turns one of its methods into a Python callable.
    Only the opcodes listed in extract_function() are supported.
    """

    def __init__(self, file_contents):
        """Parse the bytecode found in the raw SWF bytes file_contents.

        Fills constant_strings, multinames and the table of classes. Raises
        ExtractorError for an unsupported trait kind; malformed input
        generally trips one of the assertions.
        """
        code_tag = next(tag
                        for tag_code, tag in _extract_tags(file_contents)
                        if tag_code == 82)
        # The bytecode starts right after the first NUL byte found at or
        # after offset 4 of the tag
        p = code_tag.index(b'\0', 4) + 1
        code_reader = io.BytesIO(code_tag[p:])

        # Parse ABC (AVM2 ByteCode)

        # Define a couple convenience methods
        u30 = lambda *args: _u30(*args, reader=code_reader)
        s32 = lambda *args: _s32(*args, reader=code_reader)
        # Unsigned 32 bit values use the plain variable-length encoding, so
        # read them with _read_int directly
        u32 = lambda *args: _read_int(*args, reader=code_reader)
        read_bytes = lambda *args: _read_bytes(*args, reader=code_reader)
        read_byte = lambda *args: _read_byte(*args, reader=code_reader)

        # minor_version + major_version
        read_bytes(2 + 2)

        # Constant pool
        # Every pool stores one entry less than its count; entry 0 is implicit
        int_count = u30()
        for _c in range(1, int_count):
            s32()
        uint_count = u30()
        for _c in range(1, uint_count):
            u32()
        double_count = u30()
        read_bytes(max(0, (double_count - 1)) * 8)
        string_count = u30()
        self.constant_strings = ['']
        for _c in range(1, string_count):
            s = _read_string(code_reader)
            self.constant_strings.append(s)
        namespace_count = u30()
        for _c in range(1, namespace_count):
            read_bytes(1)  # kind
            u30()  # name
        ns_set_count = u30()
        for _c in range(1, ns_set_count):
            count = u30()
            for _c2 in range(count):
                u30()
        multiname_count = u30()
        # Number of u30 fields that follow the kind of each multiname
        MULTINAME_SIZES = {
            0x07: 2,  # QName
            0x0d: 2,  # QNameA
            0x0f: 1,  # RTQName
            0x10: 1,  # RTQNameA
            0x11: 0,  # RTQNameL
            0x12: 0,  # RTQNameLA
            0x09: 2,  # Multiname
            0x0e: 2,  # MultinameA
            0x1b: 1,  # MultinameL
            0x1c: 1,  # MultinameLA
        }
        self.multinames = ['']
        for _c in range(1, multiname_count):
            kind = u30()
            assert kind in MULTINAME_SIZES, 'Invalid multiname kind %r' % kind
            if kind == 0x07:
                u30()  # namespace_idx
                name_idx = u30()
                self.multinames.append(self.constant_strings[name_idx])
            else:
                # Only QNames are resolved; other kinds get a placeholder
                self.multinames.append('[MULTINAME kind: %d]' % kind)
                for _c2 in range(MULTINAME_SIZES[kind]):
                    u30()

        # Methods
        method_count = u30()
        MethodInfo = collections.namedtuple(
            'MethodInfo',
            ['NEED_ARGUMENTS', 'NEED_REST'])
        method_infos = []
        for method_id in range(method_count):
            param_count = u30()
            u30()  # return type
            for _ in range(param_count):
                u30()  # param type
            u30()  # name index (always 0 for youtube)
            flags = read_byte()
            if flags & 0x08 != 0:
                # Options present
                option_count = u30()
                for c in range(option_count):
                    u30()  # val
                    read_bytes(1)  # kind
            if flags & 0x80 != 0:
                # Param names present
                for _ in range(param_count):
                    u30()  # param name
            mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
            method_infos.append(mi)

        # Metadata
        metadata_count = u30()
        for _c in range(metadata_count):
            u30()  # name
            item_count = u30()
            for _c2 in range(item_count):
                u30()  # key
                u30()  # value

        def parse_traits_info():
            """Consume one trait; return a {name: method index} dict.

            The dict is empty unless the trait is a method, getter, setter
            or function.
            """
            trait_name_idx = u30()
            kind_full = read_byte()
            kind = kind_full & 0x0f
            attrs = kind_full >> 4
            methods = {}
            if kind in [0x00, 0x06]:  # Slot or Const
                u30()  # Slot id
                u30()  # type_name_idx
                vindex = u30()
                if vindex != 0:
                    read_byte()  # vkind
            elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
                u30()  # disp_id
                method_idx = u30()
                methods[self.multinames[trait_name_idx]] = method_idx
            elif kind == 0x04:  # Class
                u30()  # slot_id
                u30()  # classi
            elif kind == 0x05:  # Function
                u30()  # slot_id
                function_idx = u30()
                # Keyed by name like the method case above, which is the
                # orientation _AVMClass.register_methods() works with
                methods[self.multinames[trait_name_idx]] = function_idx
            else:
                raise ExtractorError('Unsupported trait kind %d' % kind)

            if attrs & 0x4 != 0:  # Metadata present
                metadata_count = u30()
                for _c3 in range(metadata_count):
                    u30()  # metadata index

            return methods

        # Classes
        class_count = u30()
        classes = []
        # First pass: per-class instance information
        for class_id in range(class_count):
            name_idx = u30()

            cname = self.multinames[name_idx]
            avm_class = _AVMClass(name_idx, cname)
            classes.append(avm_class)

            u30()  # super_name idx
            flags = read_byte()
            if flags & 0x08 != 0:  # Protected namespace is present
                u30()  # protected_ns_idx
            intrf_count = u30()
            for _c2 in range(intrf_count):
                u30()
            u30()  # iinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                avm_class.register_methods(trait_methods)

        assert len(classes) == class_count
        self._classes_by_name = dict((c.name, c) for c in classes)

        # Second pass: one further record per class, in the same order
        for avm_class in classes:
            u30()  # cinit
            trait_count = u30()
            for _c2 in range(trait_count):
                trait_methods = parse_traits_info()
                avm_class.register_methods(trait_methods)

        # Scripts
        script_count = u30()
        for _c in range(script_count):
            u30()  # init
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # Method bodies
        method_body_count = u30()
        Method = collections.namedtuple('Method', ['code', 'local_count'])
        for _c in range(method_body_count):
            method_idx = u30()
            u30()  # max_stack
            local_count = u30()
            u30()  # init_scope_depth
            u30()  # max_scope_depth
            code_length = u30()
            code = read_bytes(code_length)
            # Attach the body to every class that registered this index
            for avm_class in classes:
                if method_idx in avm_class.method_idxs:
                    m = Method(code, local_count)
                    avm_class.methods[avm_class.method_idxs[method_idx]] = m
            exception_count = u30()
            for _c2 in range(exception_count):
                u30()  # from
                u30()  # to
                u30()  # target
                u30()  # exc_type
                u30()  # var_name
            trait_count = u30()
            for _c2 in range(trait_count):
                parse_traits_info()

        # The whole tag must have been consumed
        assert p + code_reader.tell() == len(code_tag)

    def extract_class(self, class_name):
        """Return the class called class_name.

        Raises ExtractorError if the bytecode declares no such class.
        """
        try:
            return self._classes_by_name[class_name]
        except KeyError:
            raise ExtractorError('Class %r not found' % class_name)

    def extract_function(self, avm_class, func_name):
        """Return a Python callable interpreting avm_class.func_name.

        The callable takes one argument, the sequence of call arguments, and
        returns the value produced by the method. Callables are cached in
        avm_class.method_pyfunctions. If func_name is the name of a known
        class, a fresh object of that class is returned instead.

        Raises ExtractorError if no such method exists; the callable raises
        NotImplementedError on opcodes or properties it does not support.
        """
        if func_name in avm_class.method_pyfunctions:
            return avm_class.method_pyfunctions[func_name]
        if func_name in self._classes_by_name:
            return self._classes_by_name[func_name].make_object()
        if func_name not in avm_class.methods:
            raise ExtractorError('Cannot find function %s.%s' % (
                avm_class.name, func_name))
        m = avm_class.methods[func_name]

        def resfunc(args):
            # Helper functions
            coder = io.BytesIO(m.code)
            s24 = lambda: _s24(coder)
            u30 = lambda: _u30(coder)

            # Register 0 holds the class scope, then the call arguments,
            # then one empty slot per declared local
            registers = [avm_class.variables] + list(args) + [None] * m.local_count
            stack = []
            scopes = collections.deque([
                self._classes_by_name, avm_class.variables])
            while True:
                opcode = _read_byte(coder)
                if opcode == 17:  # iftrue
                    offset = s24()
                    value = stack.pop()
                    if value:
                        coder.seek(coder.tell() + offset)
                elif opcode == 18:  # iffalse
                    offset = s24()
                    value = stack.pop()
                    if not value:
                        coder.seek(coder.tell() + offset)
                elif opcode == 36:  # pushbyte
                    v = _read_byte(coder)
                    stack.append(v)
                elif opcode == 42:  # dup
                    value = stack[-1]
                    stack.append(value)
                elif opcode == 44:  # pushstring
                    idx = u30()
                    stack.append(self.constant_strings[idx])
                elif opcode == 48:  # pushscope
                    new_scope = stack.pop()
                    scopes.append(new_scope)
                elif opcode == 70:  # callproperty
                    index = u30()
                    mname = self.multinames[index]
                    arg_count = u30()
                    # Arguments were pushed left to right, so popping
                    # yields them in reverse order
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()

                    if isinstance(obj, _AVMClass_Object):
                        func = self.extract_function(obj.avm_class, mname)
                        res = func(args)
                        stack.append(res)
                        continue
                    elif isinstance(obj, _ScopeDict):
                        if mname in obj.avm_class.method_names:
                            func = self.extract_function(obj.avm_class, mname)
                            res = func(args)
                        else:
                            res = obj[mname]
                        stack.append(res)
                        continue
                    elif isinstance(obj, compat_str):
                        if mname == 'split':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            if args[0] == '':
                                # An empty separator splits into characters
                                res = list(obj)
                            else:
                                res = obj.split(args[0])
                            stack.append(res)
                            continue
                    elif isinstance(obj, list):
                        if mname == 'slice':
                            assert len(args) == 1
                            assert isinstance(args[0], int)
                            res = obj[args[0]:]
                            stack.append(res)
                            continue
                        elif mname == 'join':
                            assert len(args) == 1
                            assert isinstance(args[0], compat_str)
                            res = args[0].join(obj)
                            stack.append(res)
                            continue
                    # Reached whenever no branch above handled the call
                    raise NotImplementedError(
                        'Unsupported property %r on %r'
                        % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 74:  # constructproperty
                    index = u30()
                    arg_count = u30()
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()

                    mname = self.multinames[index]
                    assert isinstance(obj, _AVMClass)
                    # Looked up only so that an unknown name is reported.
                    # We do not actually call the constructor for now;
                    # we just pretend it does nothing
                    self.extract_function(obj, mname)
                    stack.append(obj.make_object())
                elif opcode == 79:  # callpropvoid
                    index = u30()
                    mname = self.multinames[index]
                    arg_count = u30()
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == 'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 86:  # newarray
                    arg_count = u30()
                    arr = []
                    for i in range(arg_count):
                        arr.append(stack.pop())
                    arr = arr[::-1]
                    stack.append(arr)
                elif opcode == 93:  # findpropstrict
                    index = u30()
                    mname = self.multinames[index]
                    # Innermost scope first
                    for s in reversed(scopes):
                        if mname in s:
                            res = s
                            break
                    else:
                        res = scopes[0]
                    stack.append(res[mname])
                elif opcode == 94:  # findproperty
                    index = u30()
                    mname = self.multinames[index]
                    for s in reversed(scopes):
                        if mname in s:
                            res = s
                            break
                    else:
                        res = avm_class.variables
                    stack.append(res)
                elif opcode == 96:  # getlex
                    index = u30()
                    mname = self.multinames[index]
                    for s in reversed(scopes):
                        if mname in s:
                            scope = s
                            break
                    else:
                        scope = avm_class.variables
                    # I cannot find where static variables are initialized
                    # so let's just return None
                    res = scope.get(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30()
                    value = stack.pop()
                    idx = self.multinames[index]
                    obj = stack.pop()
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30()
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30()
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30()
                    pname = self.multinames[index]
                    if pname == 'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 115:  # convert_
                    value = stack.pop()
                    intvalue = int(value)
                    stack.append(intvalue)
                elif opcode == 128:  # coerce
                    u30()
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 160:  # add
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 + value2
                    stack.append(res)
                elif opcode == 161:  # subtract
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 - value2
                    stack.append(res)
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 175:  # greaterequals
                    value2 = stack.pop()
                    value1 = stack.pop()
                    result = value1 >= value2
                    stack.append(result)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 212:  # setlocal_0
                    registers[0] = stack.pop()
                elif opcode == 213:  # setlocal_1
                    registers[1] = stack.pop()
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        'Unsupported opcode %d' % opcode)

        avm_class.method_pyfunctions[func_name] = resfunc
        return resfunc
596