[streamango] Fix extraction (closes #14160)
[youtube-dl] / youtube_dl / extractor / streamango.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import (
8     determine_ext,
9     int_or_none,
10     js_to_json,
11 )
12
13
class StreamangoIE(InfoExtractor):
    """Extractor for video pages on streamango.com.

    Matches both ``/f/<id>`` and ``/embed/<id>`` URLs.  The page embeds one
    JavaScript object literal per available format; the ``src`` member of
    each object is a call carrying an obfuscated string and a numeric key,
    which is decoded here into the actual media URL.
    """

    _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
        'md5': 'e992787515a182f55e38fc97588d802a',
        'info_dict': {
            'id': 'clapasobsptpkdfe',
            'ext': 'mp4',
            'title': '20170315_150006.mp4',
        }
    }, {
        # no og:title
        'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4',
        'info_dict': {
            'id': 'foqebrpftarclpob',
            'ext': 'mp4',
            'title': 'foqebrpftarclpob',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
        'only_matching': True,
    }]

    @staticmethod
    def _decrypt_src(encoded, val):
        """Decode an obfuscated ``src`` payload into a URL string.

        The payload is a base64-like encoding over a custom alphabet: every
        group of four symbols yields up to three characters, and the first
        character of each group is additionally XOR-ed with ``val``.

        :param encoded: obfuscated string taken from the page; characters
            outside ``A-Za-z0-9+/=`` are discarded before decoding.
        :param val: integer key XOR-ed into the first character of each group.
        :returns: the decoded string (empty for an empty payload), or ``None``
            when the cleaned payload length is not a multiple of four and
            therefore cannot be split into complete groups.
        """
        # A symbol's position in this table is its 6-bit value; position 0x40
        # (the final character, 'A') marks padding.
        alphabet = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA'
        padding = 0x40

        encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded)
        if len(encoded) % 4:
            # A truncated trailing group cannot be decoded; report failure
            # rather than raising IndexError part-way through.
            return None

        # NOTE(review): the first character of a group can exceed 255 when
        # ``val`` is large, which Python 2's chr() rejects - confirm whether
        # a unichr-compatible helper is required for Python 2 support.
        decoded = []
        for start in range(0, len(encoded), 4):
            sm = [alphabet.index(symbol) for symbol in encoded[start:start + 4]]
            decoded.append(chr(((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val))
            if sm[2] != padding:
                decoded.append(chr(((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2)))
            if sm[3] != padding:
                decoded.append(chr(((sm[2] & 0x3) << 0x6) | sm[3]))
        return ''.join(decoded)

    def _real_extract(self, url):
        """Download the page at ``url`` and build its info dict.

        Returns a dict with ``id``, ``url`` (the page URL as given),
        ``title`` and ``formats``.  The title is the page's og:title, falling
        back to the video id when the page has none.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        title = self._og_search_title(webpage, default=video_id)

        formats = []
        # Each candidate is a JS object literal passed as a call argument and
        # containing a ``src:`` member.
        for format_ in re.findall(r'\(\s*({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
            # ``src`` is a function call; accept a decoder function name of
            # any length, not just a single character.
            src_mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_)
            if src_mobj is None:
                continue

            # Strip the call expression so the remainder can be parsed as
            # plain JSON.  The remaining metadata is optional: an object that
            # held nothing but ``src`` (or whose metadata fails to parse) must
            # not cause a decodable format to be dropped.
            video = self._parse_json(
                format_.replace(src_mobj.group(0), ''), video_id,
                transform_source=js_to_json, fatal=False) or {}

            # Call arguments: a quoted payload followed by a numeric key.
            args_mobj = re.search(
                r'[\'"](?P<src>[^\'"]+)[\'"]\s*,\s*(?P<val>\d+)',
                src_mobj.group(1))
            if args_mobj is None:
                continue

            src = self._decrypt_src(
                args_mobj.group('src'), int_or_none(args_mobj.group('val')))
            if not src:
                # Undecodable or empty payload: there is no usable URL.
                continue

            ext = determine_ext(src, default_ext=None)
            if video.get('type') == 'application/dash+xml' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src, video_id, mpd_id='dash', fatal=False))
            else:
                formats.append({
                    'url': src,
                    'ext': ext or 'mp4',
                    'width': int_or_none(video.get('width')),
                    'height': int_or_none(video.get('height')),
                    'tbr': int_or_none(video.get('bitrate')),
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'url': url,
            'title': title,
            'formats': formats,
        }