[openload] Fix extraction (closes #9706)
[youtube-dl] / youtube_dl / extractor / openload.py
1 # coding: utf-8
2 from __future__ import unicode_literals, division
3
4 import math
5
6 from .common import InfoExtractor
7 from ..compat import compat_chr
8 from ..utils import (
9     decode_png,
10     determine_ext,
11     ExtractorError,
12 )
13
14
class OpenloadIE(InfoExtractor):
    """Information extractor for openload.co / openload.io video pages.

    The stream URL is not read directly from the page. It is rebuilt from
    two inputs: the pixel data of the PNG referenced by the page's
    ``<img id="linkimg">`` element, and the ``window.signatureNumbers``
    string served from a separate JavaScript asset.
    """

    _VALID_URL = r'https://openload\.(?:co|io)/(?:f|embed)/(?P<id>[a-zA-Z0-9-_]+)'

    _TESTS = [{
        'url': 'https://openload.co/f/kUEfGclsU9o',
        'md5': 'bf1c059b004ebc7a256f89408e65c36e',
        'info_dict': {
            'id': 'kUEfGclsU9o',
            'ext': 'mp4',
            'title': 'skyrim_no-audio_1080.mp4',
            'thumbnail': r're:^https?://.*\.jpg$',
        },
    }, {
        'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4',
        'only_matching': True,
    }, {
        'url': 'https://openload.io/f/ZAn6oz-VZGE/',
        'only_matching': True,
    }, {
        'url': 'https://openload.co/f/_-ztPaZtMhM/',
        'only_matching': True,
    }, {
        # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout
        # for title and ext
        'url': 'https://openload.co/embed/Sxz5sADo82g/',
        'only_matching': True,
    }]

    @staticmethod
    def _pixels_to_text(width, height, pixels):
        """Convert decoded image rows into text, one character per channel.

        Each row ``pixels[y]`` is read as a flat sequence of ``r, g, b``
        values. Every channel value of a pixel becomes one character. A
        pixel whose three channels are all zero ends the current row; the
        following rows are still processed.
        """
        chars = []
        for y in range(height):
            for x in range(width):
                r, g, b = pixels[y][3 * x:3 * x + 3]
                if r == 0 and g == 0 and b == 0:
                    break
                chars.extend((compat_chr(r), compat_chr(g), compat_chr(b)))
        return ''.join(chars)

    @staticmethod
    def _split_table(data, chunk_size, rows=10):
        """Split ``data`` into ``rows`` rows of ``chunk_size``-long chunks.

        The number of chunks per row is ``len(data) // (rows * chunk_size)``;
        any trailing data that does not fill the table is ignored.
        """
        per_row = len(data) // (rows * chunk_size)
        table = []
        for i in range(rows):
            offset = i * per_row * chunk_size
            table.append([
                data[offset + j * chunk_size:offset + (j + 1) * chunk_size]
                for j in range(per_row)])
        return table

    @staticmethod
    def _decode_part(img_row, sig_row):
        """Decode one component of the stream URL.

        ``img_row`` and ``sig_row`` are matching rows of the image and
        signature tables. The signature chunk is compared, position by
        position, with an expected character; on a match the image character
        at the same position is taken. At most one character more than the
        chunk index is collected by the time a chunk is finished. Commas are
        stripped from the result.

        Raises ExtractorError if the signature row has more chunks than the
        image row, since the two inputs then cannot belong together.
        """
        if len(sig_row) > len(img_row):
            raise ExtractorError(
                'Signature numbers do not match the link image')

        decoded = []
        # Code point of the character expected next in the signature chunk:
        # starts at 99, advances by 2.5 after every match and restarts at 98
        # once it exceeds 122. Only the integer part selects the character.
        expected_code = 99.0
        # zip() stops at the end of sig_row, which is the shorter (or equal)
        # row thanks to the check above.
        for j, (sig_chunk, img_chunk) in enumerate(zip(sig_row, img_row)):
            for idx, img_char in enumerate(img_chunk):
                if expected_code > 122.0:
                    expected_code = 98.0
                expected = compat_chr(int(math.floor(expected_code)))
                if sig_chunk[idx] == expected and j >= len(decoded):
                    expected_code += 2.5
                    decoded.append(img_char)
        return ''.join(decoded).replace(',', '')

    def _real_extract(self, url):
        """Download the page and its helper assets and build the info dict.

        Raises ExtractorError (expected) when the page reports
        'File not found', and ExtractorError when the signature numbers and
        the link image are inconsistent with each other.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        if 'File not found' in webpage:
            raise ExtractorError('File not found', expected=True)

        # The following extraction logic is proposed by @Belderak and @gdkchan
        # and declared to be used freely in youtube-dl
        # See https://github.com/rg3/youtube-dl/issues/9706

        numbers_js = self._download_webpage(
            'https://openload.co/assets/js/obfuscator/n.js', video_id,
            note='Downloading signature numbers')
        signums = self._search_regex(
            r'window\.signatureNumbers\s*=\s*[\'"](?P<data>[a-z]+)[\'"]',
            numbers_js, 'signature numbers', group='data')

        linkimg_uri = self._search_regex(
            r'<img[^>]+id="linkimg"[^>]+src="([^"]+)"', webpage, 'link image')
        linkimg = self._request_webpage(
            linkimg_uri, video_id, note=False).read()

        width, height, pixels = decode_png(linkimg)
        output = self._pixels_to_text(width, height, pixels)

        # Image text is cut into 20-character chunks, the signature string
        # into 26-character chunks; both are laid out as 10-row tables.
        img_str = self._split_table(output, 20)
        sig_str = self._split_table(signums, 26)

        # Only rows 2, 3, 5 and 7 contribute to the stream URL.
        parts = [
            self._decode_part(img_str[i], sig_str[i])
            for i in (2, 3, 5, 7)]

        video_url = 'https://openload.co/stream/%s~%s~%s~%s' % (parts[3], parts[1], parts[2], parts[0])

        title = self._og_search_title(webpage, default=None) or self._search_regex(
            r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
            'title', default=None) or self._html_search_meta(
            'description', webpage, 'title', fatal=True)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'url': video_url,
            # Seems all videos have extensions in their titles
            'ext': determine_ext(title),
        }