[vshare] Fix extraction (closes #14473)
[youtube-dl] / youtube_dl / extractor / vshare.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_chr
8 from ..utils import decode_packed_codes
9
10
class VShareIE(InfoExtractor):
    """Information extractor for videos hosted on vshare.io.

    Both the ``/d/<id>`` and the ``/v/<id>`` URL forms are accepted; the
    actual extraction always goes through the ``/v/<id>`` player page.
    """

    _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://vshare.io/d/0f64ce6',
        'md5': '17b39f55b5497ae8b59f5fbce8e35886',
        'info_dict': {
            'id': '0f64ce6',
            'title': 'vl14062007715967',
            'ext': 'mp4',
        }
    }, {
        'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
        'only_matching': True,
    }]

    def _extract_packed(self, webpage):
        """Decode the obfuscated markup carried by the player page.

        The page embeds a packed ``eval(function...`` script.  Once
        unpacked, that script contains a bracketed list of integer
        character codes and a ``fromCharCode`` call ending in a numeric
        offset; subtracting the offset from every code gives the
        characters of the hidden text.

        Returns the decoded text as one string.  A missing script, code
        list or offset makes the corresponding ``_search_regex`` call
        fail.
        """
        packed = self._search_regex(
            r'(eval\(function.+)', webpage, 'packed code')
        unpacked = decode_packed_codes(packed)

        # Matches the same lists as the previous r'\[((?:\d+,?)+)\]', but
        # without a quantifier nested in a quantifier (which could
        # backtrack exponentially on a long unterminated run of digits),
        # and with an optional trailing comma left out of the capture so
        # that every item produced by split(',') is a valid integer.
        digits = self._search_regex(
            r'\[(\d+(?:,\d+)*),?\]', unpacked, 'digits')

        # Convert the offset once instead of once per character.
        key = int(self._search_regex(
            r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit'))

        return ''.join(
            compat_chr(int(digit) - key) for digit in digits.split(','))

    def _real_extract(self, url):
        """Build the info dict (``id``, ``title``, ``formats``) for *url*."""
        video_id = self._match_id(url)

        # Whatever URL form was given, the player page is the one fetched.
        webpage = self._download_webpage(
            'https://vshare.io/v/%s/width-650/height-430/1' % video_id,
            video_id)

        title = self._html_search_regex(
            r'<title>([^<]+)</title>', webpage, 'title')
        # Keep only the part of the page title before the first ' - '.
        title = title.split(' - ')[0]

        unpacked = self._extract_packed(webpage)
        formats = [
            {'url': video_url}
            for video_url in re.findall(r'<source src="([^"]+)', unpacked)]
        # youtube-dl expects 'formats' ordered from worst to best; the
        # base class helper does the ordering and reports an empty list.
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'formats': formats,
        }

    @staticmethod
    def _extract_urls(webpage):
        """Return the vshare.io ``/v/`` URLs used as ``<iframe>`` sources.

        The result is a list of strings, one per matching iframe in
        *webpage*; URLs may be protocol-relative (start with ``//``).
        """
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)',
            webpage)