[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / kinopoisk.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5 from ..utils import (
6     dict_get,
7     int_or_none,
8 )
9
10
class KinoPoiskIE(InfoExtractor):
    """Extractor for ``kinopoisk.ru/film/<id>`` pages."""

    _GEO_COUNTRIES = ['RU']
    _VALID_URL = r'https?://(?:www\.)?kinopoisk\.ru/film/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://www.kinopoisk.ru/film/81041/watch/',
            'md5': '4f71c80baea10dfa54a837a46111d326',
            'info_dict': {
                'id': '81041',
                'ext': 'mp4',
                'title': 'Алеша попович и тугарин змей',
                'description': 'md5:43787e673d68b805d0aa1df5a5aea701',
                'thumbnail': r're:^https?://.*',
                'duration': 4533,
                'age_limit': 12,
            },
            'params': {
                'format': 'bestvideo',
            },
        },
        {
            'url': 'https://www.kinopoisk.ru/film/81041',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Build the info dict for the film referenced by *url*.

        The numeric film id is taken from the URL, the ott-widget page for
        that id is downloaded, and the JSON embedded in its
        ``application/json`` script tag supplies both the film metadata
        (``models.filmStatus``) and the HLS playlist location
        (``models.playlistEntity.uri``).

        Raises KeyError when ``models``, ``filmStatus``, ``playlistEntity``
        or ``uri`` is absent, or when the film has neither a ``title`` nor
        an ``originalTitle``.
        """
        video_id = self._match_id(url)

        widget_page = self._download_webpage(
            'https://ott-widget.kinopoisk.ru/v1/kp/', video_id,
            query={'kpId': video_id})

        # The widget page embeds its state as JSON inside a <script> tag.
        embedded_json = self._search_regex(
            r'(?s)<script[^>]+\btype=["\']application/json[^>]+>(.+?)<',
            widget_page, 'data')
        models = self._parse_json(embedded_json, video_id)['models']

        film = models['filmStatus']
        # Fall back to the original title only when 'title' is missing/empty.
        title = film.get('title')
        if not title:
            title = film['originalTitle']

        playlist_url = models['playlistEntity']['uri']
        formats = self._extract_m3u8_formats(
            playlist_url, video_id, 'mp4',
            entry_protocol='m3u8_native', m3u8_id='hls')
        self._sort_formats(formats)

        # Candidate description fields, passed to dict_get in this order.
        description_keys = (
            'descriptscription', 'description',
            'shortDescriptscription', 'shortDescription')

        # Prefer the cover image; use the poster when no cover is given.
        thumbnail = film.get('coverUrl')
        if not thumbnail:
            thumbnail = film.get('posterUrl')

        return {
            'id': video_id,
            'title': title,
            'description': dict_get(film, description_keys),
            'thumbnail': thumbnail,
            'duration': int_or_none(film.get('duration')),
            'age_limit': int_or_none(film.get('restrictionAge')),
            'formats': formats,
        }