[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / zype.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_HTTPError
8 from ..utils import (
9     dict_get,
10     ExtractorError,
11     int_or_none,
12     js_to_json,
13     parse_iso8601,
14 )
15
16
class ZypeIE(InfoExtractor):
    """Extractor for Zype embed player URLs.

    Matches ``player.zype.com/embed/<id>.(js|json|html)`` URLs that carry an
    ``access_token``, ``api_key``, ``app_key`` or ``player_key`` query
    parameter, where ``<id>`` is a hexadecimal video id.
    """

    _ID_RE = r'[\da-fA-F]+'
    _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)='
    _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE))
    _TEST = {
        'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false',
        'md5': 'eaee31d474c76a955bdaba02a505c595',
        'info_dict': {
            'id': '5b400b834b32992a310622b9',
            'ext': 'mp4',
            'title': 'Smoky Barbecue Favorites',
            'thumbnail': r're:^https?://.*\.jpe?g',
            'description': 'md5:5ff01e76316bd8d46508af26dc86023b',
            'timestamp': 1504915200,
            'upload_date': '20170909',
        },
    }

    @staticmethod
    def _extract_urls(webpage):
        """Return the ``src`` URLs of all Zype embed ``<script>`` tags in *webpage*.

        The scheme is optional in the match, so protocol-relative URLs
        (``//player.zype.com/...``) are returned as found.
        """
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE),
                webpage)]

    def _real_extract(self, url):
        """Download the JSON variant of the embed URL and build the info dict.

        Raises ExtractorError (expected) carrying the API's ``message`` when
        the request fails with HTTP 400, 401 or 403; any other download error
        is re-raised unchanged.
        """
        video_id = self._match_id(url)

        try:
            # The .js/.html embed URLs are rewritten to the .json endpoint.
            response = self._download_json(re.sub(
                r'\.(?:js|html)\?', '.json?', url), video_id)['response']
        except ExtractorError as e:
            if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401, 403):
                # Decode explicitly as UTF-8: the implicit default codec is
                # ASCII on Python 2 and would fail on non-ASCII messages.
                raise ExtractorError(self._parse_json(
                    e.cause.read().decode('utf-8'), video_id)['message'], expected=True)
            raise

        body = response['body']
        video = response['video']
        title = video['title']

        if isinstance(body, dict):
            formats = []
            # `or []` also covers a key that is present but null.
            for output in body.get('outputs') or []:
                output_url = output.get('url')
                if not output_url:
                    continue
                name = output.get('name')
                if name == 'm3u8':
                    # Accumulate instead of re-assigning so that formats
                    # collected from earlier outputs are not thrown away;
                    # the `or []` keeps `formats` a list even if the
                    # non-fatal call yields a falsy result.
                    m3u8_formats = self._extract_m3u8_formats(
                        output_url, video_id, 'mp4',
                        'm3u8_native', m3u8_id='hls', fatal=False)
                    formats.extend(m3u8_formats or [])
                else:
                    f = {
                        'format_id': name,
                        'tbr': int_or_none(output.get('bitrate')),
                        'url': output_url,
                    }
                    if name in ('m4a', 'mp3'):
                        # Audio-only outputs.
                        f['vcodec'] = 'none'
                    else:
                        f.update({
                            'height': int_or_none(output.get('height')),
                            'width': int_or_none(output.get('width')),
                        })
                    formats.append(f)
            text_tracks = body.get('subtitles') or []
        else:
            # Non-dict body: treated as text from which the quoted m3u8 URL
            # and the optional `textTracks` array literal are scraped.
            m3u8_url = self._search_regex(
                r'(["\'])(?P<url>(?:(?!\1).)+\.m3u8(?:(?!\1).)*)\1',
                body, 'm3u8 url', group='url')
            formats = self._extract_m3u8_formats(
                m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
            text_tracks = self._search_regex(
                r'textTracks\s*:\s*(\[[^]]+\])',
                body, 'text tracks', default=None)
            if text_tracks:
                # Non-fatal parse of the JS literal after js_to_json.
                text_tracks = self._parse_json(
                    text_tracks, video_id, js_to_json, False)
        self._sort_formats(formats)

        subtitles = {}
        if text_tracks:
            for text_track in text_tracks:
                tt_url = dict_get(text_track, ('file', 'src'))
                if not tt_url:
                    continue
                # Tracks are keyed by their label; unlabeled ones fall
                # under 'English'.
                subtitles.setdefault(text_track.get('label') or 'English', []).append({
                    'url': tt_url,
                })

        thumbnails = []
        # `or []` also covers a key that is present but null.
        for thumbnail in video.get('thumbnails') or []:
            thumbnail_url = thumbnail.get('url')
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'width': int_or_none(thumbnail.get('width')),
                'height': int_or_none(thumbnail.get('height')),
            })

        return {
            'id': video_id,
            'display_id': video.get('friendly_title'),
            'title': title,
            'thumbnails': thumbnails,
            'description': dict_get(video, ('description', 'ott_description', 'short_description')),
            'timestamp': parse_iso8601(video.get('published_at')),
            'duration': int_or_none(video.get('duration')),
            'view_count': int_or_none(video.get('request_count')),
            'average_rating': int_or_none(video.get('rating')),
            'season_number': int_or_none(video.get('season')),
            'episode_number': int_or_none(video.get('episode')),
            'formats': formats,
            'subtitles': subtitles,
        }