[crackle] Fix extraction (closes #15969)
[youtube-dl] / youtube_dl / extractor / crackle.py
1 # coding: utf-8
2 from __future__ import unicode_literals, division
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     determine_ext,
10     float_or_none,
11     int_or_none,
12     parse_age_limit,
13     parse_duration,
14 )
15
16
class CrackleIE(InfoExtractor):
    """Extractor for Crackle media pages and ``crackle:<id>`` references.

    Metadata is fetched from the Crackle web API ``details/media`` endpoint
    and mapped onto the info dict returned by ``_real_extract``.
    """

    # The first entry doubles as the country code in the API URL built in
    # ``_real_extract``.
    _GEO_COUNTRIES = ['US']
    _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
    _TEST = {
        'url': 'https://www.crackle.com/andromeda/2502343',
        'info_dict': {
            'id': '2502343',
            'ext': 'mp4',
            'title': 'Under The Night',
            'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a',
            'duration': 2583,
            'view_count': int,
            'average_rating': 0,
            'age_limit': 14,
            'genre': 'Action, Sci-Fi',
            'creator': 'Allan Kroeker',
            'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe',
            'release_year': 2000,
            'series': 'Andromeda',
            'episode': 'Under The Night',
            'season_number': 1,
            'episode_number': 1,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the media item referenced by *url*.

        The numeric media id is taken from *url*, the media details JSON is
        downloaded, and formats, subtitles, thumbnails and descriptive
        metadata are derived from it.

        Returns:
            A dict with ``id``, ``title``, ``formats``, ``subtitles``,
            ``thumbnails`` and the optional metadata fields listed in the
            return statement; optional fields are ``None`` when absent.

        Raises:
            KeyError: if the response lacks ``Title`` or ``MediaURLs``.
            Errors raised by the inherited download/sort helpers propagate
            unchanged.
        """
        video_id = self._match_id(url)

        media = self._download_json(
            'https://web-api-us.crackle.com/Service.svc/details/media/%s/%s'
            % (video_id, self._GEO_COUNTRIES[0]), video_id, query={
                'disableProtocols': 'true',
                'format': 'json'
            })

        title = media['Title']

        formats = []
        for e in media['MediaURLs']:
            # Entries explicitly flagged as DRM-protected are skipped.
            if e.get('UseDRM') is True:
                continue
            format_url = e.get('Path')
            if not format_url or not isinstance(format_url, compat_str):
                continue
            # Only HLS and DASH manifests are handled; any other extension
            # is ignored.
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id='hls', fatal=False))
            elif ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    format_url, video_id, mpd_id='dash', fatal=False))
        self._sort_formats(formats)

        description = media.get('Description')
        # Prefer the numeric duration; fall back to parsing the textual one.
        duration = int_or_none(media.get(
            'DurationInSeconds')) or parse_duration(media.get('Duration'))
        view_count = int_or_none(media.get('CountViews'))
        average_rating = float_or_none(media.get('UserRating'))
        age_limit = parse_age_limit(media.get('Rating'))
        genre = media.get('Genre')
        release_year = int_or_none(media.get('ReleaseYear'))
        creator = media.get('Directors')
        artist = media.get('Cast')

        # Series/episode fields are only populated for full episodes.
        if media.get('MediaTypeDisplayValue') == 'Full Episode':
            series = media.get('ShowName')
            episode = title
            season_number = int_or_none(media.get('Season'))
            episode_number = int_or_none(media.get('Episode'))
        else:
            series = episode = season_number = episode_number = None

        subtitles = {}
        cc_files = media.get('ClosedCaptionFiles')
        if isinstance(cc_files, list):
            for cc_file in cc_files:
                if not isinstance(cc_file, dict):
                    continue
                cc_url = cc_file.get('Path')
                if not cc_url or not isinstance(cc_url, compat_str):
                    continue
                # Captions without a locale are filed under 'en'.
                lang = cc_file.get('Locale') or 'en'
                subtitles.setdefault(lang, []).append({'url': cc_url})

        thumbnails = []
        images = media.get('Images')
        # The loop below needs a mapping of ``Img_<width>x<height>`` keys to
        # URLs (it calls ``.items()``), so guard on dict: guarding on list
        # would raise AttributeError for lists and skip every usable payload.
        if isinstance(images, dict):
            for image_key, image_url in images.items():
                # Same URL sanity check as for formats and captions.
                if not image_url or not isinstance(image_url, compat_str):
                    continue
                mobj = re.search(r'Img_(\d+)[xX](\d+)', image_key)
                if not mobj:
                    continue
                thumbnails.append({
                    'url': image_url,
                    'width': int(mobj.group(1)),
                    'height': int(mobj.group(2)),
                })

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'duration': duration,
            'view_count': view_count,
            'average_rating': average_rating,
            'age_limit': age_limit,
            'genre': genre,
            'creator': creator,
            'artist': artist,
            'release_year': release_year,
            'series': series,
            'episode': episode,
            'season_number': season_number,
            'episode_number': episode_number,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
            'formats': formats,
        }