[mediaset] Add support for new videos
[youtube-dl] / youtube_dl / extractor / mediaset.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     determine_ext,
10     parse_duration,
11     try_get,
12     unified_strdate,
13     ExtractorError
14 )
15
16
class MediasetIE(InfoExtractor):
    """Information extractor for videos hosted on video.mediaset.it."""

    # Three URL shapes are accepted, each ending in the numeric video id:
    #   * the internal ``mediaset:<id>`` form,
    #   * video / on-demand page URLs (id follows an underscore in the slug),
    #   * player iframe URLs (id passed as the ``id`` query parameter).
    _VALID_URL = r'''(?x)
                    (?:
                        mediaset:|
                        https?://
                            (?:www\.)?video\.mediaset\.it/
                            (?:
                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
                                player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
                            )
                    )(?P<id>[0-9]+)
                    '''
    _TESTS = [{
        # full episode
        'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
        'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
        'info_dict': {
            'id': '661824',
            'ext': 'mp4',
            'title': 'Quarta puntata',
            'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1414,
            'creator': 'mediaset',
            'upload_date': '20161107',
            'series': 'Hello Goodbye',
            'categories': ['reality'],
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # second full episode, different series
        'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html',
        'md5': '1276f966ac423d16ba255ce867de073e',
        'info_dict': {
            'id': '846685',
            'ext': 'mp4',
            'title': 'Puntata del 25 maggio',
            'description': 'md5:ee2e456e3eb1dba5e814596655bb5296',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 6565,
            'creator': 'mediaset',
            'upload_date': '20180525',
            'series': 'Matrix',
            'categories': ['infotainment'],
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # clip
        'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
        'only_matching': True,
    }, {
        # iframe simple
        'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
        'only_matching': True,
    }, {
        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
        'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
        'only_matching': True,
    }, {
        # internal scheme
        'url': 'mediaset:661824',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the Mediaset player iframe URLs found in *webpage*.

        Every ``<iframe>`` whose ``src`` points at the video.mediaset.it
        ``playerIFrame``/``playerIFrameTwitter`` page with an ``id`` query
        parameter contributes its ``src`` value, in document order.
        """
        # The opening quote is captured so that the URL ends at the
        # matching closing quote (back-reference \1).
        iframe_src_re = r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1'
        embed_urls = []
        for match in re.finditer(iframe_src_re, webpage):
            embed_urls.append(match.group('url'))
        return embed_urls

    def _real_extract(self, url):
        """Build the info dict for the Mediaset video referenced by *url*.

        The video metadata is downloaded first, then the CDN selector is
        queried for the list of stream URLs, which are turned into formats.
        """
        video_id = self._match_id(url)

        metainfo = self._download_json(
            'https://www.video.mediaset.it/html/metainfo.sjson',
            video_id, 'Downloading media info',
            query={'id': video_id})
        media_info = metainfo['video']

        # The CDN selector is queried with the metadata ``guid`` when one
        # is available; otherwise the id taken from the URL is used.
        stream_id = try_get(media_info, lambda info: info['guid']) or video_id

        cdn_response = self._download_json(
            'http://cdnsel01.mediaset.net/GetCdn2018.aspx',
            video_id, 'Downloading video CDN JSON',
            query={
                'streamid': stream_id,
                'format': 'json',
            })

        formats = []
        for stream_url in cdn_response['videoList']:
            if '.ism' not in stream_url:
                # Plain URL: expose it directly, labelled by its extension.
                formats.append({
                    'url': stream_url,
                    'format_id': determine_ext(stream_url),
                })
                continue
            try:
                formats.extend(self._extract_ism_formats(
                    stream_url, video_id, ism_id='mss', fatal=False))
            except ExtractorError:
                # Best effort: an unusable .ism manifest must not prevent
                # the remaining stream URLs from being processed.
                continue
        self._sort_formats(formats)

        title = media_info['title']

        def brand_field(name):
            # String-valued entry of the nested 'brand-info' mapping,
            # or None when it is missing or not a string.
            return try_get(
                media_info, lambda info: info['brand-info'][name], compat_str)

        creator = brand_field('publisher')
        category = brand_field('category')
        if category:
            categories = [category]
        else:
            categories = None

        info = {
            'id': video_id,
            'title': title,
            'description': media_info.get('short-description'),
            'thumbnail': media_info.get('thumbnail'),
            'duration': parse_duration(media_info.get('duration')),
            'creator': creator,
            'upload_date': unified_strdate(media_info.get('production-date')),
            'webpage_url': media_info.get('url'),
            'series': media_info.get('brand-value'),
            'categories': categories,
            'formats': formats,
        }
        return info