[youtube] fix extraction for embed restricted live streams (fixes #16433)
[youtube-dl] / youtube_dl / extractor / mediaset.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     determine_ext,
10     parse_duration,
11     try_get,
12     unified_strdate,
13 )
14
15
class MediasetIE(InfoExtractor):
    """Information extractor for videos served from video.mediaset.it.

    ``_VALID_URL`` accepts three URL shapes: the internal ``mediaset:<id>``
    form, regular ``video`` / ``on-demand`` pages whose path ends in
    ``_<id>``, and the ``playerIFrame`` / ``playerIFrameTwitter`` embed pages
    that carry the id in their ``id=`` query parameter.  In every case the
    numeric id is captured by the ``id`` group.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        mediaset:|
                        https?://
                            (?:www\.)?video\.mediaset\.it/
                            (?:
                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
                                player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
                            )
                    )(?P<id>[0-9]+)
                    '''
    _TESTS = [{
        # full episode
        'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
        'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
        'info_dict': {
            'id': '661824',
            'ext': 'mp4',
            'title': 'Quarta puntata',
            'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1414,
            'creator': 'mediaset',
            'upload_date': '20161107',
            'series': 'Hello Goodbye',
            'categories': ['reality'],
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # clip
        'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
        'only_matching': True,
    }, {
        # iframe simple
        'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
        'only_matching': True,
    }, {
        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
        'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
        'only_matching': True,
    }, {
        'url': 'mediaset:661824',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the Mediaset player iframe URLs embedded in *webpage*.

        Scans the page text for ``<iframe>`` tags whose quoted ``src`` points
        at a ``playerIFrame`` / ``playerIFrameTwitter`` page with an ``id=``
        parameter, and returns the matched ``src`` values in page order as a
        list (empty when nothing matches).
        """
        # The opening quote is captured as group 1 and back-referenced so the
        # URL ends at the matching closing quote.
        iframe_src_re = r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1'

        embed_urls = []
        for match in re.finditer(iframe_src_re, webpage):
            embed_urls.append(match.group('url'))
        return embed_urls

    def _real_extract(self, url):
        """Build the info dict for the Mediaset video referenced by *url*.

        Two JSON documents are fetched, in this order: the CDN listing (whose
        ``videoList`` entry supplies the media URLs) and the metadata document
        (whose ``video`` entry supplies title, description and so on).  The
        ``videoList``, ``video`` and ``title`` keys are read with plain
        subscripting, so a response lacking any of them raises ``KeyError``.
        """
        video_id = self._match_id(url)

        # Step 1: ask the CDN selector which media URLs exist for this id.
        cdn_query = {
            'streamid': video_id,
            'format': 'json',
        }
        cdn_response = self._download_json(
            'http://cdnsel01.mediaset.net/GetCdn.aspx',
            video_id, 'Downloading video CDN JSON', query=cdn_query)
        video_list = cdn_response['videoList']

        # Step 2: turn every listed URL into one or more format entries.
        formats = []
        for media_url in video_list:
            if '.ism' not in media_url:
                # Plain media URL: expose it directly, labelled by extension.
                formats.append({
                    'url': media_url,
                    'format_id': determine_ext(media_url),
                })
                continue
            # '.ism' URLs are delegated to the ISM helper (presumably Smooth
            # Streaming manifests - confirm against InfoExtractor).
            ism_formats = self._extract_ism_formats(
                media_url, video_id, ism_id='mss', fatal=False)
            formats.extend(ism_formats)
        self._sort_formats(formats)

        # Step 3: fetch the descriptive metadata for the same id.
        metainfo_response = self._download_json(
            'http://plr.video.mediaset.it/html/metainfo.sjson',
            video_id, 'Downloading video info JSON', query={
                'id': video_id,
            })
        mediainfo = metainfo_response['video']

        title = mediainfo['title']

        # Optional nested fields; try_get is given compat_str as the expected
        # type for both lookups.
        creator = try_get(
            mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
        category = try_get(
            mediainfo, lambda x: x['brand-info']['category'], compat_str)
        categories = None
        if category:
            categories = [category]

        info = {
            'id': video_id,
            'title': title,
            'description': mediainfo.get('short-description'),
            'thumbnail': mediainfo.get('thumbnail'),
            'duration': parse_duration(mediainfo.get('duration')),
            'creator': creator,
            'upload_date': unified_strdate(mediainfo.get('production-date')),
            'webpage_url': mediainfo.get('url'),
            'series': mediainfo.get('brand-value'),
            'categories': categories,
            'formats': formats,
        }
        return info