[mediaset] Add support for shortcut
[youtube-dl] / youtube_dl / extractor / mediaset.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     determine_ext,
10     parse_duration,
11     try_get,
12     unified_strdate,
13 )
14
15
class MediasetIE(InfoExtractor):
    """Extractor for videos hosted on video.mediaset.it.

    Matches regular ``video`` / ``on-demand`` pages, the embeddable player
    iframes and the ``mediaset:<id>`` shortcut; in every case the numeric
    video id is captured from the URL as the ``id`` group.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        mediaset:|
                        https?://
                            (?:www\.)?video\.mediaset\.it/
                            (?:
                                (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
                                player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
                            )
                    )(?P<id>[0-9]+)
                    '''
    _TESTS = [{
        # full episode
        'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
        'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
        'info_dict': {
            'id': '661824',
            'ext': 'mp4',
            'title': 'Quarta puntata',
            'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 1414,
            'creator': 'mediaset',
            'release_date': '20161107',
            'series': 'Hello Goodbye',
            'categories': ['reality'],
        },
        'expected_warnings': ['is not a supported codec'],
    }, {
        # clip
        'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
        'only_matching': True,
    }, {
        # iframe simple
        'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
        'only_matching': True,
    }, {
        # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
        'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
        'only_matching': True,
    }, {
        # bare shortcut form: scheme-like prefix followed by the numeric id
        'url': 'mediaset:661824',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        Two JSON documents are fetched, in this order: the CDN selector's
        list of media URLs (converted into formats) and the metadata record
        for the video. The ``videoList``, ``video`` and ``title`` keys are
        read by direct subscript and are therefore mandatory; every other
        metadata field is optional.
        """
        video_id = self._match_id(url)

        # Step 1: ask the CDN selector which media URLs exist for this id.
        cdn_query = {
            'streamid': video_id,
            'format': 'json',
        }
        cdn_response = self._download_json(
            'http://cdnsel01.mediaset.net/GetCdn.aspx',
            video_id, 'Downloading video CDN JSON', query=cdn_query)

        formats = []
        for media_url in cdn_response['videoList']:
            if '.ism' not in media_url:
                # Direct media URL: use it as-is, labelled by its extension.
                formats.append({
                    'url': media_url,
                    'format_id': determine_ext(media_url),
                })
                continue
            # URLs containing '.ism' are expanded via the ISM helper;
            # fatal=False is passed so a failure here does not abort.
            ism_formats = self._extract_ism_formats(
                media_url, video_id, ism_id='mss', fatal=False)
            formats.extend(ism_formats)
        self._sort_formats(formats)

        # Step 2: fetch the descriptive metadata for the same id.
        meta_response = self._download_json(
            'http://plr.video.mediaset.it/html/metainfo.sjson',
            video_id, 'Downloading video info JSON', query={
                'id': video_id,
            })
        video_meta = meta_response['video']

        title = video_meta['title']

        def brand_info(field):
            # Optional string nested under 'brand-info'; try_get is asked to
            # accept only compat_str values.
            return try_get(
                video_meta, lambda x: x['brand-info'][field], compat_str)

        creator = brand_info('publisher')
        category = brand_info('category')
        categories = None
        if category:
            categories = [category]

        info = {
            'id': video_id,
            'title': title,
            'description': video_meta.get('short-description'),
            'thumbnail': video_meta.get('thumbnail'),
            'duration': parse_duration(video_meta.get('duration')),
            'creator': creator,
            'release_date': unified_strdate(video_meta.get('production-date')),
            'webpage_url': video_meta.get('url'),
            'series': video_meta.get('brand-value'),
            'categories': categories,
            'formats': formats,
        }
        return info