git.bitcoin.ninja Git - youtube-dl/blob - youtube_dl/extractor/mediaset.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_str
   8 from ..utils import (
   9     determine_ext,
  10     parse_duration,
  11     try_get,
  12     unified_strdate,
  13     ExtractorError
  14 )
  15
  16
  17 class MediasetIE(InfoExtractor):
  18     _VALID_URL = r'''(?x)
  19                     (?:
  20                         mediaset:|
  21                         https?://
  22                             (?:www\.)?video\.mediaset\.it/
  23                             (?:
  24                                 (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
  25                                 player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
  26                             )
  27                     )(?P<id>[0-9]+)
  28                     '''
  29     _TESTS = [{
  30         # full episode
  31         'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
  32         'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
  33         'info_dict': {
  34             'id': '661824',
  35             'ext': 'mp4',
  36             'title': 'Quarta puntata',
  37             'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
  38             'thumbnail': r're:^https?://.*\.jpg$',
  39             'duration': 1414,
  40             'creator': 'mediaset',
  41             'upload_date': '20161107',
  42             'series': 'Hello Goodbye',
  43             'categories': ['reality'],
  44         },
  45         'expected_warnings': ['is not a supported codec'],
  46     }, {
  47         'url': 'http://www.video.mediaset.it/video/matrix/full_chiambretti/puntata-del-25-maggio_846685.html',
  48         'md5': '1276f966ac423d16ba255ce867de073e',
  49         'info_dict': {
  50             'id': '846685',
  51             'ext': 'mp4',
  52             'title': 'Puntata del 25 maggio',
  53             'description': 'md5:ee2e456e3eb1dba5e814596655bb5296',
  54             'thumbnail': r're:^https?://.*\.jpg$',
  55             'duration': 6565,
  56             'creator': 'mediaset',
  57             'upload_date': '20180525',
  58             'series': 'Matrix',
  59             'categories': ['infotainment'],
  60         },
  61         'expected_warnings': ['is not a supported codec'],
  62     }, {
  63         # clip
  64         'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
  65         'only_matching': True,
  66     }, {
  67         # iframe simple
  68         'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
  69         'only_matching': True,
  70     }, {
  71         # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
  72         'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
  73         'only_matching': True,
  74     }, {
  75         'url': 'mediaset:661824',
  76         'only_matching': True,
  77     }]
  78
  79     @staticmethod
  80     def _extract_urls(webpage):
  81         return [
  82             mobj.group('url')
  83             for mobj in re.finditer(
  84                 r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1',
  85                 webpage)]
  86
  87     def _real_extract(self, url):
  88         video_id = self._match_id(url)
  89
  90         media_info = self._download_json(
  91             'https://www.video.mediaset.it/html/metainfo.sjson',
  92             video_id, 'Downloading media info', query={
  93                 'id': video_id
  94             })['video']
  95
  96         media_id = try_get(media_info, lambda x: x['guid']) or video_id
  97
  98         video_list = self._download_json(
  99             'http://cdnsel01.mediaset.net/GetCdn2018.aspx',
 100             video_id, 'Downloading video CDN JSON', query={
 101                 'streamid': media_id,
 102                 'format': 'json',
 103             })['videoList']
 104
 105         formats = []
 106         for format_url in video_list:
 107             if '.ism' in format_url:
 108                 try:
 109                     formats.extend(self._extract_ism_formats(
 110                         format_url, video_id, ism_id='mss', fatal=False))
 111                 except ExtractorError:
 112                     pass
 113             else:
 114                 formats.append({
 115                     'url': format_url,
 116                     'format_id': determine_ext(format_url),
 117                 })
 118         self._sort_formats(formats)
 119
 120         title = media_info['title']
 121
 122         creator = try_get(
 123             media_info, lambda x: x['brand-info']['publisher'], compat_str)
 124         category = try_get(
 125             media_info, lambda x: x['brand-info']['category'], compat_str)
 126         categories = [category] if category else None
 127
 128         return {
 129             'id': video_id,
 130             'title': title,
 131             'description': media_info.get('short-description'),
 132             'thumbnail': media_info.get('thumbnail'),
 133             'duration': parse_duration(media_info.get('duration')),
 134             'creator': creator,
 135             'upload_date': unified_strdate(media_info.get('production-date')),
 136             'webpage_url': media_info.get('url'),
 137             'series': media_info.get('brand-value'),
 138             'categories': categories,
 139             'formats': formats,
 140         }