_ Git - youtube-dl/blob - youtube_dl/extractor/theplatform.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5 import time
   6 import hmac
   7 import binascii
   8 import hashlib
   9
  10
  11 from .common import InfoExtractor
  12 from ..utils import (
  13     determine_ext,
  14     ExtractorError,
  15     xpath_with_ns,
  16     unsmuggle_url,
  17     int_or_none,
  18 )
  19
  20 default_ns = 'http://www.w3.org/2005/SMIL21/Language'
  21 _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
  22
  23
  24 class ThePlatformIE(InfoExtractor):
  25     _VALID_URL = r'''(?x)
  26         (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
  27            (?:(?P<media>(?:[^/]+/)+select/media/)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
  28          |theplatform:)(?P<id>[^/\?&]+)'''
  29
  30     _TESTS = [{
  31         # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/
  32         'url': 'http://link.theplatform.com/s/dJ5BDC/e9I_cZgTgIPd/meta.smil?format=smil&Tracking=true&mbr=true',
  33         'info_dict': {
  34             'id': 'e9I_cZgTgIPd',
  35             'ext': 'flv',
  36             'title': 'Blackberry\'s big, bold Z30',
  37             'description': 'The Z30 is Blackberry\'s biggest, baddest mobile messaging device yet.',
  38             'duration': 247,
  39         },
  40         'params': {
  41             # rtmp download
  42             'skip_download': True,
  43         },
  44     }, {
  45         # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/
  46         'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT',
  47         'info_dict': {
  48             'id': '22d_qsQ6MIRT',
  49             'ext': 'flv',
  50             'description': 'md5:ac330c9258c04f9d7512cf26b9595409',
  51             'title': 'Tesla Model S: A second step towards a cleaner motoring future',
  52         },
  53         'params': {
  54             # rtmp download
  55             'skip_download': True,
  56         }
  57     }, {
  58         'url': 'https://player.theplatform.com/p/D6x-PC/pulse_preview/embed/select/media/yMBg9E8KFxZD',
  59         'info_dict': {
  60             'id': 'yMBg9E8KFxZD',
  61             'ext': 'mp4',
  62             'description': 'md5:644ad9188d655b742f942bf2e06b002d',
  63             'title': 'HIGHLIGHTS: USA bag first ever series Cup win',
  64         }
  65     }, {
  66         'url': 'http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7',
  67         'only_matching': True,
  68     }]
  69
  70     @staticmethod
  71     def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
  72         flags = '10' if include_qs else '00'
  73         expiration_date = '%x' % (int(time.time()) + life)
  74
  75         def str_to_hex(str):
  76             return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
  77
  78         def hex_to_str(hex):
  79             return binascii.a2b_hex(hex)
  80
  81         relative_path = url.split('http://link.theplatform.com/s/')[1].split('?')[0]
  82         clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path))
  83         checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
  84         sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
  85         return '%s&sig=%s' % (url, sig)
  86
  87     def _real_extract(self, url):
  88         url, smuggled_data = unsmuggle_url(url, {})
  89
  90         mobj = re.match(self._VALID_URL, url)
  91         provider_id = mobj.group('provider_id')
  92         video_id = mobj.group('id')
  93
  94         if not provider_id:
  95             provider_id = 'dJ5BDC'
  96
  97         path = provider_id
  98         if mobj.group('media'):
  99             path += '/media'
 100         path += '/' + video_id
 101
 102         if smuggled_data.get('force_smil_url', False):
 103             smil_url = url
 104         elif mobj.group('config'):
 105             config_url = url + '&form=json'
 106             config_url = config_url.replace('swf/', 'config/')
 107             config_url = config_url.replace('onsite/', 'onsite/config/')
 108             config = self._download_json(config_url, video_id, 'Downloading config')
 109             if 'releaseUrl' in config:
 110                 release_url = config['releaseUrl']
 111             else:
 112                 release_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
 113             smil_url = release_url + '&format=SMIL&formats=MPEG4&manifest=f4m'
 114         else:
 115             smil_url = 'http://link.theplatform.com/s/%s/meta.smil?format=smil&mbr=true' % path
 116
 117         sig = smuggled_data.get('sig')
 118         if sig:
 119             smil_url = self._sign_url(smil_url, sig['key'], sig['secret'])
 120
 121         meta = self._download_xml(smil_url, video_id)
 122         try:
 123             error_msg = next(
 124                 n.attrib['abstract']
 125                 for n in meta.findall(_x('.//smil:ref'))
 126                 if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
 127         except StopIteration:
 128             pass
 129         else:
 130             raise ExtractorError(error_msg, expected=True)
 131
 132         info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
 133         info_json = self._download_webpage(info_url, video_id)
 134         info = json.loads(info_json)
 135
 136         subtitles = {}
 137         captions = info.get('captions')
 138         if isinstance(captions, list):
 139             for caption in captions:
 140                 lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
 141                 subtitles[lang] = [{
 142                     'ext': 'srt' if mime == 'text/srt' else 'ttml',
 143                     'url': src,
 144                 }]
 145
 146         formats = self._parse_smil_formats(
 147             meta, smil_url, video_id, namespace=default_ns,
 148             # the parameters are from syfy.com, other sites may use others,
 149             # they also work for nbc.com
 150             f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
 151             transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))
 152
 153         for _format in formats:
 154             ext = determine_ext(_format['url'])
 155             if ext == 'once':
 156                 _format['ext'] = 'mp4'
 157
 158         self._sort_formats(formats)
 159
 160         return {
 161             'id': video_id,
 162             'title': info['title'],
 163             'subtitles': subtitles,
 164             'formats': formats,
 165             'description': info['description'],
 166             'thumbnail': info['defaultThumbnailUrl'],
 167             'duration': int_or_none(info.get('duration'), 1000),
 168         }