_ Git - youtube-dl/blob - youtube_dl/extractor/msn.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5 from .common import InfoExtractor
   6
   7 from ..utils import (
   8     unescapeHTML,
   9     int_or_none,
  10 )
  11
  12 class MSNIE(InfoExtractor):
  13     _VALID_URL = r'https?://(?:www\.)?msn\.com/[a-z-]{2,5}(?:/[a-z]+)+/(?P<display_id>[a-z-]+)/[a-z]{2}-(?P<id>[a-zA-Z]+)'
  14     _TESTS = [{
  15         'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
  16         'info_dict': {
  17             'id': 'BBqQYNE',
  18             'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
  19             'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
  20             'duration': 104,
  21             'ext': 'mp4',
  22         },
  23         'params': {
  24             # m3u8 download
  25             'skip_download': True,
  26         }
  27     }, {
  28         'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
  29         'info_dict': {
  30             'id': 'BBt6ZKf',
  31             'title': 'All That Bling: Self-Made Millionaire Child Builds Fashion & Jewellery Empire',
  32             'description': 'md5:8e683bd5c729d5fb16d96539a582aa5e',
  33             'duration': 350,
  34             'ext': 'mp4',
  35         },
  36         'params': {
  37             # m3u8 download
  38             'skip_download': True,
  39         }
  40     }]
  41
  42     def _real_extract(self, url):
  43         mobj = re.match(self._VALID_URL, url)
  44         video_id, display_id = mobj.group('id', 'display_id')
  45
  46         webpage = self._download_webpage(url, display_id)
  47
  48         self.report_extraction(display_id)
  49         video_data = self._parse_json(self._html_search_regex(r'data-metadata\s*=\s*["\'](.+)["\']',
  50             webpage, 'video data'), display_id)
  51
  52         formats = []
  53         for video_file in video_data.get('videoFiles', []):
  54             if not '.ism' in video_file.get('url', '.ism'):
  55                 formats.append({
  56                     'url': unescapeHTML(video_file.get('url')),
  57                     'ext': 'mp4',
  58                     'width': int_or_none(video_file.get('width')),
  59                     'height': int_or_none(video_file.get('height')),
  60                 })
  61             elif 'm3u8' in video_file.get('url'):
  62                 formats.extend(self._extract_m3u8_formats(
  63                     video_file.get('url'), display_id, 'mp4'))
  64             # There (often) exists an Microsoft Smooth Streaming manifest
  65             # (.ism) which is not yet supported
  66             # (https://github.com/rg3/youtube-dl/issues/8118)
  67
  68         self._sort_formats(formats)
  69
  70         subtitles = {}
  71         for f in video_data.get('files', []):
  72             if f.get('formatCode', '') == '3100':
  73                 lang = f.get('culture', '')
  74                 if not lang:
  75                     continue
  76                 subtitles.setdefault(lang, []).append({
  77                     'ext': 'ttml',
  78                     'url': unescapeHTML(f.get('url')),
  79                 })
  80
  81         return {
  82             'id': video_id,
  83             'title': video_data['title'],
  84             'formats': formats,
  85             'thumbnail': video_data.get('headlineImage', {}).get('url'),
  86             'description': video_data.get('description'),
  87             'creator': video_data.get('creator'),
  88             'subtitles': subtitles,
  89             'duration': int_or_none(video_data.get('durationSecs')),
  90         }