[MSN] add new extractor
[youtube-dl] / youtube_dl / extractor / msn.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5 from .common import InfoExtractor
6
7 from ..utils import (
8     unescapeHTML,
9     int_or_none,
10 )
11
class MSNIE(InfoExtractor):
    """Information extractor for video pages hosted on msn.com.

    The page embeds a JSON document in a ``data-metadata`` HTML attribute;
    that document supplies the title, the list of media files and the
    subtitle files used to build the result.
    """

    # The id may contain digits (e.g. ``BBt6ZKf``), so both the slug and the
    # id character classes accept them.
    _VALID_URL = r'https?://(?:www\.)?msn\.com/[a-z-]{2,5}(?:/[a-z]+)+/(?P<display_id>[\da-z-]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
    _TESTS = [{
        'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
        'info_dict': {
            'id': 'BBqQYNE',
            'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
            'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
            'duration': 104,
            'ext': 'mp4',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }, {
        'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
        'info_dict': {
            'id': 'BBt6ZKf',
            'title': 'All That Bling: Self-Made Millionaire Child Builds Fashion & Jewellery Empire',
            'description': 'md5:8e683bd5c729d5fb16d96539a582aa5e',
            'duration': 350,
            'ext': 'mp4',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }]

    def _real_extract(self, url):
        """Build the info dict for the MSN page at *url*.

        Downloads the page, parses the JSON stored in its ``data-metadata``
        attribute and turns the ``videoFiles`` / ``files`` entries into
        formats and subtitles.

        Returns a dict with ``id``, ``title``, ``formats``, ``thumbnail``,
        ``description``, ``creator``, ``subtitles`` and ``duration``.
        ``title`` is read with a plain subscript, so a metadata document
        without it raises ``KeyError``.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id, display_id = mobj.group('id', 'display_id')

        webpage = self._download_webpage(url, display_id)

        self.report_extraction(display_id)
        # Match up to the *same* quote character that opened the attribute
        # (non-greedy), so further attributes on the same line are not
        # swallowed into the JSON payload.
        video_data = self._parse_json(
            self._html_search_regex(
                r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
                webpage, 'video data', group='data'),
            display_id)

        formats = []
        for video_file in video_data.get('videoFiles') or []:
            format_url = video_file.get('url')
            if not format_url:
                # Nothing to download for an entry without a URL.
                continue
            if '.ism' not in format_url:
                # Direct media file.
                formats.append({
                    'url': unescapeHTML(format_url),
                    'ext': 'mp4',
                    'width': int_or_none(video_file.get('width')),
                    'height': int_or_none(video_file.get('height')),
                })
            elif 'm3u8' in format_url:
                # HLS rendition exposed through an .ism manifest URL.
                formats.extend(self._extract_m3u8_formats(
                    format_url, display_id, 'mp4'))
            # There (often) exists a Microsoft Smooth Streaming manifest
            # (.ism) which is not yet supported
            # (https://github.com/rg3/youtube-dl/issues/8118)

        self._sort_formats(formats)

        subtitles = {}
        for f in video_data.get('files') or []:
            # Entries with format code '3100' are exposed as TTML subtitles.
            if f.get('formatCode', '') != '3100':
                continue
            lang = f.get('culture', '')
            sub_url = f.get('url')
            if not lang or not sub_url:
                continue
            subtitles.setdefault(lang, []).append({
                'ext': 'ttml',
                'url': unescapeHTML(sub_url),
            })

        # 'headlineImage' may be absent or null; treat both as "no thumbnail".
        headline_image = video_data.get('headlineImage') or {}

        return {
            'id': video_id,
            'title': video_data['title'],
            'formats': formats,
            'thumbnail': headline_image.get('url'),
            'description': video_data.get('description'),
            'creator': video_data.get('creator'),
            'subtitles': subtitles,
            'duration': int_or_none(video_data.get('durationSecs')),
        }