168e5e90152b44d76dcbbbeb1b274db5dcbf5827
[youtube-dl] / youtube_dl / extractor / wistia.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     ExtractorError,
8     int_or_none,
9     float_or_none,
10     unescapeHTML,
11 )
12
13
class WistiaIE(InfoExtractor):
    """Extractor for media served from Wistia.

    Accepts direct embed URLs of the form
    ``http(s)://[fast.]wistia.(net|com)/embed/(iframe|medias)/<id>`` as well
    as the internal ``wistia:<id>`` pseudo-URL emitted by ``_extract_urls``.
    The media id is always ten lowercase alphanumeric characters.
    """

    _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]{10})'
    # Prefix shared by the metadata, iframe and captions endpoints.
    _EMBED_BASE_URL = 'http://fast.wistia.com/embed/'

    _TESTS = [{
        'url': 'http://fast.wistia.net/embed/iframe/sh7fpupwlt',
        'md5': 'cafeb56ec0c53c18c97405eecb3133df',
        'info_dict': {
            'id': 'sh7fpupwlt',
            'ext': 'mov',
            'title': 'Being Resourceful',
            'description': 'a Clients From Hell Video Series video from worldwidewebhosting',
            'upload_date': '20131204',
            'timestamp': 1386185018,
            'duration': 117,
        },
    }, {
        'url': 'wistia:sh7fpupwlt',
        'only_matching': True,
    }, {
        # with hls video
        'url': 'wistia:807fafadvk',
        'only_matching': True,
    }, {
        'url': 'http://fast.wistia.com/embed/iframe/sh7fpupwlt',
        'only_matching': True,
    }, {
        'url': 'http://fast.wistia.net/embed/medias/sh7fpupwlt.json',
        'only_matching': True,
    }]

    # https://wistia.com/support/embed-and-share/video-on-your-website
    @staticmethod
    def _extract_url(webpage):
        """Return the first Wistia URL found in ``webpage``, or None."""
        urls = WistiaIE._extract_urls(webpage)
        return urls[0] if urls else None

    @staticmethod
    def _extract_urls(webpage):
        """Collect every Wistia embed referenced by ``webpage``.

        Three kinds of markup are recognised, in this order:

        1. ``<meta content=...>``, ``<iframe src=...>`` and
           ``<script src=...>`` pointing at an embed URL; the URL itself is
           returned (HTML-unescaped).
        2. ``<div class="... wistia_async_<id> ...">`` containers; returned
           as ``wistia:<id>``.
        3. ``data-wistia-id``/``data-wistiaid`` attributes,
           ``Wistia.embed('<id>'`` calls and ``id="wistia_<id>"``
           attributes; returned as ``wistia:<id>``.

        Returns a list of strings (possibly empty). The same media may
        appear more than once if the page references it in several ways.
        """
        urls = []
        for match in re.finditer(
                r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage):
            urls.append(unescapeHTML(match.group('url')))
        # Group 1 is the opening quote of the class attribute and the named
        # group "id" is group 2, so the closing quote must be matched with a
        # backreference to group 1. The negative lookaheads keep the scan
        # inside the attribute value rather than running on through the page.
        for match in re.finditer(
                r'''(?sx)
                    <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1
                ''', webpage):
            urls.append('wistia:%s' % match.group('id'))
        for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage):
            urls.append('wistia:%s' % match.group('id'))
        return urls

    def _real_extract(self, url):
        """Build the info dict for the Wistia media identified by ``url``.

        Downloads ``medias/<id>.json`` from the embed endpoint and converts
        its ``assets`` list into formats and thumbnails, and its
        ``captions`` list into subtitles.

        Raises ExtractorError (expected) when the JSON response carries a
        truthy ``error`` field.
        """
        video_id = self._match_id(url)

        data_json = self._download_json(
            self._EMBED_BASE_URL + 'medias/%s.json' % video_id, video_id,
            # Some videos require this.
            headers={
                'Referer': url if url.startswith('http') else self._EMBED_BASE_URL + 'iframe/' + video_id,
            })

        if data_json.get('error'):
            raise ExtractorError(
                'Error while getting the playlist', expected=True)

        data = data_json['media']
        title = data['name']

        formats = []
        thumbnails = []
        for a in data['assets']:
            aurl = a.get('url')
            if not aurl:
                continue
            astatus = a.get('status')
            atype = a.get('type')
            # Skip assets whose status is present but not 2 (presumably
            # "ready" -- confirm against the Wistia API), and asset types
            # that are neither playable media nor thumbnails.
            if (astatus is not None and astatus != 2) or atype in ('preview', 'storyboard'):
                continue
            elif atype in ('still', 'still_image'):
                thumbnails.append({
                    'url': aurl,
                    'width': int_or_none(a.get('width')),
                    'height': int_or_none(a.get('height')),
                    'filesize': int_or_none(a.get('size')),
                })
            else:
                aext = a.get('ext')
                display_name = a.get('display_name')
                format_id = atype
                # e.g. type "hd_mp4_video" + display name "720p"
                # becomes "hd_mp4-720p" (strip the "_video" suffix).
                if atype and atype.endswith('_video') and display_name:
                    format_id = '%s-%s' % (atype[:-6], display_name)
                f = {
                    'format_id': format_id,
                    'url': aurl,
                    # A bitrate of 0 is reported as unknown.
                    'tbr': int_or_none(a.get('bitrate')) or None,
                    'preference': 1 if atype == 'original' else None,
                }
                if display_name == 'Audio':
                    f.update({
                        'vcodec': 'none',
                    })
                else:
                    f.update({
                        'width': int_or_none(a.get('width')),
                        'height': int_or_none(a.get('height')),
                        'vcodec': a.get('codec'),
                    })
                if a.get('container') == 'm3u8' or aext == 'm3u8':
                    # Each HLS asset yields two formats: a "ts" variant
                    # whose URL swaps the ".bin" suffix for ".ts", and the
                    # HLS playlist itself.
                    ts_f = f.copy()
                    ts_f.update({
                        'ext': 'ts',
                        'format_id': f['format_id'].replace('hls-', 'ts-'),
                        'url': f['url'].replace('.bin', '.ts'),
                    })
                    formats.append(ts_f)
                    f.update({
                        'ext': 'mp4',
                        'protocol': 'm3u8_native',
                    })
                else:
                    f.update({
                        'container': a.get('container'),
                        'ext': aext,
                        'filesize': int_or_none(a.get('size')),
                    })
                formats.append(f)

        self._sort_formats(formats)

        subtitles = {}
        # "captions" may be absent or null in the JSON; treat both as empty.
        for caption in data.get('captions') or []:
            language = caption.get('language')
            if not language:
                continue
            subtitles[language] = [{
                'url': self._EMBED_BASE_URL + 'captions/' + video_id + '.vtt?language=' + language,
            }]

        return {
            'id': video_id,
            'title': title,
            'description': data.get('seoDescription'),
            'formats': formats,
            'thumbnails': thumbnails,
            'duration': float_or_none(data.get('duration')),
            'timestamp': int_or_none(data.get('createdAt')),
            'subtitles': subtitles,
        }