[theintercept] Add new extractor
[youtube-dl] / youtube_dl / extractor / theintercept.py
1 # encoding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7
8 from ..utils import (
9     ExtractorError,
10 )
11
class TheInterceptIE(InfoExtractor):
    """Extractor for video pages under theintercept.com/fieldofvision/.

    The page embeds an ``initialStoreTree`` JSON blob. The post in that blob
    whose slug equals the URL's path component supplies the metadata, and the
    HLS formats are read from a JW Platform XML feed.
    """

    # The dots are escaped so they only match a literal '.' in the host name.
    _VALID_URL = r'https://theintercept\.com/fieldofvision/(?P<id>.+?)/'
    # NOTE(review): the test expects 'id' to equal the URL slug, while
    # _real_extract returns info['fov_videoid'] as 'id' and the slug as
    # 'display_id' -- confirm that the two values agree for this page.
    _TESTS = [{
        'url': 'https://theintercept.com/fieldofvision/thisisacoup-episode-four-surrender-or-die/',
        'info_dict': {
            'id': 'thisisacoup-episode-four-surrender-or-die',
            'ext': 'mp4',
            'title': '#ThisIsACoup – Episode Four: Surrender or Die',
            'upload_date': '20151218',
            'description': 'md5:74dd27f0e2fbd50817829f97eaa33140',
        }
    }]

    def _real_extract(self, url):
        """Build the info dict for the video page at ``url``.

        Returns a dict with the keys 'creator', 'description', 'display_id',
        'formats', 'id', 'thumbnail', 'title' and 'upload_date'.

        Raises ExtractorError when the page has no ``initialStoreTree``
        assignment or when no post in it has a slug equal to the id taken
        from the URL. Fields of the matching post are read with plain
        subscripts, so a missing field raises KeyError.
        """
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        # The greedy '.+}' runs to the last '}' on the line holding the
        # assignment ('.' does not match a newline here).
        mobj = re.search(r'initialStoreTree =(?P<json_data>.+})', webpage)
        if mobj is None:
            raise ExtractorError('Unable to extract initialStoreTree')
        json_data = self._parse_json(mobj.group('json_data'), display_id)

        # Pick the first post whose slug is the one found in the URL.
        info = next(
            (post for post in json_data['resources']['posts'].values()
             if post['slug'] == display_id),
            None)
        if info is None:
            raise ExtractorError('Unable to find info for %s' % display_id)

        title = info['title']
        description = info['excerpt']
        # Keep the first ten characters of the date and drop the dashes
        # (presumably 'YYYY-MM-DD...' -> 'YYYYMMDD'; confirm the format).
        upload_date = info['date'][:10].replace('-', '')
        video_id = info['fov_videoid']
        creator = ','.join(a['display_name'] for a in info['authors'])

        # The feed id is the og:image file name up to its first '.'.
        thumbnail = self._og_search_property('image', webpage)
        content_id = thumbnail.split('/')[-1].split('.')[0]
        content_url = 'https://content.jwplatform.com/jw6/%s.xml' % content_id
        content = self._download_xml(content_url, video_id)

        # Only sources whose 'file' ends in '.m3u8' contribute formats;
        # every other source in the feed is skipped.
        # NOTE(review): the formats are returned in extraction order --
        # confirm whether the base class expects an explicit sort here.
        formats = []
        for source in content.findall('.//{http://rss.jwpcdn.com/}source'):
            source_url = source.attrib['file']
            if source_url.endswith('.m3u8'):
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', preference=1, m3u8_id='hls'))

        return {
            'creator': creator,
            'description': description,
            'display_id': display_id,
            'formats': formats,
            'id': video_id,
            'thumbnail': thumbnail,
            'title': title,
            'upload_date': upload_date,
        }