[expressen] Add extractor
[youtube-dl] / youtube_dl / extractor / expressen.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5 from ..utils import (
6     determine_ext,
7     int_or_none,
8     unescapeHTML,
9     unified_timestamp,
10 )
11
12
class ExpressenIE(InfoExtractor):
    """Extractor for video pages located under expressen.se/tv/."""

    # Any number of intermediate path segments may follow /tv/; the final
    # non-empty segment is captured as the `id` group.
    _VALID_URL = r'https?://(?:www\.)?expressen\.se/tv/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        # Full extraction test with the expected metadata.
        'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/',
        'md5': '2fbbe3ca14392a6b1b36941858d33a45',
        'info_dict': {
            'id': '8690962',
            'ext': 'mp4',
            'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden',
            'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 788,
            'timestamp': 1526639109,
            'upload_date': '20180518',
        },
    }, {
        # URL-matching check only.
        'url': 'https://www.expressen.se/tv/kultur/kulturdebatt-med-expressens-karin-olsson/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video embedded in the page at *url*.

        Two JSON payloads are read from ``data-*`` attributes of the page:
        ``data-video-tracking-info`` (video id and most metadata) and
        ``data-article-data`` (stream URL and fallback metadata).

        Raises KeyError when ``videoId`` or ``stream`` is missing, or when
        neither ``titleRaw`` nor ``title`` is available. Errors raised by
        the download/search/parse helpers are not handled here.
        """
        display_id = self._match_id(url)
        page = self._download_webpage(url, display_id)

        def load_data_attribute(attr_name):
            # Capture the quoted value of data-<attr_name>; the backreference
            # requires the closing quote to be the same as the opening one.
            attr_re = r'data-%s=(["\'])(?P<value>(?:(?!\1).)+)\1' % attr_name
            raw_value = self._search_regex(
                attr_re, page, 'info', group='value')
            # The attribute value is HTML-escaped, so it is unescaped before
            # being handed to the JSON parser.
            return self._parse_json(
                raw_value, display_id, transform_source=unescapeHTML)

        tracking = load_data_attribute('video-tracking-info')
        video_id = tracking['videoId']

        article = load_data_attribute('article-data')
        stream_url = article['stream']

        if determine_ext(stream_url) != 'm3u8':
            # Not a playlist: expose the stream URL as the single format.
            formats = [{'url': stream_url}]
        else:
            formats = self._extract_m3u8_formats(
                stream_url, display_id, 'mp4',
                entry_protocol='m3u8_native', m3u8_id='hls')
        self._sort_formats(formats)

        # Tracking info is preferred; article data is the fallback. Only the
        # title is mandatory, every other field may end up as None.
        title = tracking.get('titleRaw') or article['title']
        description = tracking.get('descriptionRaw')
        thumbnail = tracking.get('socialMediaImage') or article.get('image')
        raw_duration = (
            tracking.get('videoTotalSecondsDuration') or
            article.get('totalSecondsDuration'))
        duration = int_or_none(raw_duration)
        timestamp = unified_timestamp(tracking.get('publishDate'))

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'duration': duration,
            'timestamp': timestamp,
            'formats': formats,
        }