[heise] Add new extractor
[youtube-dl] / youtube_dl / extractor / heise.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import (
8     ExtractorError,
9     compat_urllib_parse,
10     get_meta_content,
11     parse_iso8601,
12 )
13
14
class HeiseIE(InfoExtractor):
    """Extractor for video article pages under ``heise.de/video/artikel/``.

    The article page embeds a quoted URL whose query string carries the
    parameters captured by ``_CONFIG``.  Those parameters are re-encoded onto
    ``_PREFIX`` to fetch a JSON document that is expected to provide
    ``formats`` (container -> resolution -> ``{'url': ...}``) and optionally
    ``title`` and ``poster``.
    """

    _VALID_URL = (
        r'^https?://(?:www\.)?heise\.de/video/artikel/' +
        r'.+?(?P<id>[0-9]+)\.html$'
    )
    _TEST = {
        'url': (
            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' +
            'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
        ),
        'md5': 'ffed432483e922e88545ad9f2f15d30e',
        'info_dict': {
            'id': '2404147',
            'ext': 'mp4',
            'title': (
                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " +
                "Peilsender Smartphone"
            ),
            'format_id': 'mp4_720',
            'timestamp': 1411812600,
            'upload_date': '20140927',
        }
    }

    # Matches a double-quoted URL in the page whose query string contains
    # ``sequenz`` and ``container``, optionally ``hd`` and ``signature``, and
    # ends in ``callback=?``.
    _CONFIG = (
        r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' +
        r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"'
    )
    # Base of the JSON info URL; the captured parameters are appended to it.
    _PREFIX = 'http://www.heise.de/videout/info?'

    # This module enables ``unicode_literals``, so ``type('')`` is ``unicode``
    # on Python 2 and ``str`` on Python 3.  Checking against the native ``str``
    # alone would reject text values on Python 2, so both are accepted.
    _STRING_TYPES = (str, type(''))

    def _warn(self, fmt, *args):
        """Report a warning for the video currently being extracted.

        ``fmt`` is a ``str.format`` template that is filled with ``args``.
        Relies on ``self._id`` having been set by ``_real_extract``.
        """
        self.report_warning(fmt.format(*args), self._id)

    def _parse_config_url(self, html):
        """Return the JSON info URL derived from the article page ``html``.

        Raises ExtractorError if ``_CONFIG`` does not match anywhere in
        ``html``.
        """
        m = re.search(self._CONFIG, html)
        if not m:
            raise ExtractorError('No config found')

        # Emit the parameters in the order the groups appear in the pattern
        # so the resulting URL is the same on every run, rather than
        # depending on dict iteration order.  Unmatched optional groups are
        # left out.
        groupindex = m.re.groupindex
        query = [
            (name, m.group(name))
            for name in sorted(groupindex, key=groupindex.get)
            if m.group(name) is not None
        ]
        return self._PREFIX + compat_urllib_parse.urlencode(query)

    def _pick_title(self, html, config):
        """Return the title: page meta ``fulltitle``, else config ``title``.

        Falls back to the literal ``'heise'`` (with a warning) when neither
        source provides one.
        """
        title = get_meta_content('fulltitle', html)
        if title:
            return title
        if config.get('title'):
            return config['title']
        self._warn('title: not found')
        return 'heise'

    def _collect_formats(self, config):
        """Build the sorted list of format dicts from ``config['formats']``.

        Entries without a usable URL are skipped with a warning.  Raises
        ExtractorError if ``config`` has no mapping under ``formats``.
        """
        containers = config.get('formats')
        if not containers or not hasattr(containers, 'items'):
            raise ExtractorError('No formats found')

        formats = []
        for container, resolutions in containers.items():
            if not resolutions or not hasattr(resolutions, 'items'):
                self._warn('formats: {0}: no resolutions', container)
                continue

            for res, obj in resolutions.items():
                format_id = '{0}_{1}'.format(container, res)

                # Guard against non-mapping entries so that malformed data
                # is skipped with a warning instead of raising.
                video_url = obj.get('url') if hasattr(obj, 'get') else None
                if (not video_url or
                        not isinstance(video_url, self._STRING_TYPES)):
                    self._warn('formats: {0}: no url', format_id)
                    continue

                fmt = {
                    'url': video_url,
                    'format_id': format_id,
                }
                # The resolution key doubles as the height when numeric; a
                # non-numeric key only costs the ``height`` field.
                try:
                    fmt['height'] = int(res)
                except ValueError as e:
                    self._warn('formats: {0}: height: {1}', format_id, e)

                formats.append(fmt)

        self._sort_formats(formats)
        return formats

    def _real_extract(self, url):
        """Extract id, title, formats and optional thumbnail/timestamp.

        Raises ExtractorError if the page carries no config URL or the
        config lists no formats.
        """
        mobj = re.match(self._VALID_URL, url)
        # Kept on the instance because ``_warn`` reads it.
        self._id = mobj.group('id')

        html = self._download_webpage(url, self._id)
        config = self._download_json(self._parse_config_url(html), self._id)

        info = {
            'id': self._id,
            'title': self._pick_title(html, config),
            'formats': self._collect_formats(config),
        }

        poster = config.get('poster')
        if poster and isinstance(poster, self._STRING_TYPES):
            info['thumbnail'] = poster

        date = get_meta_content('date', html)
        if date and isinstance(date, self._STRING_TYPES):
            try:
                info['timestamp'] = parse_iso8601(date)
            except ValueError as e:
                self._warn('timestamp: {0}', e)

        return info