Merge remote-tracking branch 'd912e3/heise'
[youtube-dl] / youtube_dl / extractor / heise.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..utils import (
8     ExtractorError,
9     compat_urllib_parse,
10     get_meta_content,
11     parse_iso8601,
12 )
13
14
class HeiseIE(InfoExtractor):
    """Information extractor for video articles on heise.de.

    The article page embeds a player configuration URL.  Its query
    parameters are re-encoded onto ``_PREFIX`` and the JSON document
    downloaded from there supplies the formats, the poster image and a
    fallback title.
    """

    _VALID_URL = (
        r'^https?://(?:www\.)?heise\.de/video/artikel/' +
        r'.+?(?P<id>[0-9]+)\.html$'
    )
    _TEST = {
        'url': (
            'http://www.heise.de/video/artikel/Podcast-c-t-uplink-3-3-' +
            'Owncloud-Tastaturen-Peilsender-Smartphone-2404147.html'
        ),
        'md5': 'ffed432483e922e88545ad9f2f15d30e',
        'info_dict': {
            'id': '2404147',
            'ext': 'mp4',
            'title': (
                "Podcast: c't uplink 3.3 – Owncloud / Tastaturen / " +
                "Peilsender Smartphone"
            ),
            'format_id': 'mp4_720',
            'timestamp': 1411812600,
            'upload_date': '20140927',
        }
    }

    # Matches the quoted configuration URL inside the article HTML and
    # captures its query parameters; ``hd`` and ``signature`` are optional.
    _CONFIG = (
        r'".+?\?sequenz=(?P<sequenz>.+?)&container=(?P<container>.+?)' +
        r'(?:&hd=(?P<hd>.+?))?(?:&signature=(?P<signature>.+?))?&callback=\?"'
    )
    # Base URL the captured parameters are appended to.
    _PREFIX = 'http://www.heise.de/videout/info?'

    def _warn(self, fmt, *args):
        """Report a warning tagged with the current video id.

        ``fmt`` is a ``str.format`` template that is filled with ``args``.
        Reads ``self._id``, which ``_real_extract`` sets before doing
        anything that may warn.
        """
        self.report_warning(fmt.format(*args), self._id)

    def _parse_config_url(self, html):
        """Return the JSON info URL derived from the config URL in ``html``.

        Raises ExtractorError if ``html`` contains no match for ``_CONFIG``.
        """
        m = re.search(self._CONFIG, html)
        if not m:
            raise ExtractorError('No config found')

        # Optional groups that did not take part in the match are None and
        # must not be sent as literal "None" query values.
        params = dict(
            (k, v) for k, v in m.groupdict().items() if v is not None)
        return self._PREFIX + compat_urllib_parse.urlencode(params)

    def _extract_formats(self, formats_config):
        """Flatten a ``{type: {resolution: {'url': ...}}}`` dict into a list.

        Every usable entry becomes a dict with ``url`` and ``format_id``
        (``<type>_<resolution>``), plus ``height`` when the resolution key
        is an integer string.  Malformed entries are reported through
        ``_warn`` and skipped instead of aborting the extraction.
        """
        formats = []
        for t, rs in formats_config.items():
            if not rs or not isinstance(rs, dict):
                self._warn('formats: {0}: no resolutions', t)
                continue

            for res, obj in rs.items():
                format_id = '{0}_{1}'.format(t, res)

                # A non-dict entry (e.g. a bare string) has no ``get``;
                # treat it like a missing url rather than crashing.
                if not isinstance(obj, dict) or not obj.get('url'):
                    self._warn('formats: {0}: no url', format_id)
                    continue

                fmt = {
                    'url': obj['url'],
                    'format_id': format_id,
                }
                try:
                    fmt['height'] = int(res)
                except ValueError as e:
                    # The format is still usable without a height; name
                    # the full format id so the warning is unambiguous.
                    self._warn('formats: {0}: height: {1}', format_id, e)

                formats.append(fmt)
        return formats

    def _real_extract(self, url):
        """Extract the info dict for the heise.de video article at ``url``.

        Returns a dict with ``id``, ``title`` and ``formats``, plus
        ``thumbnail`` and ``timestamp`` when they are available.

        Raises ExtractorError if the page has no config URL or the
        downloaded config has no formats mapping.
        """
        mobj = re.match(self._VALID_URL, url)
        self._id = mobj.group('id')

        html = self._download_webpage(url, self._id)
        config = self._download_json(self._parse_config_url(html), self._id)

        # JSON may decode to a list, string or number; none of those can
        # carry formats, and ``.get`` below would fail on them.
        if not isinstance(config, dict):
            raise ExtractorError('No formats found')

        info = {
            'id': self._id,
        }

        # Prefer the page's meta title, then the config's, then a constant.
        title = get_meta_content('fulltitle', html) or config.get('title')
        if not title:
            self._warn('title: not found')
            title = 'heise'
        info['title'] = title

        formats_config = config.get('formats')
        if not formats_config or not isinstance(formats_config, dict):
            raise ExtractorError('No formats found')

        formats = self._extract_formats(formats_config)
        self._sort_formats(formats)
        info['formats'] = formats

        if config.get('poster'):
            info['thumbnail'] = config['poster']

        date = get_meta_content('date', html)
        if date:
            try:
                info['timestamp'] = parse_iso8601(date)
            except ValueError as e:
                # An unparsable date only costs the optional timestamp.
                self._warn('timestamp: {0}', e)

        return info