[ina] improve extraction
[youtube-dl] / youtube_dl / extractor / ina.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    int_or_none,
    strip_or_none,
    xpath_attr,
    xpath_text,
)
11
12
class InaIE(InfoExtractor):
    """Information extractor for ina.fr video pages.

    The video id is taken from the page URL and used to download an MRSS
    notice from player.ina.fr; title, description, media URLs and
    thumbnails are all read from that notice.
    """

    _VALID_URL = r'https?://(?:www\.)?ina\.fr/video/(?P<id>[A-Z0-9_]+)'
    _TESTS = [{
        'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
        'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
        'info_dict': {
            'id': 'I12055569',
            'ext': 'mp4',
            'title': 'François Hollande "Je crois que c\'est clair"',
            'description': 'md5:3f09eb072a06cb286b8f7e4f77109663',
        }
    }, {
        'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for the video referenced by ``url``.

        Returns a dict with ``id``, ``formats``, ``title``, ``description``
        and ``thumbnails``.

        Raises ExtractorError when the notice has no ``channel/item``
        element, no media ``content`` element, or no usable media URL.
        """
        video_id = self._match_id(url)
        info_doc = self._download_xml(
            'http://player.ina.fr/notices/%s.mrss' % video_id, video_id)

        item = info_doc.find('channel/item')
        if item is None:
            # Element.find() returns None on a miss; fail with a clear
            # message instead of an AttributeError further down.
            raise ExtractorError(
                'Unable to find channel/item in the notice feed')
        title = xpath_text(item, 'title', fatal=True)

        def media_ns_xpath(path):
            # Qualify ``path`` with the Media RSS namespace.
            return self._xpath_ns(path, 'http://search.yahoo.com/mrss/')

        content = item.find(media_ns_xpath('content'))
        if content is None:
            raise ExtractorError(
                'Unable to find media content in the notice feed')

        def get_furl(name):
            # ``url`` attribute of the named child of the content element.
            return xpath_attr(content, media_ns_xpath(name), 'url')

        formats = []
        for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)):
            q_url = get_furl(q)
            if not q_url:
                continue
            formats.append({
                'format_id': q,
                'url': q_url,
                'width': w,
                'height': h,
            })
        if not formats:
            # No per-quality URL was found: fall back to the ``player``
            # child, then to the content element's own ``url`` attribute.
            fallback_url = get_furl('player') or content.get('url')
            if not fallback_url:
                raise ExtractorError(
                    'Unable to find any media URL in the notice feed')
            formats = [{
                'url': fallback_url,
            }]

        thumbnails = []
        for thumbnail in content.findall(media_ns_xpath('thumbnail')):
            thumbnail_url = thumbnail.get('url')
            if not thumbnail_url:
                continue
            thumbnails.append({
                'url': thumbnail_url,
                'height': int_or_none(thumbnail.get('height')),
                'width': int_or_none(thumbnail.get('width')),
            })

        return {
            'id': video_id,
            'formats': formats,
            'title': title,
            'description': strip_or_none(xpath_text(item, 'description')),
            'thumbnails': thumbnails,
        }