b061b9566168758465ad56f43b1f74b89b2cce10
[youtube-dl] / youtube_dl / extractor / arte.py
1 import re
2 import json
3
4 from .common import InfoExtractor
5 from ..utils import (
6     # This is used by the not implemented extractLiveStream method
7     compat_urllib_parse,
8
9     ExtractorError,
10     unified_strdate,
11 )
12
class ArteTvIE(InfoExtractor):
    """Information extractor for videos listed in the arte.tv guide.

    Matches guide URLs under the ``fr`` and ``de`` language prefixes, reads
    the player JSON referenced by the page and returns the format with the
    greatest height.  Live streams are recognised but not supported yet.
    """

    # The dots of the host name are escaped so that only "www.arte.tv"
    # matches (an unescaped "." would accept any character in that position).
    _VALID_URL = r'(?:http://)?www\.arte\.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
    # Pattern searched for in the extracted id to detect live stream pages.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    # TODO implement Live Stream
    # def extractLiveStream(self, url):
    #     video_lang = url.split('/')[-4]
    #     info = self.grep_webpage(
    #         url,
    #         r'src="(.*?/videothek_js.*?\.js)',
    #         0,
    #         [
    #             (1, 'url', u'Invalid URL: %s' % url)
    #         ]
    #     )
    #     http_host = url.split('/')[2]
    #     next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
    #     info = self.grep_webpage(
    #         next_url,
    #         r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
    #             '(http://.*?\.swf).*?' +
    #             '(rtmp://.*?)\'',
    #         re.DOTALL,
    #         [
    #             (1, 'path',   u'could not extract video path: %s' % url),
    #             (2, 'player', u'could not extract video player: %s' % url),
    #             (3, 'url',    u'could not extract video url: %s' % url)
    #         ]
    #     )
    #     video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def _real_extract(self, url):
        """Extract the metadata and best-quality stream for a guide URL.

        Downloads the guide page, follows the ``arte_vp_url`` attribute to
        the player JSON and builds the info dictionary from its
        ``videoJsonPlayer`` entry.

        Returns a dict with the keys ``id``, ``title``, ``description``,
        ``upload_date``, ``thumbnail``, ``url`` and ``ext``; rtmp formats
        additionally carry ``play_path``.

        Raises ExtractorError if the URL points to a live stream or if the
        player JSON lists no formats.
        """
        mobj = re.match(self._VALID_URL, url)
        # This is not a real id, it can be for example AJT for the news
        # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
        video_id = mobj.group('id')

        if re.search(self._LIVE_URL, video_id) is not None:
            # Once extractLiveStream is implemented it should be called here.
            raise ExtractorError(u'Arte live streams are not yet supported, sorry')

        webpage = self._download_webpage(url, video_id)
        json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')

        json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
        self.report_extraction(video_id)
        player_info = json.loads(json_info)['videoJsonPlayer']

        info_dict = {
            'id': player_info['VID'],
            'title': player_info['VTI'],
            'description': player_info['VDE'],
            # Only the part of VDA before the first space is passed on.
            'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
            'thumbnail': player_info['programImage'],
            # Every format handled below is stored with the mp4 extension.
            'ext': 'mp4',
        }

        # Order the formats by height so that the best quality comes last.
        formats = sorted(player_info['VSR'].values(), key=lambda f: int(f['height']))
        if not formats:
            # Report a proper extractor error instead of a bare IndexError.
            raise ExtractorError(u'No video formats found for %s' % video_id)
        format_info = formats[-1]

        if format_info['mediaType'] == u'rtmp':
            # For rtmp the streamer is the URL and the format's own url is
            # the play path inside that stream.
            info_dict['url'] = format_info['streamer']
            info_dict['play_path'] = 'mp4:' + format_info['url']
        else:
            info_dict['url'] = format_info['url']

        return info_dict