[orf] Modernize
[youtube-dl] / youtube_dl / extractor / orf.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import json
5 import re
6 import calendar
7 import datetime
8
9 from .common import InfoExtractor
10 from ..utils import (
11     HEADRequest,
12     unified_strdate,
13     ExtractorError,
14 )
15
16
class ORFTVthekIE(InfoExtractor):
    IE_NAME = 'orf:tvthek'
    IE_DESC = 'ORF TVthek'
    _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://tvthek.orf.at/program/Aufgetischt/2745173/Aufgetischt-Mit-der-Steirischen-Tafelrunde/8891389',
        'playlist': [{
            'md5': '2942210346ed779588f428a92db88712',
            'info_dict': {
                'id': '8896777',
                'ext': 'mp4',
                'title': 'Aufgetischt: Mit der Steirischen Tafelrunde',
                'description': 'md5:c1272f0245537812d4e36419c207b67d',
                'duration': 2668,
                'upload_date': '20141208',
            },
        }],
        'skip': 'Blocked outside of Austria',
    }

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        webpage = self._download_webpage(url, playlist_id)

        # The page embeds its video metadata as a JSON argument to an
        # initializeAdworx() call; pull it out and parse it.
        all_data = json.loads(self._search_regex(
            r'initializeAdworx\((.+?)\);\n', webpage, 'video info'))

        # The segment list lives under one specific tracker entry.
        segments = None
        for tracker in all_data:
            if tracker['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
                segments = tracker['values']['segments']
                break
        if not segments:
            raise ExtractorError('Unable to extract segments')

        def quality_to_int(s):
            # Map a quality label like 'Q6A' to its numeric part; -1 if none.
            digits = re.search('([0-9]+)', s)
            return int(digits.group(1)) if digits else -1

        entries = []
        for segment in segments:
            video_id = segment['id']
            formats = []
            for source in segment['playlist_item_array']['sources']:
                formats.append({
                    'preference': -10 if source['delivery'] == 'hls' else None,
                    'format_id': '%s-%s-%s' % (
                        source['delivery'], source['quality'],
                        source['quality_string']),
                    'url': source['src'],
                    'protocol': source['protocol'],
                    'quality': quality_to_int(source['quality']),
                })

            # Check for geoblocking.
            # There is a property is_geoprotection, but that's always false
            geo_str = segment.get('geoprotection_string')
            if geo_str:
                # Probe the first plain-HTTP mp4 format with a HEAD request
                # so the user gets an early warning instead of a dead link.
                http_url = None
                for f in formats:
                    if re.match(r'^https?://.*\.mp4$', f['url']):
                        http_url = f['url']
                        break
                if http_url is not None:
                    self._request_webpage(
                        HEADRequest(http_url), video_id,
                        note='Testing for geoblocking',
                        errnote=((
                            'This video seems to be blocked outside of %s. '
                            'You may want to try the streaming-* formats.')
                            % geo_str),
                        fatal=False)

            self._sort_formats(formats)

            entries.append({
                '_type': 'video',
                'id': video_id,
                'title': segment['header'],
                'formats': formats,
                'description': segment.get('description'),
                'duration': int(segment['duration_in_seconds']),
                'upload_date': unified_strdate(segment['created_date']),
                'thumbnail': segment.get('image_full_url'),
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': playlist_id,
        }
114
115
116 # Audios on ORF radio are only available for 7 days, so we can't add tests.
117
118
class ORFOE1IE(InfoExtractor):
    IE_NAME = 'orf:oe1'
    # Fixed mojibake: the description was stored as 'Ã–' (UTF-8 'Ö'
    # mis-decoded as Latin-1).
    IE_DESC = 'Radio Österreich 1'
    _VALID_URL = r'http://oe1\.orf\.at/programm/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        show_id = self._match_id(url)
        data = self._download_json(
            'http://oe1.orf.at/programm/%s/konsole' % show_id,
            show_id
        )

        # The API reports the broadcast date and time as two separate
        # human-readable fields (e.g. '08.12.2014' and '18:00'); combine
        # and parse them into a Unix timestamp.
        # NOTE(review): the parsed datetime is naive and timegm() treats
        # it as UTC, while the site presumably reports local Austrian
        # time — confirm before relying on exact timestamps.
        timestamp = datetime.datetime.strptime('%s %s' % (
            data['item']['day_label'],
            data['item']['time']
        ), '%d.%m.%Y %H:%M')
        unix_timestamp = calendar.timegm(timestamp.utctimetuple())

        return {
            'id': show_id,
            'title': data['item']['title'],
            'url': data['item']['url_stream'],
            'ext': 'mp3',
            'description': data['item'].get('info'),
            'timestamp': unix_timestamp
        }
145
146
class ORFFM4IE(InfoExtractor):
    # Bug fix: IE_DESC was assigned twice; the first assignment was meant
    # to be IE_NAME (cf. the other ORF extractors in this file, which all
    # define an 'orf:...' IE_NAME alongside a human-readable IE_DESC).
    IE_NAME = 'orf:fm4'
    IE_DESC = 'radio FM4'
    _VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        show_date = mobj.group('date')
        show_id = mobj.group('show')

        data = self._download_json(
            'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id),
            show_id
        )

        def extract_entry_dict(info, title, subtitle):
            # Build an entry for one stream part; 'start'/'end' come in
            # milliseconds, hence the /1000 conversions.
            return {
                'id': info['loopStreamId'].replace('.mp3', ''),
                'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'],
                'title': title,
                'description': subtitle,
                'duration': (info['end'] - info['start']) / 1000,
                'timestamp': info['start'] / 1000,
                'ext': 'mp3'
            }

        entries = [extract_entry_dict(t, data['title'], data['subtitle']) for t in data['streams']]

        return {
            '_type': 'playlist',
            'id': show_id,
            'title': data['title'],
            'description': data['subtitle'],
            'entries': entries
        }