Merge remote-tracking branch 'origin/master'
[youtube-dl] / youtube_dl / extractor / orf.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import json
5 import re
6
7 from .common import InfoExtractor
8 from ..utils import (
9     HEADRequest,
10     unified_strdate,
11     ExtractorError,
12 )
13
14
class ORFIE(InfoExtractor):
    """Information extractor for ORF TVthek pages (tvthek.orf.at).

    A matching page is returned as a playlist containing one video entry
    per segment listed in the JSON passed to ``initializeAdworx(...)`` in
    the page source.
    """

    _VALID_URL = r'https?://tvthek\.orf\.at/(?:programs/.+?/episodes|topics/.+?|program/[^/]+)/(?P<id>\d+)'

    _TEST = {
        'url': 'http://tvthek.orf.at/program/matinee-Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7317210/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319746/Was-Sie-schon-immer-ueber-Klassik-wissen-wollten/7319747',
        'file': '7319747.mp4',
        'md5': 'bd803c5d8c32d3c64a0ea4b4eeddf375',
        'info_dict': {
            'title': 'Was Sie schon immer über Klassik wissen wollten',
            'description': 'md5:0ddf0d5f0060bd53f744edaa5c2e04a4',
            'duration': 3508,
            'upload_date': '20140105',
        },
        'skip': 'Blocked outside of Austria',
    }

    def _real_extract(self, url):
        """Extract all segments of the page at *url* as a playlist.

        Returns a dict with ``_type`` ``'playlist'``, the numeric id taken
        from the URL as ``id`` and one ``'video'`` entry per segment.

        Raises ExtractorError when the page's tracker data contains no
        segments. A segment's ``id``, ``header`` and source list are
        required; ``description``, ``duration``, ``upload_date`` and
        ``thumbnail`` are optional and become None when unavailable.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)

        # The segment metadata is embedded in the page as the JSON argument
        # of an initializeAdworx(...) call.
        data_json = self._search_regex(
            r'initializeAdworx\((.+?)\);\n', webpage, 'video info')
        all_data = json.loads(data_json)

        def get_segments(all_data):
            """Return the segment list of the episode tracker entry.

            Falls through (returning None) when no entry matches.
            """
            for data in all_data:
                if data['name'] == 'Tracker::EPISODE_DETAIL_PAGE_OVER_PROGRAM':
                    return data['values']['segments']

        sdata = get_segments(all_data)
        if not sdata:
            raise ExtractorError('Unable to extract segments')

        def quality_to_int(s):
            """Return the first run of digits in *s* as int, or -1 if none."""
            m = re.search(r'([0-9]+)', s)
            if m is None:
                return -1
            return int(m.group(1))

        def to_int_or_none(value):
            """Return int(value), or None if value is missing or malformed."""
            try:
                return int(value)
            except (TypeError, ValueError):
                return None

        entries = []
        for sd in sdata:
            video_id = sd['id']
            formats = [{
                # HLS sources are ranked below the other delivery types.
                'preference': -10 if fd['delivery'] == 'hls' else None,
                'format_id': '%s-%s-%s' % (
                    fd['delivery'], fd['quality'], fd['quality_string']),
                'url': fd['src'],
                'protocol': fd['protocol'],
                'quality': quality_to_int(fd['quality']),
            } for fd in sd['playlist_item_array']['sources']]

            # Check for geoblocking.
            # There is a property is_geoprotection, but that's always false
            geo_str = sd.get('geoprotection_string')
            if geo_str:
                # Probe the first plain HTTP(S) .mp4 source, if there is one.
                try:
                    http_url = next(
                        f['url']
                        for f in formats
                        if re.match(r'^https?://.*\.mp4$', f['url']))
                except StopIteration:
                    # No direct .mp4 source to probe; skip the check.
                    pass
                else:
                    req = HEADRequest(http_url)
                    # The response is not used; with fatal=False a failing
                    # probe presumably only surfaces errnote instead of
                    # aborting (see InfoExtractor._request_webpage).
                    self._request_webpage(
                        req, video_id,
                        note='Testing for geoblocking',
                        errnote=((
                            'This video seems to be blocked outside of %s. '
                            'You may want to try the streaming-* formats.')
                            % geo_str),
                        fatal=False)

            self._sort_formats(formats)

            # Upload date and duration are optional metadata: a missing or
            # malformed value must not abort extraction of the playlist.
            created_date = sd.get('created_date')
            upload_date = unified_strdate(created_date) if created_date else None
            duration = to_int_or_none(sd.get('duration_in_seconds'))

            entries.append({
                '_type': 'video',
                'id': video_id,
                'title': sd['header'],
                'formats': formats,
                'description': sd.get('description'),
                'duration': duration,
                'upload_date': upload_date,
                'thumbnail': sd.get('image_full_url'),
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'id': playlist_id,
        }