2 from __future__ import unicode_literals
6 from .common import InfoExtractor
7 from .generic import GenericIE
11 get_element_by_attribute,
# Information extractor for ARD Mediathek pages; subclasses InfoExtractor.
# NOTE(review): the embedded line numbers are not contiguous, so parts of
# the class body (such as the container holding the test entries below)
# are not visible in this listing.
21 class ARDMediathekIE(InfoExtractor):
22 IE_NAME = 'ARD:mediathek'
# Accepts ardmediathek.de (optionally with www.) and mediathek.daserste.de.
# The `video_id` group captures either a run of digits or a path segment
# that starts with a non-digit; a trailing query string is tolerated.
23 _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
# Test entry flagged 'only_matching'.
26 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
27 'only_matching': True,
# Test entry with an expected title and description; it carries a 'skip'
# reason because the video is blocked outside of Germany.
29 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
33 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
34 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
36 'skip': 'Blocked outside of Germany',
# Download the media JSON found at `media_info_url` and collect the
# per-video fields derived from it (formats, duration, thumbnail,
# subtitles) for `video_id`. `webpage` is only inspected for an '"fsk"'
# marker.
# NOTE(review): some original lines are missing from this listing (the
# embedded numbers skip), e.g. the statement that opens the call ending on
# line 48 and whatever condition, if any, guards the fsk/geoblock checks.
39 def _extract_media_info(self, media_info_url, webpage, video_id):
40 media_info = self._download_json(
41 media_info_url, video_id, 'Downloading media JSON')
# Format extraction is delegated to _extract_formats.
43 formats = self._extract_formats(media_info, video_id)
# An '"fsk"' marker in the page is reported as a time-of-day restriction;
# otherwise a truthy '_geoblocked' value in the JSON is reported as a
# geo restriction. Both errors are raised with expected=True.
46 if '"fsk"' in webpage:
48 'This video is only available after 20:00', expected=True)
49 elif media_info.get('_geoblocked'):
50 raise ExtractorError('This video is not available due to geo restriction', expected=True)
52 self._sort_formats(formats)
# '_duration' is passed through int_or_none; '_previewImage' is used as the
# thumbnail as-is.
54 duration = int_or_none(media_info.get('_duration'))
55 thumbnail = media_info.get('_previewImage')
# Subtitle location as given by the JSON under '_subtitleUrl'.
58 subtitle_url = media_info.get('_subtitleUrl')
# Entries of the result mapping (its opening and remaining keys are not
# visible in this listing).
68 'thumbnail': thumbnail,
70 'subtitles': subtitles,
# Build format entries from the '_mediaArray' section of the media JSON.
# NOTE(review): several original lines are absent from this listing (the
# embedded numbers skip), including the initialisation of `formats`, the
# tests on `ext` that select the f4m/m3u8 branches, and the return
# statement.
73 def _extract_formats(self, media_info, video_id):
74 type_ = media_info.get('_type')
75 media_array = media_info.get('_mediaArray', [])
# `num` (the index within _mediaArray) becomes part of the format ids
# generated below.
77 for num, media in enumerate(media_array):
78 for stream in media.get('_mediaStreamArray', []):
79 stream_urls = stream.get('_stream')
# '_stream' may hold a single value or a list; wrap a single value so the
# loop below can treat both the same way.
82 if not isinstance(stream_urls, list):
83 stream_urls = [stream_urls]
84 quality = stream.get('_quality')
85 server = stream.get('_server')
86 for stream_url in stream_urls:
87 ext = determine_ext(stream_url)
# HDS: the manifest URL gets hdcore/plugin query parameters appended and
# the resulting formats are given preference -1.
89 formats.extend(self._extract_f4m_formats(
90 stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
91 video_id, preference=-1, f4m_id='hds'))
# HLS: extracted with 'mp4' as the extension argument and preference 1.
93 formats.extend(self._extract_m3u8_formats(
94 stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
# RTMP: when '_server' starts with 'rtmp', the stream URL is stored as the
# play path.
96 if server and server.startswith('rtmp'):
99 'play_path': stream_url,
100 'format_id': 'a%s-rtmp-%s' % (num, quality),
# Direct HTTP(S) stream URL.
102 elif stream_url.startswith('http'):
105 'format_id': 'a%s-%s-%s' % (num, ext, quality)
# Resolution is parsed from a `_<width>x<height>.mp4` suffix of the stream
# URL (the line guarding against a failed match is not visible here).
109 m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
112 'width': int(m.group('width')),
113 'height': int(m.group('height')),
# Extract a video from an ARD Mediathek page URL: determine the video id,
# download the page, reject pages that report the video as unavailable,
# then gather title, description, thumbnail and formats.
# NOTE(review): some original lines are missing from this listing (the
# embedded numbers skip), e.g. the branching between the two `video_id`
# assignments, the construction of the per-stream format dict, and the
# final return.
120 def _real_extract(self, url):
121 # determine video id from url
122 m = re.match(self._VALID_URL, url)
# Two sources for the id: a numeric `documentId` query parameter, or the
# `video_id` group of _VALID_URL.
124 numid = re.search(r'documentId=([0-9]+)', url)
126 video_id = numid.group(1)
128 video_id = m.group('video_id')
130 webpage = self._download_webpage(url, video_id)
# The page states (in German) that the requested item is no longer
# available.
132 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
133 raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
# The page states (in German) that the clip is not suitable for viewers
# under 12 and is therefore only available from 20:00 to 06:00.
# NOTE(review): the English message says "20 pm"; 20:00 would normally be
# written "8 pm". Left unchanged here because it is runtime text.
135 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
136 raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
# URLs carrying an `rss` query parameter: the downloaded page is parsed as
# XML and handed to GenericIE's RSS handling.
138 if re.search(r'[\?&]rss($|[=&])', url):
139 doc = parse_xml(webpage)
141 return GenericIE()._extract_rss(url, video_id, doc)
# Candidate title patterns: an <h1> (optionally with class
# "boxTopHeadline"), the dcterms.title meta tag, or an <h4 class="headline">.
143 title = self._html_search_regex(
144 [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
145 r'<meta name="dcterms.title" content="(.*?)"/>',
146 r'<h4 class="headline">(.*?)</h4>'],
# Description: the dcterms.abstract meta tag, falling back to the plain
# description meta tag when the first lookup yields None.
148 description = self._html_search_meta(
149 'dcterms.abstract', webpage, 'description', default=None)
150 if description is None:
151 description = self._html_search_meta(
152 'description', webpage, 'meta description')
154 # Thumbnail is sometimes not present.
155 # It is in the mobile version, but that seems to use a different URL
156 # structure altogether.
157 thumbnail = self._og_search_thumbnail(webpage, default=None)
# Collect the second quoted argument (the fourth argument overall) of every
# mediaCollection.addMediaStream(...) call found in the page.
159 media_streams = re.findall(r'''(?x)
160 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
161 "([^"]+)"''', webpage)
# Quality ranking built from the labels 'lo', 'hi', 'hq'.
164 QUALITIES = qualities(['lo', 'hi', 'hq'])
# Iterate over the distinct stream URLs; URLs ending in '.f4m' are treated
# separately.
166 for furl in set(media_streams):
167 if furl.endswith('.f4m'):
# The dot-separated component right before the file extension serves as the
# format id and as the quality label.
170 fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
171 fid = fid_m.group(1) if fid_m else None
173 'quality': QUALITIES(fid),
177 self._sort_formats(formats)
181 else: # request JSON file
182 info = self._extract_media_info(
183 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
# Page-level metadata added to the result.
188 'description': description,
189 'thumbnail': thumbnail,
195 class ARDIE(InfoExtractor):
196 _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
198 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
199 'md5': 'd216c3a86493f9322545e045ddc3eb35',
201 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
205 'title': 'Die Story im Ersten: Mission unter falscher Flagge',
206 'upload_date': '20140804',
207 'thumbnail': 're:^https?://.*\.jpg$',
211 def _real_extract(self, url):
212 mobj = re.match(self._VALID_URL, url)
213 display_id = mobj.group('display_id')
215 player_url = mobj.group('mainurl') + '~playerXml.xml'
216 doc = self._download_xml(player_url, display_id)
217 video_node = doc.find('./video')
218 upload_date = unified_strdate(xpath_text(
219 video_node, './broadcastDate'))
220 thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
223 for a in video_node.findall('.//asset'):
225 'format_id': a.attrib['type'],
226 'width': int_or_none(a.find('./frameWidth').text),
227 'height': int_or_none(a.find('./frameHeight').text),
228 'vbr': int_or_none(a.find('./bitrateVideo').text),
229 'abr': int_or_none(a.find('./bitrateAudio').text),
230 'vcodec': a.find('./codecVideo').text,
231 'tbr': int_or_none(a.find('./totalBitrate').text),
233 if a.find('./serverPrefix').text:
234 f['url'] = a.find('./serverPrefix').text
235 f['playpath'] = a.find('./fileName').text
237 f['url'] = a.find('./fileName').text
239 self._sort_formats(formats)
242 'id': mobj.group('id'),
244 'display_id': display_id,
245 'title': video_node.find('./title').text,
246 'duration': parse_duration(video_node.find('./duration').text),
247 'upload_date': upload_date,
248 'thumbnail': thumbnail,
252 class SportschauIE(ARDMediathekIE):
253 IE_NAME = 'Sportschau'
254 _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
256 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
258 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
260 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
261 'thumbnail': 're:^https?://.*\.jpg$',
262 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
266 'skip_download': True,
270 def _real_extract(self, url):
271 mobj = re.match(self._VALID_URL, url)
272 video_id = mobj.group('id')
273 base_url = mobj.group('baseurl')
275 webpage = self._download_webpage(url, video_id)
276 title = get_element_by_attribute('class', 'headline', webpage)
277 description = self._html_search_meta('description', webpage, 'description')
279 info = self._extract_media_info(
280 base_url + '-mc_defaultQuality-h.json', webpage, video_id)
284 'description': description,