2 from __future__ import unicode_literals
6 from .common import InfoExtractor
7 from .generic import GenericIE
11 get_element_by_attribute,
# Information extractor for ARD Mediathek pages (ardmediathek.de and
# mediathek.daserste.de).
# NOTE(review): this excerpt keeps a stray source-line number at the start of
# every line, has lost its indentation and skips lines (the prefixes jump,
# e.g. 23 -> 26 and 57 -> 59), so it is not runnable as shown. The comments
# below describe only what the visible lines establish; anything that depends
# on a missing line is marked as an assumption.
21 class ARDMediathekIE(InfoExtractor):
22 IE_NAME = 'ARD:mediathek'
# Accepts either host. `video_id` is a run of digits, or a path segment that
# starts with a non-digit; an optional query string may follow.
23 _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
# Test cases follow (the enclosing list/dict delimiters are on lines not shown).
# Case 1: flagged 'only_matching'.
26 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
27 'only_matching': True,
# Case 2: carries a 'skip' reason because the content is blocked outside of
# Germany.
29 'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
33 'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
34 'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
36 'skip': 'Blocked outside of Germany',
# Case 3: carries an 'md5' value (presumably the checksum expected for the
# downloaded file) and an md5-prefixed description.
39 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086',
40 'md5': '219d94d8980b4f538c7fcb0865eb7f2c',
44 'title': 'Tod eines Fußballers',
45 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef',
# Download the media JSON at `media_info_url` and derive formats, duration,
# thumbnail and subtitle URL from it. `webpage` is only inspected for the
# '"fsk"' marker in the visible lines. The visible keys ('thumbnail',
# 'subtitles') suggest an info dict is built; the return statement itself is
# not shown.
50 def _extract_media_info(self, media_info_url, webpage, video_id):
51 media_info = self._download_json(
52 media_info_url, video_id, 'Downloading media JSON')
54 formats = self._extract_formats(media_info, video_id)
# Error reporting: the '"fsk"' marker in the page goes with the "only
# available after 20:00" message, a truthy '_geoblocked' flag with the geo
# restriction error. Presumably both apply only when no formats were found
# (the guarding line and the first `raise` are missing here) - TODO confirm.
57 if '"fsk"' in webpage:
59 'This video is only available after 20:00', expected=True)
60 elif media_info.get('_geoblocked'):
61 raise ExtractorError('This video is not available due to geo restriction', expected=True)
63 self._sort_formats(formats)
65 duration = int_or_none(media_info.get('_duration'))
66 thumbnail = media_info.get('_previewImage')
# How `subtitle_url` is turned into `subtitles` is on lines not shown.
69 subtitle_url = media_info.get('_subtitleUrl')
79 'thumbnail': thumbnail,
81 'subtitles': subtitles,
# Build format entries from the '_mediaArray' list of the media JSON.
# `num` (index of the media entry), the file extension and '_quality' feed the
# format ids. `type_` is read here; its use is not visible in this excerpt.
84 def _extract_formats(self, media_info, video_id):
85 type_ = media_info.get('_type')
86 media_array = media_info.get('_mediaArray', [])
88 for num, media in enumerate(media_array):
89 for stream in media.get('_mediaStreamArray', []):
90 stream_urls = stream.get('_stream')
# '_stream' may hold a single value or a list; normalise to a list.
93 if not isinstance(stream_urls, list):
94 stream_urls = [stream_urls]
95 quality = stream.get('_quality')
96 server = stream.get('_server')
97 for stream_url in stream_urls:
98 ext = determine_ext(stream_url)
# Manifest URLs are expanded through the f4m and m3u8 helper methods: the f4m
# URL gets the hdcore/plugin query string appended (format id prefix 'hds'),
# the m3u8 URL is extracted as mp4 (prefix 'hls'). The conditions selecting
# these branches (presumably on `ext`) are on lines not shown.
100 formats.extend(self._extract_f4m_formats(
101 stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
102 video_id, preference=-1, f4m_id='hds'))
104 formats.extend(self._extract_m3u8_formats(
105 stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
# Direct streams: when the server starts with 'rtmp' the stream URL becomes
# the play path; plain http(s) URLs get an 'a<num>-<ext>-<quality>' format id.
107 if server and server.startswith('rtmp'):
110 'play_path': stream_url,
111 'format_id': 'a%s-rtmp-%s' % (num, quality),
113 elif stream_url.startswith('http'):
116 'format_id': 'a%s-%s-%s' % (num, ext, quality)
# Width and height are read from a trailing '_<width>x<height>.mp4' in the
# URL (the guard for a failed match is not shown).
120 m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
123 'width': int(m.group('width')),
124 'height': int(m.group('height')),
# Extract one Mediathek page: resolve the id, fetch the page, stop on known
# "unavailable" notices, then gather metadata and formats.
131 def _real_extract(self, url):
132 # determine video id from url
133 m = re.match(self._VALID_URL, url)
# Two candidate ids: a numeric `documentId` query parameter and the `video_id`
# URL group. The condition choosing between them is not shown (presumably the
# documentId wins when present).
135 numid = re.search(r'documentId=([0-9]+)', url)
137 video_id = numid.group(1)
139 video_id = m.group('video_id')
141 webpage = self._download_webpage(url, video_id)
# German notice: "The requested item is no longer available."
143 if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
144 raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
# German notice: "This programme is not suitable for young people under 12.
# The clip is therefore only available from 20:00 to 06:00."
# NOTE(review): "20 pm" in the message below is not a valid 12-hour time;
# consider "8 pm" if the wording is ever touched.
146 if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
147 raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
# URLs carrying an `rss` query parameter are treated as feeds: the page is
# parsed as XML and handed to GenericIE's RSS handling.
# NOTE(review): GenericIE() is instantiated directly here - confirm that
# `_extract_rss` works on an instance created this way.
149 if re.search(r'[\?&]rss($|[=&])', url):
150 doc = parse_xml(webpage)
152 return GenericIE()._extract_rss(url, video_id, doc)
# Title candidates: an <h1> (optionally class="boxTopHeadline"), the
# dcterms.title meta tag, an <h4 class="headline"> (presumably tried in this
# order - verify against `_html_search_regex`).
154 title = self._html_search_regex(
155 [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
156 r'<meta name="dcterms.title" content="(.*?)"/>',
157 r'<h4 class="headline">(.*?)</h4>'],
# Prefer the dcterms.abstract meta tag; fall back to the plain description
# meta tag when it is absent.
159 description = self._html_search_meta(
160 'dcterms.abstract', webpage, 'description', default=None)
161 if description is None:
162 description = self._html_search_meta(
163 'description', webpage, 'meta description')
165 # Thumbnail is sometimes not present.
166 # It is in the mobile version, but that seems to use a different URL
167 # structure altogether.
168 thumbnail = self._og_search_thumbnail(webpage, default=None)
# Collect the second quoted argument (the stream URL) of every
# mediaCollection.addMediaStream(<n>, <n>, "<...>", "<url>" call in the page.
170 media_streams = re.findall(r'''(?x)
171 mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
172 "([^"]+)"''', webpage)
# Quality ids handed to the `qualities` helper (defined elsewhere); presumably
# listed from worst to best - verify against the helper.
175 QUALITIES = qualities(['lo', 'hi', 'hq'])
# set() drops duplicate URLs; note that set iteration order is unspecified.
177 for furl in set(media_streams):
178 if furl.endswith('.f4m'):
# The format id is the second-to-last dot-separated part of the URL
# (e.g. 'x' in 'name.x.ext'); None when the URL has fewer than two dots.
181 fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
182 fid = fid_m.group(1) if fid_m else None
184 'quality': QUALITIES(fid),
188 self._sort_formats(formats)
# The matching `if` is not shown; presumably this branch runs when the page
# embeds no media streams.
192 else: # request JSON file
193 info = self._extract_media_info(
194 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
# Page-level description and thumbnail go into a dict; how it is combined with
# `info` is on lines not shown.
199 'description': description,
200 'thumbnail': thumbnail,
# Information extractor for video pages on daserste.de.
# NOTE(review): this excerpt keeps a stray source-line number at the start of
# every line, has lost its indentation and skips lines, so it is not runnable
# as shown. Comments describe only what the visible lines establish.
206 class ARDIE(InfoExtractor):
# Groups: `mainurl` (page URL without '.html'), `display_id` (slug) and `id`
# (trailing digits of the slug).
# NOTE(review): this pattern is a plain (non-raw) string containing '\.';
# unrecognised escapes in non-raw literals are deprecated in Python 3 -
# the other _VALID_URL patterns in this file use an r'' prefix.
207 _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
# Test case (the enclosing dict delimiters are on lines not shown).
209 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
210 'md5': 'd216c3a86493f9322545e045ddc3eb35',
212 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
216 'title': 'Die Story im Ersten: Mission unter falscher Flagge',
217 'upload_date': '20140804',
# NOTE(review): same non-raw '\.' escape as in _VALID_URL above.
218 'thumbnail': 're:^https?://.*\.jpg$',
# Extract one page: download the player XML that sits next to the page and
# read metadata and one format per <asset> from its <video> node.
222 def _real_extract(self, url):
223 mobj = re.match(self._VALID_URL, url)
224 display_id = mobj.group('display_id')
# The player metadata URL is the page URL (without '.html') plus
# '~playerXml.xml'.
226 player_url = mobj.group('mainurl') + '~playerXml.xml'
227 doc = self._download_xml(player_url, display_id)
228 video_node = doc.find('./video')
229 upload_date = unified_strdate(xpath_text(
230 video_node, './broadcastDate'))
231 thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
# One format dict per <asset> element.
# NOTE(review): the `.find(...).text` chains below assume every <asset> has
# all of these children; presumably `doc` is an ElementTree element, in which
# case a missing child makes find() return None and `.text` raise
# AttributeError - confirm against the feed.
234 for a in video_node.findall('.//asset'):
236 'format_id': a.attrib['type'],
237 'width': int_or_none(a.find('./frameWidth').text),
238 'height': int_or_none(a.find('./frameHeight').text),
239 'vbr': int_or_none(a.find('./bitrateVideo').text),
240 'abr': int_or_none(a.find('./bitrateAudio').text),
241 'vcodec': a.find('./codecVideo').text,
242 'tbr': int_or_none(a.find('./totalBitrate').text),
# With a non-empty <serverPrefix> the prefix is the URL and <fileName> the
# play path; the second 'url' assignment presumably sits in an `else` branch
# on a line not shown.
# NOTE(review): the key here is 'playpath' while ARDMediathekIE in this file
# uses 'play_path' - confirm which spelling the downloader reads.
244 if a.find('./serverPrefix').text:
245 f['url'] = a.find('./serverPrefix').text
246 f['playpath'] = a.find('./fileName').text
248 f['url'] = a.find('./fileName').text
250 self._sort_formats(formats)
# Visible keys of the result dict; <title> and <duration> are read directly
# from the <video> node.
253 'id': mobj.group('id'),
255 'display_id': display_id,
256 'title': video_node.find('./title').text,
257 'duration': parse_duration(video_node.find('./duration').text),
258 'upload_date': upload_date,
259 'thumbnail': thumbnail,
# Information extractor for sportschau.de video pages. Subclasses
# ARDMediathekIE and reuses its `_extract_media_info`.
# NOTE(review): this excerpt keeps a stray source-line number at the start of
# every line, has lost its indentation, skips lines and ends before the class
# does, so it is not runnable as shown. Comments describe only what the
# visible lines establish.
263 class SportschauIE(ARDMediathekIE):
264 IE_NAME = 'Sportschau'
# Groups: `baseurl` (page URL without '.html') and `id` (the part of the last
# path segment after the literal 'video').
265 _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
# Test case (the enclosing dict delimiters are on lines not shown).
267 'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
269 'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
271 'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
# NOTE(review): non-raw string containing '\.'; unrecognised escapes in
# non-raw literals are deprecated in Python 3 - consider an r'' prefix.
272 'thumbnail': 're:^https?://.*\.jpg$',
273 'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
277 'skip_download': True,
# Extract one page: title from the element with class 'headline', description
# from the description meta tag, media data from the JSON document derived
# from the page URL.
281 def _real_extract(self, url):
282 mobj = re.match(self._VALID_URL, url)
283 video_id = mobj.group('id')
284 base_url = mobj.group('baseurl')
286 webpage = self._download_webpage(url, video_id)
287 title = get_element_by_attribute('class', 'headline', webpage)
288 description = self._html_search_meta('description', webpage, 'description')
# The media JSON URL is the page URL (without '.html') plus
# '-mc_defaultQuality-h.json'.
290 info = self._extract_media_info(
291 base_url + '-mc_defaultQuality-h.json', webpage, video_id)
# The excerpt ends inside this dict; how it is combined with `info` and what
# is returned are not shown.
295 'description': description,