_ Git - youtube-dl/blob - youtube_dl/extractor/ard.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from .generic import GenericIE
   8 from ..utils import (
   9     determine_ext,
  10     ExtractorError,
  11     get_element_by_attribute,
  12     qualities,
  13     int_or_none,
  14     parse_duration,
  15     unified_strdate,
  16     xpath_text,
  17     parse_xml,
  18 )
  19
  20
  21 class ARDMediathekIE(InfoExtractor):
  22     IE_NAME = 'ARD:mediathek'
  23     _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?'
  24
  25     _TESTS = [{
  26         'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
  27         'only_matching': True,
  28     }, {
  29         'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
  30         'info_dict': {
  31             'id': '22490580',
  32             'ext': 'mp4',
  33             'title': 'Das Wunder von Wolbeck (Video tgl. ab 20 Uhr)',
  34             'description': 'Auf einem restaurierten Hof bei Wolbeck wird der Heilpraktiker Raffael Lembeck eines morgens von seiner Frau Stella tot aufgefunden. Das Opfer war offensichtlich in seiner Praxis zu Fall gekommen und ist dann verblutet, erklärt Prof. Boerne am Tatort.',
  35         },
  36         'skip': 'Blocked outside of Germany',
  37     }]
  38
  39     def _extract_media_info(self, media_info_url, webpage, video_id):
  40         media_info = self._download_json(
  41             media_info_url, video_id, 'Downloading media JSON')
  42
  43         formats = self._extract_formats(media_info, video_id)
  44
  45         if not formats:
  46             if '"fsk"' in webpage:
  47                 raise ExtractorError(
  48                     'This video is only available after 20:00', expected=True)
  49             elif media_info.get('_geoblocked'):
  50                 raise ExtractorError('This video is not available due to geo restriction', expected=True)
  51
  52         self._sort_formats(formats)
  53
  54         duration = int_or_none(media_info.get('_duration'))
  55         thumbnail = media_info.get('_previewImage')
  56
  57         subtitles = {}
  58         subtitle_url = media_info.get('_subtitleUrl')
  59         if subtitle_url:
  60             subtitles['de'] = [{
  61                 'ext': 'srt',
  62                 'url': subtitle_url,
  63             }]
  64
  65         return {
  66             'id': video_id,
  67             'duration': duration,
  68             'thumbnail': thumbnail,
  69             'formats': formats,
  70             'subtitles': subtitles,
  71         }
  72
  73     def _extract_formats(self, media_info, video_id):
  74         type_ = media_info.get('_type')
  75         media_array = media_info.get('_mediaArray', [])
  76         formats = []
  77         for num, media in enumerate(media_array):
  78             for stream in media.get('_mediaStreamArray', []):
  79                 stream_urls = stream.get('_stream')
  80                 if not stream_urls:
  81                     continue
  82                 if not isinstance(stream_urls, list):
  83                     stream_urls = [stream_urls]
  84                 quality = stream.get('_quality')
  85                 server = stream.get('_server')
  86                 for stream_url in stream_urls:
  87                     ext = determine_ext(stream_url)
  88                     if ext == 'f4m':
  89                         formats.extend(self._extract_f4m_formats(
  90                             stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
  91                             video_id, preference=-1, f4m_id='hds'))
  92                     elif ext == 'm3u8':
  93                         formats.extend(self._extract_m3u8_formats(
  94                             stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
  95                     else:
  96                         if server and server.startswith('rtmp'):
  97                             f = {
  98                                 'url': server,
  99                                 'play_path': stream_url,
 100                                 'format_id': 'a%s-rtmp-%s' % (num, quality),
 101                             }
 102                         elif stream_url.startswith('http'):
 103                             f = {
 104                                 'url': stream_url,
 105                                 'format_id': 'a%s-%s-%s' % (num, ext, quality)
 106                             }
 107                         else:
 108                             continue
 109                         m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)
 110                         if m:
 111                             f.update({
 112                                 'width': int(m.group('width')),
 113                                 'height': int(m.group('height')),
 114                             })
 115                         if type_ == 'audio':
 116                             f['vcodec'] = 'none'
 117                         formats.append(f)
 118         return formats
 119
 120     def _real_extract(self, url):
 121         # determine video id from url
 122         m = re.match(self._VALID_URL, url)
 123
 124         numid = re.search(r'documentId=([0-9]+)', url)
 125         if numid:
 126             video_id = numid.group(1)
 127         else:
 128             video_id = m.group('video_id')
 129
 130         webpage = self._download_webpage(url, video_id)
 131
 132         if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
 133             raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
 134
 135         if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
 136             raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
 137
 138         if re.search(r'[\?&]rss($|[=&])', url):
 139             doc = parse_xml(webpage)
 140             if doc.tag == 'rss':
 141                 return GenericIE()._extract_rss(url, video_id, doc)
 142
 143         title = self._html_search_regex(
 144             [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
 145              r'<meta name="dcterms.title" content="(.*?)"/>',
 146              r'<h4 class="headline">(.*?)</h4>'],
 147             webpage, 'title')
 148         description = self._html_search_meta(
 149             'dcterms.abstract', webpage, 'description', default=None)
 150         if description is None:
 151             description = self._html_search_meta(
 152                 'description', webpage, 'meta description')
 153
 154         # Thumbnail is sometimes not present.
 155         # It is in the mobile version, but that seems to use a different URL
 156         # structure altogether.
 157         thumbnail = self._og_search_thumbnail(webpage, default=None)
 158
 159         media_streams = re.findall(r'''(?x)
 160             mediaCollection\.addMediaStream\([0-9]+,\s*[0-9]+,\s*"[^"]*",\s*
 161             "([^"]+)"''', webpage)
 162
 163         if media_streams:
 164             QUALITIES = qualities(['lo', 'hi', 'hq'])
 165             formats = []
 166             for furl in set(media_streams):
 167                 if furl.endswith('.f4m'):
 168                     fid = 'f4m'
 169                 else:
 170                     fid_m = re.match(r'.*\.([^.]+)\.[^.]+$', furl)
 171                     fid = fid_m.group(1) if fid_m else None
 172                 formats.append({
 173                     'quality': QUALITIES(fid),
 174                     'format_id': fid,
 175                     'url': furl,
 176                 })
 177             self._sort_formats(formats)
 178             info = {
 179                 'formats': formats,
 180             }
 181         else:  # request JSON file
 182             info = self._extract_media_info(
 183                 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
 184
 185         info.update({
 186             'id': video_id,
 187             'title': title,
 188             'description': description,
 189             'thumbnail': thumbnail,
 190         })
 191
 192         return info
 193
 194
 195 class ARDIE(InfoExtractor):
 196     _VALID_URL = '(?P<mainurl>https?://(www\.)?daserste\.de/[^?#]+/videos/(?P<display_id>[^/?#]+)-(?P<id>[0-9]+))\.html'
 197     _TEST = {
 198         'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html',
 199         'md5': 'd216c3a86493f9322545e045ddc3eb35',
 200         'info_dict': {
 201             'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge',
 202             'id': '100',
 203             'ext': 'mp4',
 204             'duration': 2600,
 205             'title': 'Die Story im Ersten: Mission unter falscher Flagge',
 206             'upload_date': '20140804',
 207             'thumbnail': 're:^https?://.*\.jpg$',
 208         }
 209     }
 210
 211     def _real_extract(self, url):
 212         mobj = re.match(self._VALID_URL, url)
 213         display_id = mobj.group('display_id')
 214
 215         player_url = mobj.group('mainurl') + '~playerXml.xml'
 216         doc = self._download_xml(player_url, display_id)
 217         video_node = doc.find('./video')
 218         upload_date = unified_strdate(xpath_text(
 219             video_node, './broadcastDate'))
 220         thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
 221
 222         formats = []
 223         for a in video_node.findall('.//asset'):
 224             f = {
 225                 'format_id': a.attrib['type'],
 226                 'width': int_or_none(a.find('./frameWidth').text),
 227                 'height': int_or_none(a.find('./frameHeight').text),
 228                 'vbr': int_or_none(a.find('./bitrateVideo').text),
 229                 'abr': int_or_none(a.find('./bitrateAudio').text),
 230                 'vcodec': a.find('./codecVideo').text,
 231                 'tbr': int_or_none(a.find('./totalBitrate').text),
 232             }
 233             if a.find('./serverPrefix').text:
 234                 f['url'] = a.find('./serverPrefix').text
 235                 f['playpath'] = a.find('./fileName').text
 236             else:
 237                 f['url'] = a.find('./fileName').text
 238             formats.append(f)
 239         self._sort_formats(formats)
 240
 241         return {
 242             'id': mobj.group('id'),
 243             'formats': formats,
 244             'display_id': display_id,
 245             'title': video_node.find('./title').text,
 246             'duration': parse_duration(video_node.find('./duration').text),
 247             'upload_date': upload_date,
 248             'thumbnail': thumbnail,
 249         }
 250
 251
 252 class SportschauIE(ARDMediathekIE):
 253     IE_NAME = 'Sportschau'
 254     _VALID_URL = r'(?P<baseurl>https?://(?:www\.)?sportschau\.de/(?:[^/]+/)+video(?P<id>[^/#?]+))\.html'
 255     _TEST = {
 256         'url': 'http://www.sportschau.de/tourdefrance/videoseppeltkokainhatnichtsmitklassischemdopingzutun100.html',
 257         'info_dict': {
 258             'id': 'seppeltkokainhatnichtsmitklassischemdopingzutun100',
 259             'ext': 'mp4',
 260             'title': 'Seppelt: "Kokain hat nichts mit klassischem Doping zu tun"',
 261             'thumbnail': 're:^https?://.*\.jpg$',
 262             'description': 'Der ARD-Doping Experte Hajo Seppelt gibt seine Einschätzung zum ersten Dopingfall der diesjährigen Tour de France um den Italiener Luca Paolini ab.',
 263         },
 264         'params': {
 265             # m3u8 download
 266             'skip_download': True,
 267         },
 268     }
 269
 270     def _real_extract(self, url):
 271         mobj = re.match(self._VALID_URL, url)
 272         video_id = mobj.group('id')
 273         base_url = mobj.group('baseurl')
 274
 275         webpage = self._download_webpage(url, video_id)
 276         title = get_element_by_attribute('class', 'headline', webpage)
 277         description = self._html_search_meta('description', webpage, 'description')
 278
 279         info = self._extract_media_info(
 280             base_url + '-mc_defaultQuality-h.json', webpage, video_id)
 281
 282         info.update({
 283             'title': title,
 284             'description': description,
 285         })
 286
 287         return info