[ard] Extract all formats

[youtube-dl] / youtube_dl / extractor / ard.py
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py

index ef94c72395723b31bd444e80b6ba12d990acf38b..55f940d57f523eab6fd84240b3cc2c1f7c4d4dab 100644 (file)
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -4,15 +4,16 @@ from __future__ import unicode_literals
  import re
  
  from .common import InfoExtractor
+from .generic import GenericIE
  from ..utils import (
      determine_ext,
      ExtractorError,
      qualities,
-    compat_urllib_parse_urlparse,
-    compat_urllib_parse,
      int_or_none,
      parse_duration,
      unified_strdate,
+    xpath_text,
+    parse_xml,
  )
  
  
@@ -22,13 +23,7 @@ class ARDMediathekIE(InfoExtractor):
  
      _TESTS = [{
          'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht',
-        'file': '22429276.mp4',
-        'md5': '469751912f1de0816a9fc9df8336476c',
-        'info_dict': {
-            'title': 'Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?',
-            'description': 'Das Erste Mediathek [ARD]: Vertrauen ist gut, Spionieren ist besser - Geht so deutsch-amerikanische Freundschaft?, Anne Will, Über die Spionage-Affäre diskutieren Clemens Binninger, Katrin Göring-Eckardt, Georg Mascolo, Andrew B. Denison und Constanze Kurz.. Das Video zur Sendung Anne Will am Mittwoch, 16.07.2014',
-        },
-        'skip': 'Blocked outside of Germany',
+        'only_matching': True,
      }, {
          'url': 'http://www.ardmediathek.de/tv/Tatort/Das-Wunder-von-Wolbeck-Video-tgl-ab-20/Das-Erste/Video?documentId=22490580&bcastId=602916',
          'info_dict': {
@@ -40,6 +35,87 @@ class ARDMediathekIE(InfoExtractor):
          'skip': 'Blocked outside of Germany',
      }]
  
+    def _extract_media_info(self, media_info_url, webpage, video_id):  # Download the media JSON and build a partial info dict: id, duration, thumbnail, formats, subtitles.
+        media_info = self._download_json(
+            media_info_url, video_id, 'Downloading media JSON')
+
+        formats = self._extract_formats(media_info, video_id)
+
+        if not formats:  # nothing extracted: raise a specific, expected error when the cause is recognisable
+            if '"fsk"' in webpage:  # the page text (not the JSON) is checked for the "fsk" marker
+                raise ExtractorError(
+                    'This video is only available after 20:00', expected=True)
+            elif media_info.get('_geoblocked'):  # geo flag comes from the media JSON
+                raise ExtractorError('This video is not available due to geo restriction', expected=True)
+
+        self._sort_formats(formats)  # NOTE(review): also reached with an empty list when neither check above matched - presumably _sort_formats reports that case; confirm
+
+        duration = int_or_none(media_info.get('_duration'))
+        thumbnail = media_info.get('_previewImage')
+
+        subtitles = {}  # stays empty when the JSON carries no '_subtitleUrl'
+        subtitle_url = media_info.get('_subtitleUrl')
+        if subtitle_url:
+            subtitles['de'] = [{  # the single subtitle URL is registered under 'de' and labelled 'srt'
+                'ext': 'srt',
+                'url': subtitle_url,
+            }]
+
+        return {  # no 'title' or 'description' here; callers are expected to add them (see info.update in _real_extract)
+            'id': video_id,
+            'duration': duration,
+            'thumbnail': thumbnail,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+    def _extract_formats(self, media_info, video_id):  # Return a list of format dicts built from every stream listed in media_info['_mediaArray'].
+        type_ = media_info.get('_type')
+        media_array = media_info.get('_mediaArray', [])
+        formats = []
+        for num, media in enumerate(media_array):  # num (position in _mediaArray) becomes part of the format_id
+            for stream in media.get('_mediaStreamArray', []):
+                stream_urls = stream.get('_stream')
+                if not stream_urls:  # stream entry without any URL: nothing to extract
+                    continue
+                if not isinstance(stream_urls, list):  # '_stream' may be a single value or a list; normalise to a list
+                    stream_urls = [stream_urls]
+                quality = stream.get('_quality')
+                server = stream.get('_server')
+                for stream_url in stream_urls:
+                    ext = determine_ext(stream_url)
+                    if ext == 'f4m':  # f4m manifest: delegate to the f4m helper, with fixed query parameters appended
+                        formats.extend(self._extract_f4m_formats(
+                            stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124',
+                            video_id, preference=-1, f4m_id='hds'))
+                    elif ext == 'm3u8':  # m3u8 playlist: delegate to the m3u8 helper
+                        formats.extend(self._extract_m3u8_formats(
+                            stream_url, video_id, 'mp4', preference=1, m3u8_id='hls'))
+                    else:  # direct stream: build the format dict by hand
+                        if server and server.startswith('rtmp'):  # RTMP: '_server' is the URL, the stream value is the play path
+                            f = {
+                                'url': server,
+                                'play_path': stream_url,
+                                'format_id': 'a%s-rtmp-%s' % (num, quality),
+                            }
+                        elif stream_url.startswith('http'):  # plain HTTP(S) download URL
+                            f = {
+                                'url': stream_url,
+                                'format_id': 'a%s-%s-%s' % (num, ext, quality)
+                            }
+                        else:  # neither RTMP nor HTTP: skip this URL
+                            continue
+                        m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url)  # dimensions are taken from a trailing _<width>x<height>.mp4 in the URL, when present
+                        if m:
+                            f.update({
+                                'width': int(m.group('width')),
+                                'height': int(m.group('height')),
+                            })
+                        if type_ == 'audio':  # media JSON '_type' of 'audio': flag the format as having no video codec
+                            f['vcodec'] = 'none'
+                        formats.append(f)
+        return formats  # unsorted; the visible caller (_extract_media_info) sorts afterwards
+
      def _real_extract(self, url):
          # determine video id from url
          m = re.match(self._VALID_URL, url)
@@ -50,14 +126,19 @@ class ARDMediathekIE(InfoExtractor):
          else:
              video_id = m.group('video_id')
  
-        urlp = compat_urllib_parse_urlparse(url)
-        url = urlp._replace(path=compat_urllib_parse.quote(urlp.path.encode('utf-8'))).geturl()
-
          webpage = self._download_webpage(url, video_id)
  
          if '>Der gewünschte Beitrag ist nicht mehr verfügbar.<' in webpage:
              raise ExtractorError('Video %s is no longer available' % video_id, expected=True)
  
+        if 'Diese Sendung ist für Jugendliche unter 12 Jahren nicht geeignet. Der Clip ist deshalb nur von 20 bis 6 Uhr verfügbar.' in webpage:
+            raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
+
+        if re.search(r'[\?&]rss($|[=&])', url):
+            doc = parse_xml(webpage)
+            if doc.tag == 'rss':
+                return GenericIE()._extract_rss(url, video_id, doc)
+
          title = self._html_search_regex(
              [r'<h1(?:\s+class="boxTopHeadline")?>(.*?)</h1>',
               r'<meta name="dcterms.title" content="(.*?)"/>',
@@ -92,46 +173,22 @@ class ARDMediathekIE(InfoExtractor):
                      'format_id': fid,
                      'url': furl,
                  })
+            self._sort_formats(formats)
+            info = {
+                'formats': formats,
+            }
          else:  # request JSON file
-            media_info = self._download_json(
-                'http://www.ardmediathek.de/play/media/%s' % video_id, video_id)
-            # The second element of the _mediaArray contains the standard http urls
-            streams = media_info['_mediaArray'][1]['_mediaStreamArray']
-            if not streams:
-                if '"fsk"' in webpage:
-                    raise ExtractorError('This video is only available after 20:00')
-
-            formats = []
-            for s in streams:
-                if type(s['_stream']) == list:
-                    for index, url in enumerate(s['_stream'][::-1]):
-                        quality = s['_quality'] + index
-                        formats.append({
-                            'quality': quality,
-                            'url': url,
-                            'format_id': '%s-%s' % (determine_ext(url), quality)
-                        })
-                    continue
-
-                format = {
-                    'quality': s['_quality'],
-                    'url': s['_stream'],
-                }
+            info = self._extract_media_info(
+                'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
  
-                format['format_id'] = '%s-%s' % (
-                    determine_ext(format['url']), format['quality'])
-
-                formats.append(format)
-
-        self._sort_formats(formats)
-
-        return {
+        info.update({
              'id': video_id,
              'title': title,
              'description': description,
-            'formats': formats,
              'thumbnail': thumbnail,
-        }
+        })
+
+        return info
  
  
  class ARDIE(InfoExtractor):
@@ -157,8 +214,9 @@ class ARDIE(InfoExtractor):
          player_url = mobj.group('mainurl') + '~playerXml.xml'
          doc = self._download_xml(player_url, display_id)
          video_node = doc.find('./video')
-        upload_date = unified_strdate(video_node.find('./broadcastDate').text)
-        thumbnail = video_node.find('.//teaserImage//variant/url').text
+        upload_date = unified_strdate(xpath_text(
+            video_node, './broadcastDate'))
+        thumbnail = xpath_text(video_node, './/teaserImage//variant/url')
  
          formats = []
          for a in video_node.findall('.//asset'):
@@ -188,4 +246,3 @@ class ARDIE(InfoExtractor):
              'upload_date': upload_date,
              'thumbnail': thumbnail,
          }
-