[youtube] Fix extraction.

[youtube-dl] / youtube_dl / extractor / nexx.py
diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py

index 6f40d7f89d9765b136d4fdf73d2a79dc260b98dc..586c1b7eb0197a3973edb3096c9457716f2bb982 100644 (file)
--- a/youtube_dl/extractor/nexx.py
+++ b/youtube_dl/extractor/nexx.py
@@ -29,14 +29,13 @@ class NexxIE(InfoExtractor):
      _TESTS = [{
          # movie
          'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
-        'md5': '828cea195be04e66057b846288295ba1',
+        'md5': '31899fd683de49ad46f4ee67e53e83fe',
          'info_dict': {
              'id': '128907',
              'ext': 'mp4',
              'title': 'Stiftung Warentest',
              'alt_title': 'Wie ein Test abläuft',
              'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
-            'release_year': 2013,
              'creator': 'SPIEGEL TV',
              'thumbnail': r're:^https?://.*\.jpg$',
              'duration': 2509,
@@ -62,6 +61,7 @@ class NexxIE(InfoExtractor):
          'params': {
              'skip_download': True,
          },
+        'skip': 'HTTP Error 404: Not Found',
      }, {
          # does not work via arc
          'url': 'nexx:741:1269984',
@@ -71,7 +71,6 @@ class NexxIE(InfoExtractor):
              'ext': 'mp4',
              'title': '1 TAG ohne KLO... wortwörtlich! 😑',
              'alt_title': '1 TAG ohne KLO... wortwörtlich! 😑',
-            'description': 'md5:4604539793c49eda9443ab5c5b1d612f',
              'thumbnail': r're:^https?://.*\.jpg$',
              'duration': 607,
              'timestamp': 1518614955,
@@ -109,7 +108,7 @@ class NexxIE(InfoExtractor):
      @staticmethod
      def _extract_domain_id(webpage):
          mobj = re.search(
-            r'<script\b[^>]+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)',
+            r'<script\b[^>]+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P<id>\d+)',
              webpage)
          return mobj.group('id') if mobj else None
  
@@ -124,7 +123,7 @@ class NexxIE(InfoExtractor):
          domain_id = NexxIE._extract_domain_id(webpage)
          if domain_id:
              for video_id in re.findall(
-                    r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)',
+                    r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)',
                      webpage):
                  entries.append(
                      'https://api.nexx.cloud/v3/%s/videos/byid/%s'
@@ -169,42 +168,54 @@ class NexxIE(InfoExtractor):
              ps += '/%s/%s' % (s[0:2], s[2:4])
          ps += '/%s/%s_' % (video_id, hash)
  
-        formats = [{
-            'url': 'http://%s%s2500_var.mp4' % (stream_data['cdnPathHTTP'], ps),
-            'format_id': '%s-http' % cdn,
-        }]
-
-        def make_url(root, protocol):
-            t = 'http://' + root + ps
-            fd = stream_data['azureFileDistribution'].split(',')
-            cdn_provider = stream_data['cdnProvider']
-
-            def p0(p):
-                return '_%s' % int(p[0]) if stream_data['applyAzureStructure'] == 1 else ''
-
-            if cdn_provider == 'ak':
-                t += ','
-                for i in fd:
-                    p = i.split(':')
-                    t += p[1] + p0(p) + ','
-                t += '.mp4.csmil/master.m3u8'
-            elif cdn_provider == 'ce':
-                k = t.split('/')
-                h = k.pop()
-                t = '/'.join(k)
-                t += '/asset.ism/manifest.' + ('m3u8' if protocol == 'hls' else 'mpd') + '?dcp_ver=aos4&videostream='
-                for i in fd:
-                    p = i.split(':')
-                    a = '%s%s%s.mp4:%s' % (h, p[1], p0(p), int(p[0]) * 1000)
-                    t += a + ','
-                t = t[:-1] + '&audiostream=' + a.split(':')[0]
-            return t
+        t = 'http://%s' + ps
+        fd = stream_data['azureFileDistribution'].split(',')
+        cdn_provider = stream_data['cdnProvider']
+
+        def p0(p):
+            return '_%s' % p if stream_data['applyAzureStructure'] == 1 else ''
+
+        formats = []
+        if cdn_provider == 'ak':
+            t += ','
+            for i in fd:
+                p = i.split(':')
+                t += p[1] + p0(int(p[0])) + ','
+            t += '.mp4.csmil/master.%s'
+        elif cdn_provider == 'ce':
+            k = t.split('/')
+            h = k.pop()
+            http_base = t = '/'.join(k)
+            http_base = http_base % stream_data['cdnPathHTTP']
+            t += '/asset.ism/manifest.%s?dcp_ver=aos4&videostream='
+            for i in fd:
+                p = i.split(':')
+                tbr = int(p[0])
+                filename = '%s%s%s.mp4' % (h, p[1], p0(tbr))
+                f = {
+                    'url': http_base + '/' + filename,
+                    'format_id': '%s-http-%d' % (cdn, tbr),
+                    'tbr': tbr,
+                }
+                width_height = p[1].split('x')
+                if len(width_height) == 2:
+                    f.update({
+                        'width': int_or_none(width_height[0]),
+                        'height': int_or_none(width_height[1]),
+                    })
+                formats.append(f)
+                a = filename + ':%s' % (tbr * 1000)
+                t += a + ','
+            t = t[:-1] + '&audiostream=' + a.split(':')[0]
+        else:
+            assert False
  
-        formats.extend(self._extract_mpd_formats(
-            make_url(stream_data['cdnPathDASH'], 'dash'), video_id,
-            mpd_id='%s-dash' % cdn, fatal=False))
+        if cdn_provider == 'ce':
+            formats.extend(self._extract_mpd_formats(
+                t % (stream_data['cdnPathDASH'], 'mpd'), video_id,
+                mpd_id='%s-dash' % cdn, fatal=False))
          formats.extend(self._extract_m3u8_formats(
-            make_url(stream_data['cdnPathHLS'], 'hls'), video_id, 'mp4',
+            t % (stream_data['cdnPathHLS'], 'm3u8'), video_id, 'mp4',
              entry_protocol='m3u8_native', m3u8_id='%s-hls' % cdn, fatal=False))
  
          return formats
@@ -284,13 +295,23 @@ class NexxIE(InfoExtractor):
  
          video = None
  
+        def find_video(result):
+            if isinstance(result, dict):
+                return result
+            elif isinstance(result, list):
+                vid = int(video_id)
+                for v in result:
+                    if try_get(v, lambda x: x['general']['ID'], int) == vid:
+                        return v
+            return None
+
          response = self._download_json(
              'https://arc.nexx.cloud/api/video/%s.json' % video_id,
              video_id, fatal=False)
          if response and isinstance(response, dict):
              result = response.get('result')
-            if result and isinstance(result, dict):
-                video = result
+            if result:
+                video = find_video(result)
  
          # not all videos work via arc, e.g. nexx:741:1269984
          if not video:
@@ -337,7 +358,7 @@ class NexxIE(InfoExtractor):
              request_token = hashlib.md5(
                  ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()
  
-            video = self._call_api(
+            result = self._call_api(
                  domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
                      'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
                      'addInteractionOptions': '1',
@@ -352,6 +373,7 @@ class NexxIE(InfoExtractor):
                      'X-Request-CID': cid,
                      'X-Request-Token': request_token,
                  })
+            video = find_video(result)
  
          general = video['general']
          title = general['title']
@@ -388,8 +410,8 @@ class NexxIE(InfoExtractor):
  
  
  class NexxEmbedIE(InfoExtractor):
-    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
-    _TEST = {
+    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)'
+    _TESTS = [{
          'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
          'md5': '16746bfc28c42049492385c989b26c4a',
          'info_dict': {
@@ -398,7 +420,6 @@ class NexxEmbedIE(InfoExtractor):
              'title': 'Nervenkitzel Achterbahn',
              'alt_title': 'Karussellbauer in Deutschland',
              'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
-            'release_year': 2005,
              'creator': 'SPIEGEL TV',
              'thumbnail': r're:^https?://.*\.jpg$',
              'duration': 2761,
@@ -409,7 +430,10 @@ class NexxEmbedIE(InfoExtractor):
              'format': 'bestvideo',
              'skip_download': True,
          },
-    }
+    }, {
+        'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7',
+        'only_matching': True,
+    }]
  
      @staticmethod
      def _extract_urls(webpage):