[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / ciscolive.py
index 32f64571379c9f3d9a810a7b065517dc2dfcf201..da404e4dc57b997834a61d217b39e84d497db173 100644 (file)
@@ -1,6 +1,8 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import itertools
+
 from .common import InfoExtractor
 from ..compat import (
     compat_parse_qs,
@@ -29,12 +31,12 @@ class CiscoLiveBaseIE(InfoExtractor):
         'rfWidgetId': RAINFOCUS_WIDGET_ID,
     }
 
-    def _call_api(self, ep, rf_id, query, referrer):
+    def _call_api(self, ep, rf_id, query, referrer, note=None):  # note is forwarded unchanged as note= to _download_json
         headers = self.HEADERS.copy()
         headers['Referer'] = referrer
         return self._download_json(
-            self.RAINFOCUS_API_URL % ep, rf_id, data=urlencode_postdata(query),
-            headers=headers)
+            self.RAINFOCUS_API_URL % ep, rf_id, note=note,
+            data=urlencode_postdata(query), headers=headers)  # query is run through urlencode_postdata and sent as data=
 
     def _parse_rf_item(self, rf_item):
         event_name = rf_item.get('eventName')
@@ -63,8 +65,8 @@ class CiscoLiveBaseIE(InfoExtractor):
 
 
 class CiscoLiveSessionIE(CiscoLiveBaseIE):
-    _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/\??[^#]*#/session/(?P<id>[^/?&]+)'
-    _TEST = {
+    _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/[^#]*#/session/(?P<id>[^/?&]+)'
+    _TESTS = [{
         'url': 'https://ciscolive.cisco.com/on-demand-library/?#/session/1423353499155001FoSs',
         'md5': 'c98acf395ed9c9f766941c70f5352e22',
         'info_dict': {
@@ -77,10 +79,13 @@ class CiscoLiveSessionIE(CiscoLiveBaseIE):
             'uploader_id': '5647924234001',
             'location': '16B Mezz.',
         },
-        'params': {
-            'proxy': '127.0.0.1:8118',
-        }
-    }
+    }, {
+        'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.event=ciscoliveemea2019#/session/15361595531500013WOU',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.ciscolive.com/global/on-demand-library.html?#/session/1490051371645001kNaS',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         rf_id = self._match_id(url)
@@ -89,19 +94,19 @@ class CiscoLiveSessionIE(CiscoLiveBaseIE):
 
 
 class CiscoLiveSearchIE(CiscoLiveBaseIE):
-    _VALID_URL = r'https?://ciscolive\.cisco\.com/on-demand-library/'
+    _VALID_URL = r'https?://(?:www\.)?ciscolive(?:\.cisco)?\.com/(?:global/)?on-demand-library(?:\.html|/)'
     _TESTS = [{
         'url': 'https://ciscolive.cisco.com/on-demand-library/?search.event=ciscoliveus2018&search.technicallevel=scpsSkillLevel_aintroductory&search.focus=scpsSessionFocus_designAndDeployment#/',
         'info_dict': {
-            'title': 'Filter query',
+            'title': 'Search query',
         },
         'playlist_count': 5,
-        'params': {
-            'proxy': '127.0.0.1:8118',
-        }
     }, {
         'url': 'https://ciscolive.cisco.com/on-demand-library/?search.technology=scpsTechnology_applicationDevelopment&search.technology=scpsTechnology_ipv6&search.focus=scpsSessionFocus_troubleshootingTroubleshooting#/',
         'only_matching': True,
+    }, {
+        'url': 'https://www.ciscolive.com/global/on-demand-library.html?search.technicallevel=scpsSkillLevel_aintroductory&search.event=ciscoliveemea2019&search.technology=scpsTechnology_dataCenter&search.focus=scpsSessionFocus_bestPractices#/',
+        'only_matching': True,
     }]
 
     @classmethod
@@ -112,15 +117,35 @@ class CiscoLiveSearchIE(CiscoLiveBaseIE):
     def _check_bc_id_exists(rf_item):
         return int_or_none(try_get(rf_item, lambda x: x['videos'][0]['url'])) is not None
 
+    def _entries(self, query, url):  # generator over search results; NOTE: mutates the caller's query dict ('size', 'from')
+        query['size'] = 50  # initial page size; replaced below when a response carries its own 'size'
+        query['from'] = 0  # offset of the first result requested
+        for page_num in itertools.count(1):
+            results = self._call_api(
+                'search', None, query, url,
+                'Downloading search JSON page %d' % page_num)
+            sl = try_get(results, lambda x: x['sectionList'][0], dict)
+            if sl:  # read from the first sectionList entry when there is one, otherwise from the top-level object
+                results = sl
+            items = results.get('items')
+            if not items or not isinstance(items, list):
+                break  # missing, empty or non-list 'items' ends pagination
+            for item in items:
+                if not isinstance(item, dict):
+                    continue
+                if not self._check_bc_id_exists(item):
+                    continue  # skip items rejected by _check_bc_id_exists
+                yield self._parse_rf_item(item)
+            size = int_or_none(results.get('size'))
+            if size is not None:
+                query['size'] = size
+            total = int_or_none(results.get('total'))
+            if total is not None and query['from'] + query['size'] >= total:
+                break  # this page already reaches 'total' (>= avoids one extra empty request on an exact fit)
+            query['from'] += query['size']
+
     def _real_extract(self, url):
-        rf_query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
-        rf_query['type'] = 'session'
-        rf_query['size'] = 1000
-        rf_results = self._call_api('search', None, rf_query, url)
-        entries = [
-            self._parse_rf_item(rf_item)
-            for rf_item
-            in rf_results['sectionList'][0]['items']
-            if self._check_bc_id_exists(rf_item)
-        ]
-        return self.playlist_result(entries, playlist_title='Filter query')
+        query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)  # search parameters are taken from the URL's query string
+        query['type'] = 'session'  # set on top of whatever the URL supplied
+        return self.playlist_result(
+            self._entries(query, url), playlist_title='Search query')  # _entries is a generator, so pages are requested as it is iterated