[youtube] Fix extraction.

[youtube-dl] / youtube_dl / extractor / cbc.py
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py

index ce8e3d346b56f4b3b48b789723324d06607b2ea6..fd5ec6033b80513012cf2615fc56e80c7e82cadc 100644 (file)
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -1,8 +1,10 @@
  # coding: utf-8
  from __future__ import unicode_literals
  
+import hashlib
  import json
  import re
+from xml.sax.saxutils import escape
  
  from .common import InfoExtractor
  from ..compat import (
@@ -17,6 +19,7 @@ from ..utils import (
      xpath_element,
      xpath_with_ns,
      find_xpath_attr,
+    orderedSet,
      parse_duration,
      parse_iso8601,
      parse_age_limit,
@@ -136,9 +139,15 @@ class CBCIE(InfoExtractor):
          entries = [
              self._extract_player_init(player_init, display_id)
              for player_init in re.findall(r'CBC\.APP\.Caffeine\.initInstance\(({.+?})\);', webpage)]
+        media_ids = []
+        for media_id_re in (
+                r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"',
+                r'<div[^>]+\bid=["\']player-(\d+)',
+                r'guid["\']\s*:\s*["\'](\d+)'):
+            media_ids.extend(re.findall(media_id_re, webpage))
          entries.extend([
              self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id)
-            for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)])
+            for media_id in orderedSet(media_ids)])
          return self.playlist_result(
              entries, display_id, strip_or_none(title),
              self._og_search_description(webpage))
@@ -209,6 +218,29 @@ class CBCWatchBaseIE(InfoExtractor):
          'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
      }
      _GEO_COUNTRIES = ['CA']
+    _LOGIN_URL = 'https://api.loginradius.com/identity/v2/auth/login'
+    _TOKEN_URL = 'https://cloud-api.loginradius.com/sso/jwt/api/token'
+    _API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
+    _NETRC_MACHINE = 'cbcwatch'
+
+    def _signature(self, email, password):  # log in with email/password, then return the 'signature' field of the _TOKEN_URL response
+        data = json.dumps({
+            'email': email,
+            'password': password,
+        }).encode()
+        headers = {'content-type': 'application/json'}
+        query = {'apikey': self._API_KEY}
+        resp = self._download_json(self._LOGIN_URL, None, data=data, headers=headers, query=query)
+        access_token = resp['access_token']
+
+        # second request: pass the access token from the login response to _TOKEN_URL
+        query = {
+            'access_token': access_token,
+            'apikey': self._API_KEY,
+            'jwtapp': 'jwt',
+        }
+        resp = self._download_json(self._TOKEN_URL, None, headers=headers, query=query)
+        return resp['signature']
  
      def _call_api(self, path, video_id):
          url = path if path.startswith('http') else self._API_BASE_URL + path
@@ -232,7 +264,8 @@ class CBCWatchBaseIE(InfoExtractor):
      def _real_initialize(self):
          if self._valid_device_token():
              return
-        device = self._downloader.cache.load('cbcwatch', 'device') or {}
+        device = self._downloader.cache.load(
+            'cbcwatch', self._cache_device_key()) or {}
          self._device_id, self._device_token = device.get('id'), device.get('token')
          if self._valid_device_token():
              return
@@ -241,16 +274,30 @@ class CBCWatchBaseIE(InfoExtractor):
      def _valid_device_token(self):
          return self._device_id and self._device_token
  
+    def _cache_device_key(self):  # cache entry name: '<sha256 hex digest of the login email>_device', or plain 'device' when the email is falsy
+        email, _ = self._get_login_info()
+        return '%s_device' % hashlib.sha256(email.encode()).hexdigest() if email else 'device'
+
      def _register_device(self):
-        self._device_id = self._device_token = None
          result = self._download_xml(
              self._API_BASE_URL + 'device/register',
              None, 'Acquiring device token',
              data=b'<device><type>web</type></device>')
          self._device_id = xpath_text(result, 'deviceId', fatal=True)
-        self._device_token = xpath_text(result, 'deviceToken', fatal=True)
+        email, password = self._get_login_info()
+        if email and password:  # credentials present: send the login signature plus the new device id to device/login and use the returned 'token'
+            signature = self._signature(email, password)
+            data = '<login><token>{0}</token><device><deviceId>{1}</deviceId><type>web</type></device></login>'.format(
+                escape(signature), escape(self._device_id)).encode()
+            url = self._API_BASE_URL + 'device/login'
+            result = self._download_xml(
+                url, None, data=data,
+                headers={'content-type': 'application/xml'})
+            self._device_token = xpath_text(result, 'token', fatal=True)
+        else:  # no credentials: keep the 'deviceToken' from the device/register response
+            self._device_token = xpath_text(result, 'deviceToken', fatal=True)
          self._downloader.cache.store(
-            'cbcwatch', 'device', {
+            'cbcwatch', self._cache_device_key(), {
                  'id': self._device_id,
                  'token': self._device_token,
              })
@@ -353,7 +400,7 @@ class CBCWatchVideoIE(CBCWatchBaseIE):
  
  class CBCWatchIE(CBCWatchBaseIE):
      IE_NAME = 'cbc.ca:watch'
-    _VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
+    _VALID_URL = r'https?://(?:gem|watch)\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
      _TESTS = [{
          # geo-restricted to Canada, bypassable
          'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
@@ -379,6 +426,9 @@ class CBCWatchIE(CBCWatchBaseIE):
              'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
          },
          'playlist_mincount': 30,
+    }, {
+        'url': 'https://gem.cbc.ca/media/this-hour-has-22-minutes/season-26/episode-20/38e815a-0108c6c6a42',
+        'only_matching': True,
      }]
  
      def _real_extract(self, url):