Unify coding cookie

[youtube-dl] / youtube_dl / extractor / theplatform.py
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py

index 7a5a533b7473bc483e64915ed2065c12c9adbdc6..cfbf7f4e1562c78ea1d5ae44437694a5325eb70b 100644 (file)
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
  from __future__ import unicode_literals
  
  import re
@@ -9,16 +9,19 @@ import hashlib
  
  
  from .once import OnceIE
+from .adobepass import AdobePassIE
  from ..compat import (
      compat_parse_qs,
      compat_urllib_parse_urlparse,
  )
  from ..utils import (
+    determine_ext,
      ExtractorError,
      float_or_none,
      int_or_none,
      sanitized_Request,
      unsmuggle_url,
+    update_url_query,
      xpath_with_ns,
      mimetype2ext,
      find_xpath_attr,
@@ -48,25 +51,32 @@ class ThePlatformBaseIE(OnceIE):
              if OnceIE.suitable(_format['url']):
                  formats.extend(self._extract_once_formats(_format['url']))
              else:
+                media_url = _format['url']
+                if determine_ext(media_url) == 'm3u8':
+                    hdnea2 = self._get_cookies(media_url).get('hdnea2')
+                    if hdnea2:
+                        _format['url'] = update_url_query(media_url, {'hdnea3': hdnea2.value})
+
                  formats.append(_format)
  
          subtitles = self._parse_smil_subtitles(meta, default_ns)
  
          return formats, subtitles
  
-    def get_metadata(self, path, video_id):
+    def _download_theplatform_metadata(self, path, video_id):
          info_url = 'http://link.theplatform.com/s/%s?format=preview' % path
-        info = self._download_json(info_url, video_id)
+        return self._download_json(info_url, video_id)
  
+    def _parse_theplatform_metadata(self, info):
          subtitles = {}
          captions = info.get('captions')
          if isinstance(captions, list):
              for caption in captions:
                  lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type')
-                subtitles[lang] = [{
+                subtitles.setdefault(lang, []).append({
                      'ext': mimetype2ext(mime),
                      'url': src,
-                }]
+                })
  
          return {
              'title': info['title'],
@@ -78,11 +88,15 @@ class ThePlatformBaseIE(OnceIE):
              'uploader': info.get('billingCode'),
          }
  
+    def _extract_theplatform_metadata(self, path, video_id):
+        """Download the metadata for path and return it in parsed form."""
+        return self._parse_theplatform_metadata(self._download_theplatform_metadata(path, video_id))
+
  
-class ThePlatformIE(ThePlatformBaseIE):
+class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
      _VALID_URL = r'''(?x)
          (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/
-           (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
+           (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))?
           |theplatform:)(?P<id>[^/\?&]+)'''
  
      _TESTS = [{
@@ -102,6 +116,7 @@ class ThePlatformIE(ThePlatformBaseIE):
              # rtmp download
              'skip_download': True,
          },
+        'skip': '404 Not Found',
      }, {
          # from http://www.cnet.com/videos/tesla-model-s-a-second-step-towards-a-cleaner-motoring-future/
          'url': 'http://link.theplatform.com/s/kYEXFC/22d_qsQ6MIRT',
@@ -151,6 +166,22 @@ class ThePlatformIE(ThePlatformBaseIE):
          'only_matching': True,
      }]
  
+    @classmethod
+    def _extract_urls(cls, webpage):
+        """Return a list of player.theplatform.com URLs embedded in webpage, or None."""
+        m = re.search(
+            r'''(?x)
+                    <meta\s+
+                        property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+
+                        content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2
+            ''', webpage)
+        if m:
+            return [m.group('url')]
+        # No <meta> match: fall back to <iframe>/<script> src attributes (list, not tuple).
+        matches = re.findall(
+            r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+        return [url for _quote, url in matches] or None
+
      @staticmethod
      def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
          flags = '10' if include_qs else '00'
@@ -159,11 +190,11 @@ class ThePlatformIE(ThePlatformBaseIE):
          def str_to_hex(str):
              return binascii.b2a_hex(str.encode('ascii')).decode('ascii')
  
-        def hex_to_str(hex):
-            return binascii.a2b_hex(hex)
+        def hex_to_bytes(hex):
+            return binascii.a2b_hex(hex.encode('ascii'))
  
          relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1)
-        clear_text = hex_to_str(flags + expiration_date + str_to_hex(relative_path))
+        clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
          checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
          sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
          return '%s&sig=%s' % (url, sig)
@@ -241,7 +272,7 @@ class ThePlatformIE(ThePlatformBaseIE):
          formats, subtitles = self._extract_theplatform_smil(smil_url, video_id)
          self._sort_formats(formats)
  
-        ret = self.get_metadata(path, video_id)
+        ret = self._extract_theplatform_metadata(path, video_id)
          combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles)
          ret.update({
              'id': video_id,
@@ -253,9 +284,9 @@ class ThePlatformIE(ThePlatformBaseIE):
  
  
  class ThePlatformFeedIE(ThePlatformBaseIE):
-    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
-    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
-    _TEST = {
+    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&%s'
+    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*(?P<filter>by(?:Gui|I)d=(?P<id>[\w-]+))'
+    _TESTS = [{
          # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
          'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
          'md5': '6e32495b5073ab414471b615c5ded394',
@@ -269,33 +300,40 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
              'timestamp': 1391824260,
              'duration': 467.0,
              'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
+            'uploader': 'NBCU-NEWS',
          },
-    }
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        provider_id = mobj.group('provider_id')
-        feed_id = mobj.group('feed_id')
+    }]
  
-        real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, video_id)
-        feed = self._download_json(real_url, video_id)
-        entry = feed['entries'][0]
+    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
+        real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
+        entry = self._download_json(real_url, video_id)['entries'][0]
  
          formats = []
          subtitles = {}
          first_video_id = None
          duration = None
+        asset_types = []
          for item in entry['media$content']:
-            smil_url = item['plfile$url'] + '&mbr=true'
+            smil_url = item['plfile$url']
              cur_video_id = ThePlatformIE._match_id(smil_url)
              if first_video_id is None:
                  first_video_id = cur_video_id
                  duration = float_or_none(item.get('plfile$duration'))
-            cur_formats, cur_subtitles = self._extract_theplatform_smil(smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id)
-            formats.extend(cur_formats)
-            subtitles = self._merge_subtitles(subtitles, cur_subtitles)
+            for asset_type in item['plfile$assetTypes']:
+                if asset_type in asset_types:
+                    continue
+                asset_types.append(asset_type)
+                query = {
+                    'mbr': 'true',
+                    'formats': item['plfile$format'],
+                    'assetTypes': asset_type,
+                }
+                if asset_type in asset_types_query:
+                    query.update(asset_types_query[asset_type])
+                cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
+                    smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+                formats.extend(cur_formats)
+                subtitles = self._merge_subtitles(subtitles, cur_subtitles)
  
          self._sort_formats(formats)
  
@@ -308,7 +346,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
          timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
          categories = [item['media$name'] for item in entry.get('media$categories', [])]
  
-        ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id)
+        ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id)
          subtitles = self._merge_subtitles(subtitles, ret['subtitles'])
          ret.update({
              'id': video_id,
@@ -319,5 +357,17 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
              'timestamp': timestamp,
              'categories': categories,
          })
+        if custom_fields:
+            ret.update(custom_fields(entry))
  
          return ret
+
+    def _real_extract(self, url):
+        """Split a feed URL into its parts and delegate to _extract_feed_info."""
+        parts = re.match(self._VALID_URL, url)
+        # 'filter' is the whole "byGuid=..."/"byId=..." pair; 'id' is only its value.
+        provider_id, feed_id, filter_query, video_id = parts.group(
+            'provider_id', 'feed_id', 'filter', 'id')
+
+        return self._extract_feed_info(
+            provider_id, feed_id, filter_query, video_id)