[youtube] Add ability to authenticate with cookies
[youtube-dl] / youtube_dl / extractor / theplatform.py
index 6febf805baa1bb65109462ae951e582007870bcd..b1a985ff6c12368347d98d95beed6a042e70093c 100644 (file)
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
@@ -33,7 +33,9 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
 
 class ThePlatformBaseIE(OnceIE):
     def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
-        meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'})
+        meta = self._download_xml(
+            smil_url, video_id, note=note, query={'format': 'SMIL'},
+            headers=self.geo_verification_headers())
         error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
         if error_element is not None and error_element.attrib['src'].startswith(
                 'http://link.theplatform.com/s/errorFiles/Unavailable.'):
@@ -78,14 +80,33 @@ class ThePlatformBaseIE(OnceIE):
                     'url': src,
                 })
 
+        duration = info.get('duration')
+        tp_chapters = info.get('chapters', [])
+        chapters = []
+        if tp_chapters:
+            def _add_chapter(start_time, end_time):
+                start_time = float_or_none(start_time, 1000)
+                end_time = float_or_none(end_time, 1000)
+                if start_time is None or end_time is None:
+                    return
+                chapters.append({
+                    'start_time': start_time,
+                    'end_time': end_time,
+                })
+
+            for chapter in tp_chapters[:-1]:
+                _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
+            _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+
         return {
             'title': info['title'],
             'subtitles': subtitles,
             'description': info['description'],
             'thumbnail': info['defaultThumbnailUrl'],
-            'duration': int_or_none(info.get('duration'), 1000),
+            'duration': float_or_none(duration, 1000),
             'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
             'uploader': info.get('billingCode'),
+            'chapters': chapters,
         }
 
     def _extract_theplatform_metadata(self, path, video_id):
@@ -154,7 +175,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
             'title': 'iPhone Siri’s sassy response to a math question has people talking',
             'description': 'md5:a565d1deadd5086f3331d57298ec6333',
             'duration': 83.0,
-            'thumbnail': 're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'timestamp': 1435752600,
             'upload_date': '20150701',
             'uploader': 'NBCU-NEWS',
@@ -177,10 +198,12 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
         if m:
             return [m.group('url')]
 
+        # Are whitesapces ignored in URLs?
+        # https://github.com/rg3/youtube-dl/issues/12044
         matches = re.findall(
-            r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
         if matches:
-            return list(zip(*matches))[1]
+            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
 
     @staticmethod
     def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
@@ -193,7 +216,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
         def hex_to_bytes(hex):
             return binascii.a2b_hex(hex.encode('ascii'))
 
-        relative_path = re.match(r'https?://link.theplatform.com/s/([^?]+)', url).group(1)
+        relative_path = re.match(r'https?://link\.theplatform\.com/s/([^?]+)', url).group(1)
         clear_text = hex_to_bytes(flags + expiration_date + str_to_hex(relative_path))
         checksum = hmac.new(sig_key.encode('ascii'), clear_text, hashlib.sha1).hexdigest()
         sig = flags + expiration_date + checksum + str_to_hex(sig_secret)
@@ -295,7 +318,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
             'ext': 'mp4',
             'title': 'The Biden factor: will Joe run in 2016?',
             'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
-            'thumbnail': 're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'upload_date': '20140208',
             'timestamp': 1391824260,
             'duration': 467.0,
@@ -304,9 +327,10 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
         },
     }]
 
-    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
+    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
         real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
         entry = self._download_json(real_url, video_id)['entries'][0]
+        main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None
 
         formats = []
         subtitles = {}
@@ -331,7 +355,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
                 if asset_type in asset_types_query:
                     query.update(asset_types_query[asset_type])
                 cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
-                    smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+                    main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
                 formats.extend(cur_formats)
                 subtitles = self._merge_subtitles(subtitles, cur_subtitles)