[youtube] Extract chapters

[youtube-dl] / youtube_dl / extractor / theplatform.py
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py

index 6febf805baa1bb65109462ae951e582007870bcd..9a424b1c6aeb089af8050d7eee6b29591968c3aa 100644 (file)
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -1,4 +1,4 @@
-# -*- coding: utf-8 -*-
+# coding: utf-8
  from __future__ import unicode_literals
  
  import re
@@ -33,7 +33,9 @@ _x = lambda p: xpath_with_ns(p, {'smil': default_ns})
  
  class ThePlatformBaseIE(OnceIE):
      def _extract_theplatform_smil(self, smil_url, video_id, note='Downloading SMIL data'):
-        meta = self._download_xml(smil_url, video_id, note=note, query={'format': 'SMIL'})
+        meta = self._download_xml(
+            smil_url, video_id, note=note, query={'format': 'SMIL'},
+            headers=self.geo_verification_headers())
          error_element = find_xpath_attr(meta, _x('.//smil:ref'), 'src')
          if error_element is not None and error_element.attrib['src'].startswith(
                  'http://link.theplatform.com/s/errorFiles/Unavailable.'):
@@ -154,7 +156,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
              'title': 'iPhone Siri’s sassy response to a math question has people talking',
              'description': 'md5:a565d1deadd5086f3331d57298ec6333',
              'duration': 83.0,
-            'thumbnail': 're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.jpg$',
              'timestamp': 1435752600,
              'upload_date': '20150701',
              'uploader': 'NBCU-NEWS',
@@ -177,10 +179,12 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
          if m:
              return [m.group('url')]
  
+        # Are whitespaces ignored in URLs?
+        # https://github.com/rg3/youtube-dl/issues/12044
          matches = re.findall(
-            r'<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
+            r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
          if matches:
-            return list(zip(*matches))[1]
+            return [re.sub(r'\s', '', list(zip(*matches))[1][0])]
  
      @staticmethod
      def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False):
@@ -295,7 +299,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
              'ext': 'mp4',
              'title': 'The Biden factor: will Joe run in 2016?',
              'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
-            'thumbnail': 're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.jpg$',
              'upload_date': '20140208',
              'timestamp': 1391824260,
              'duration': 467.0,
@@ -304,9 +308,10 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
          },
      }]
  
-    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}):
+    def _extract_feed_info(self, provider_id, feed_id, filter_query, video_id, custom_fields=None, asset_types_query={}, account_id=None):
          real_url = self._URL_TEMPLATE % (self.http_scheme(), provider_id, feed_id, filter_query)
          entry = self._download_json(real_url, video_id)['entries'][0]
+        main_smil_url = 'http://link.theplatform.com/s/%s/media/guid/%d/%s' % (provider_id, account_id, entry['guid']) if account_id else None
  
          formats = []
          subtitles = {}
@@ -331,7 +336,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE):
                  if asset_type in asset_types_query:
                      query.update(asset_types_query[asset_type])
                  cur_formats, cur_subtitles = self._extract_theplatform_smil(update_url_query(
-                    smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
+                    main_smil_url or smil_url, query), video_id, 'Downloading SMIL data for %s' % asset_type)
                  formats.extend(cur_formats)
                  subtitles = self._merge_subtitles(subtitles, cur_subtitles)