Merge branch 'subtitles-rework'
[youtube-dl] / youtube_dl / extractor / ted.py
index d5e28efada55a91a480ce031df0bc2774de2ccc6..4cec06f8bd6e2a18ac3062e916225746f5153c93 100644 (file)
@@ -3,17 +3,17 @@ from __future__ import unicode_literals
 import json
 import re
 
-from .subtitles import SubtitlesInfoExtractor
+from .common import InfoExtractor
 
-from ..utils import (
+from ..compat import (
     compat_str,
 )
 
 
-class TEDIE(SubtitlesInfoExtractor):
+class TEDIE(InfoExtractor):
     _VALID_URL = r'''(?x)
         (?P<proto>https?://)
-        (?P<type>www|embed)(?P<urlmain>\.ted\.com/
+        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
         (
             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
             |
@@ -33,11 +33,12 @@ class TEDIE(SubtitlesInfoExtractor):
             'ext': 'mp4',
             'title': 'The illusion of consciousness',
             'description': ('Philosopher Dan Dennett makes a compelling '
-                'argument that not only don\'t we understand our own '
-                'consciousness, but that half the time our brains are '
-                'actively fooling us.'),
+                            'argument that not only don\'t we understand our own '
+                            'consciousness, but that half the time our brains are '
+                            'actively fooling us.'),
             'uploader': 'Dan Dennett',
             'width': 854,
+            'duration': 1308,
         }
     }, {
         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
@@ -57,6 +58,7 @@ class TEDIE(SubtitlesInfoExtractor):
             'title': 'Be passionate. Be courageous. Be your best.',
             'uploader': 'Gabby Giffords and Mark Kelly',
             'description': 'md5:5174aed4d0f16021b704120360f72b92',
+            'duration': 1128,
         },
     }, {
         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
@@ -65,6 +67,38 @@ class TEDIE(SubtitlesInfoExtractor):
             'title': 'Who are the hackers?',
         },
         'playlist_mincount': 6,
+    }, {
+        # contains a youtube video
+        'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': '_ZG8HBuDjgc',
+            'ext': 'mp4',
+            'title': 'Douglas Adams: Parrots the Universe and Everything',
+            'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
+            'uploader': 'University of California Television (UCTV)',
+            'uploader_id': 'UCtelevision',
+            'upload_date': '20080522',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        # YouTube video
+        'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
+        'add_ie': ['Youtube'],
+        'info_dict': {
+            'id': 'aFBIPO-P7LM',
+            'ext': 'mp4',
+            'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
+            'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
+            'uploader': 'TEDx Talks',
+            'uploader_id': 'TEDxTalks',
+            'upload_date': '20111216',
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     _NATIVE_FORMATS = {
@@ -75,12 +109,12 @@ class TEDIE(SubtitlesInfoExtractor):
 
     def _extract_info(self, webpage):
         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
-            webpage, 'info json')
+                                       webpage, 'info json')
         return json.loads(info_json)
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url, re.VERBOSE)
-        if m.group('type') == 'embed':
+        if m.group('type').startswith('embed'):
             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
             return self.url_result(desktop_url, 'TED')
         name = m.group('name')
@@ -95,7 +129,7 @@ class TEDIE(SubtitlesInfoExtractor):
         '''Returns the videos of the playlist'''
 
         webpage = self._download_webpage(url, name,
-            'Downloading playlist webpage')
+                                         'Downloading playlist webpage')
         info = self._extract_info(webpage)
         playlist_info = info['playlist']
 
@@ -114,6 +148,18 @@ class TEDIE(SubtitlesInfoExtractor):
 
         talk_info = self._extract_info(webpage)['talks'][0]
 
+        external = talk_info.get('external')
+        if external:
+            service = external['service']
+            self.to_screen('Found video from %s' % service)
+            ext_url = None
+            if service.lower() == 'youtube':
+                ext_url = external.get('code')
+            return {
+                '_type': 'url',
+                'url': ext_url or external['uri'],
+            }
+
         formats = [{
             'url': format_url,
             'format_id': format_id,
@@ -138,11 +184,6 @@ class TEDIE(SubtitlesInfoExtractor):
         self._sort_formats(formats)
 
         video_id = compat_str(talk_info['id'])
-        # subtitles
-        video_subtitles = self.extract_subtitles(video_id, talk_info)
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, talk_info)
-            return
 
         thumbnail = talk_info['thumb']
         if not thumbnail.startswith('http'):
@@ -153,28 +194,34 @@ class TEDIE(SubtitlesInfoExtractor):
             'uploader': talk_info['speaker'],
             'thumbnail': thumbnail,
             'description': self._og_search_description(webpage),
-            'subtitles': video_subtitles,
+            'subtitles': self._get_subtitles(video_id, talk_info),
             'formats': formats,
+            'duration': talk_info.get('duration'),
         }
 
-    def _get_available_subtitles(self, video_id, talk_info):
+    def _get_subtitles(self, video_id, talk_info):
         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
         if languages:
             sub_lang_list = {}
             for l in languages:
-                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
-                sub_lang_list[l] = url
+                sub_lang_list[l] = [
+                    {
+                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
+                        'ext': ext,
+                    }
+                    for ext in ['ted', 'srt']
+                ]
             return sub_lang_list
         else:
-            self._downloader.report_warning('video doesn\'t have subtitles')
             return {}
 
     def _watch_info(self, url, name):
         webpage = self._download_webpage(url, name)
 
         config_json = self._html_search_regex(
-            r"data-config='([^']+)", webpage, 'config')
-        config = json.loads(config_json)
+            r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
+            webpage, 'config')
+        config = json.loads(config_json)['config']
         video_url = config['video']['url']
         thumbnail = config.get('image', {}).get('url')