_ Git - youtube-dl/blob - youtube_dl/extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .subtitles import SubtitlesInfoExtractor
   7
   8 from ..utils import (
   9     compat_str,
  10 )
  11
  12
  13 class TEDIE(SubtitlesInfoExtractor):
  14     _VALID_URL = r'''(?x)
  15         (?P<proto>https?://)
  16         (?P<type>www|embed)(?P<urlmain>\.ted\.com/
  17         (
  18             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  19             |
  20             ((?P<type_talk>talks)) # We have a simple talk
  21             |
  22             (?P<type_watch>watch)/[^/]+/[^/]+
  23         )
  24         (/lang/(.*?))? # The url may contain the language
  25         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  26         .*)$
  27         '''
  28     _TESTS = [{
  29         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  30         'md5': 'fc94ac279feebbce69f21c0c6ee82810',
  31         'info_dict': {
  32             'id': '102',
  33             'ext': 'mp4',
  34             'title': 'The illusion of consciousness',
  35             'description': ('Philosopher Dan Dennett makes a compelling '
  36                 'argument that not only don\'t we understand our own '
  37                 'consciousness, but that half the time our brains are '
  38                 'actively fooling us.'),
  39             'uploader': 'Dan Dennett',
  40             'width': 854,
  41         }
  42     }, {
  43         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  44         'md5': '226f4fb9c62380d11b7995efa4c87994',
  45         'info_dict': {
  46             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  47             'ext': 'mp4',
  48             'title': 'Vishal Sikka: The beauty and power of algorithms',
  49             'thumbnail': 're:^https?://.+\.jpg',
  50             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  51         }
  52     }, {
  53         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  54         'info_dict': {
  55             'id': '1972',
  56             'ext': 'mp4',
  57             'title': 'Be passionate. Be courageous. Be your best.',
  58             'uploader': 'Gabby Giffords and Mark Kelly',
  59             'description': 'md5:5174aed4d0f16021b704120360f72b92',
  60         },
  61     }]
  62
  63     _NATIVE_FORMATS = {
  64         'low': {'preference': 1, 'width': 320, 'height': 180},
  65         'medium': {'preference': 2, 'width': 512, 'height': 288},
  66         'high': {'preference': 3, 'width': 854, 'height': 480},
  67     }
  68
  69     def _extract_info(self, webpage):
  70         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
  71             webpage, 'info json')
  72         return json.loads(info_json)
  73
  74     def _real_extract(self, url):
  75         m = re.match(self._VALID_URL, url, re.VERBOSE)
  76         if m.group('type') == 'embed':
  77             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
  78             return self.url_result(desktop_url, 'TED')
  79         name = m.group('name')
  80         if m.group('type_talk'):
  81             return self._talk_info(url, name)
  82         elif m.group('type_watch'):
  83             return self._watch_info(url, name)
  84         else:
  85             return self._playlist_videos_info(url, name)
  86
  87     def _playlist_videos_info(self, url, name):
  88         '''Returns the videos of the playlist'''
  89
  90         webpage = self._download_webpage(url, name,
  91             'Downloading playlist webpage')
  92         info = self._extract_info(webpage)
  93         playlist_info = info['playlist']
  94
  95         playlist_entries = [
  96             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
  97             for talk in info['talks']
  98         ]
  99         return self.playlist_result(
 100             playlist_entries,
 101             playlist_id=compat_str(playlist_info['id']),
 102             playlist_title=playlist_info['title'])
 103
 104     def _talk_info(self, url, video_name):
 105         webpage = self._download_webpage(url, video_name)
 106         self.report_extraction(video_name)
 107
 108         talk_info = self._extract_info(webpage)['talks'][0]
 109
 110         formats = [{
 111             'url': format_url,
 112             'format_id': format_id,
 113             'format': format_id,
 114         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
 115         if formats:
 116             for f in formats:
 117                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
 118                 if finfo:
 119                     f.update(finfo)
 120         else:
 121             # Use rtmp downloads
 122             formats = [{
 123                 'format_id': f['name'],
 124                 'url': talk_info['streamer'],
 125                 'play_path': f['file'],
 126                 'ext': 'flv',
 127                 'width': f['width'],
 128                 'height': f['height'],
 129                 'tbr': f['bitrate'],
 130             } for f in talk_info['resources']['rtmp']]
 131         self._sort_formats(formats)
 132
 133         video_id = compat_str(talk_info['id'])
 134         # subtitles
 135         video_subtitles = self.extract_subtitles(video_id, talk_info)
 136         if self._downloader.params.get('listsubtitles', False):
 137             self._list_available_subtitles(video_id, talk_info)
 138             return
 139
 140         thumbnail = talk_info['thumb']
 141         if not thumbnail.startswith('http'):
 142             thumbnail = 'http://' + thumbnail
 143         return {
 144             'id': video_id,
 145             'title': talk_info['title'],
 146             'uploader': talk_info['speaker'],
 147             'thumbnail': thumbnail,
 148             'description': self._og_search_description(webpage),
 149             'subtitles': video_subtitles,
 150             'formats': formats,
 151         }
 152
 153     def _get_available_subtitles(self, video_id, talk_info):
 154         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 155         if languages:
 156             sub_lang_list = {}
 157             for l in languages:
 158                 url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
 159                 sub_lang_list[l] = url
 160             return sub_lang_list
 161         else:
 162             self._downloader.report_warning('video doesn\'t have subtitles')
 163             return {}
 164
 165     def _watch_info(self, url, name):
 166         webpage = self._download_webpage(url, name)
 167
 168         config_json = self._html_search_regex(
 169             r"data-config='([^']+)", webpage, 'config')
 170         config = json.loads(config_json)
 171         video_url = config['video']['url']
 172         thumbnail = config.get('image', {}).get('url')
 173
 174         title = self._html_search_regex(
 175             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 176         description = self._html_search_regex(
 177             [
 178                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 179                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
 180             ],
 181             webpage, 'description', fatal=False)
 182
 183         return {
 184             'id': name,
 185             'url': video_url,
 186             'title': title,
 187             'thumbnail': thumbnail,
 188             'description': description,
 189         }