_ Git - youtube-dl/blob - youtube_dl/extractor/ted.py

   1 from __future__ import unicode_literals
   2
   3 import json
   4 import re
   5
   6 from .common import InfoExtractor
   7
   8 from ..compat import compat_str
   9 from ..utils import int_or_none
  10
  11
  12 class TEDIE(InfoExtractor):
  13     _VALID_URL = r'''(?x)
  14         (?P<proto>https?://)
  15         (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
  16         (
  17             (?P<type_playlist>playlists(?:/\d+)?) # We have a playlist
  18             |
  19             ((?P<type_talk>talks)) # We have a simple talk
  20             |
  21             (?P<type_watch>watch)/[^/]+/[^/]+
  22         )
  23         (/lang/(.*?))? # The url may contain the language
  24         /(?P<name>[\w-]+) # Here goes the name and then ".html"
  25         .*)$
  26         '''
  27     _TESTS = [{
  28         'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  29         'md5': 'fc94ac279feebbce69f21c0c6ee82810',
  30         'info_dict': {
  31             'id': '102',
  32             'ext': 'mp4',
  33             'title': 'The illusion of consciousness',
  34             'description': ('Philosopher Dan Dennett makes a compelling '
  35                             'argument that not only don\'t we understand our own '
  36                             'consciousness, but that half the time our brains are '
  37                             'actively fooling us.'),
  38             'uploader': 'Dan Dennett',
  39             'width': 854,
  40             'duration': 1308,
  41         }
  42     }, {
  43         'url': 'http://www.ted.com/watch/ted-institute/ted-bcg/vishal-sikka-the-beauty-and-power-of-algorithms',
  44         'md5': '226f4fb9c62380d11b7995efa4c87994',
  45         'info_dict': {
  46             'id': 'vishal-sikka-the-beauty-and-power-of-algorithms',
  47             'ext': 'mp4',
  48             'title': 'Vishal Sikka: The beauty and power of algorithms',
  49             'thumbnail': 're:^https?://.+\.jpg',
  50             'description': 'Adaptive, intelligent, and consistent, algorithms are emerging as the ultimate app for everything from matching consumers to products to assessing medical diagnoses. Vishal Sikka shares his appreciation for the algorithm, charting both its inherent beauty and its growing power.',
  51         }
  52     }, {
  53         'url': 'http://www.ted.com/talks/gabby_giffords_and_mark_kelly_be_passionate_be_courageous_be_your_best',
  54         'info_dict': {
  55             'id': '1972',
  56             'ext': 'mp4',
  57             'title': 'Be passionate. Be courageous. Be your best.',
  58             'uploader': 'Gabby Giffords and Mark Kelly',
  59             'description': 'md5:5174aed4d0f16021b704120360f72b92',
  60             'duration': 1128,
  61         },
  62     }, {
  63         'url': 'http://www.ted.com/playlists/who_are_the_hackers',
  64         'info_dict': {
  65             'id': '10',
  66             'title': 'Who are the hackers?',
  67         },
  68         'playlist_mincount': 6,
  69     }, {
  70         # contains a youtube video
  71         'url': 'https://www.ted.com/talks/douglas_adams_parrots_the_universe_and_everything',
  72         'add_ie': ['Youtube'],
  73         'info_dict': {
  74             'id': '_ZG8HBuDjgc',
  75             'ext': 'mp4',
  76             'title': 'Douglas Adams: Parrots the Universe and Everything',
  77             'description': 'md5:01ad1e199c49ac640cb1196c0e9016af',
  78             'uploader': 'University of California Television (UCTV)',
  79             'uploader_id': 'UCtelevision',
  80             'upload_date': '20080522',
  81         },
  82         'params': {
  83             'skip_download': True,
  84         },
  85     }, {
  86         # YouTube video
  87         'url': 'http://www.ted.com/talks/jeffrey_kluger_the_sibling_bond',
  88         'add_ie': ['Youtube'],
  89         'info_dict': {
  90             'id': 'aFBIPO-P7LM',
  91             'ext': 'mp4',
  92             'title': 'The hidden power of siblings: Jeff Kluger at TEDxAsheville',
  93             'description': 'md5:3d7a4f50d95ca5dd67104e2a20f43fe1',
  94             'uploader': 'TEDx Talks',
  95             'uploader_id': 'TEDxTalks',
  96             'upload_date': '20111216',
  97         },
  98         'params': {
  99             'skip_download': True,
 100         },
 101     }]
 102
 103     _NATIVE_FORMATS = {
 104         'low': {'preference': 1, 'width': 320, 'height': 180},
 105         'medium': {'preference': 2, 'width': 512, 'height': 288},
 106         'high': {'preference': 3, 'width': 854, 'height': 480},
 107     }
 108
 109     def _extract_info(self, webpage):
 110         info_json = self._search_regex(r'q\("\w+.init",({.+})\)</script>',
 111                                        webpage, 'info json')
 112         return json.loads(info_json)
 113
 114     def _real_extract(self, url):
 115         m = re.match(self._VALID_URL, url, re.VERBOSE)
 116         if m.group('type').startswith('embed'):
 117             desktop_url = m.group('proto') + 'www' + m.group('urlmain')
 118             return self.url_result(desktop_url, 'TED')
 119         name = m.group('name')
 120         if m.group('type_talk'):
 121             return self._talk_info(url, name)
 122         elif m.group('type_watch'):
 123             return self._watch_info(url, name)
 124         else:
 125             return self._playlist_videos_info(url, name)
 126
 127     def _playlist_videos_info(self, url, name):
 128         '''Returns the videos of the playlist'''
 129
 130         webpage = self._download_webpage(url, name,
 131                                          'Downloading playlist webpage')
 132         info = self._extract_info(webpage)
 133         playlist_info = info['playlist']
 134
 135         playlist_entries = [
 136             self.url_result('http://www.ted.com/talks/' + talk['slug'], self.ie_key())
 137             for talk in info['talks']
 138         ]
 139         return self.playlist_result(
 140             playlist_entries,
 141             playlist_id=compat_str(playlist_info['id']),
 142             playlist_title=playlist_info['title'])
 143
 144     def _talk_info(self, url, video_name):
 145         webpage = self._download_webpage(url, video_name)
 146         self.report_extraction(video_name)
 147
 148         talk_info = self._extract_info(webpage)['talks'][0]
 149
 150         external = talk_info.get('external')
 151         if external:
 152             service = external['service']
 153             self.to_screen('Found video from %s' % service)
 154             ext_url = None
 155             if service.lower() == 'youtube':
 156                 ext_url = external.get('code')
 157             return {
 158                 '_type': 'url',
 159                 'url': ext_url or external['uri'],
 160             }
 161
 162         formats = [{
 163             'url': format_url,
 164             'format_id': format_id,
 165             'format': format_id,
 166         } for (format_id, format_url) in talk_info['nativeDownloads'].items() if format_url is not None]
 167         if formats:
 168             for f in formats:
 169                 finfo = self._NATIVE_FORMATS.get(f['format_id'])
 170                 if finfo:
 171                     f.update(finfo)
 172
 173         for format_id, resources in talk_info['resources'].items():
 174             if format_id == 'h264':
 175                 for resource in resources:
 176                     bitrate = int_or_none(resource.get('bitrate'))
 177                     formats.append({
 178                         'url': resource['file'],
 179                         'format_id': '%s-%sk' % (format_id, bitrate),
 180                         'tbr': bitrate,
 181                     })
 182             elif format_id == 'rtmp':
 183                 streamer = talk_info.get('streamer')
 184                 if not streamer:
 185                     continue
 186                 for resource in resources:
 187                     formats.append({
 188                         'format_id': '%s-%s' % (format_id, resource.get('name')),
 189                         'url': streamer,
 190                         'play_path': resource['file'],
 191                         'ext': 'flv',
 192                         'width': int_or_none(resource.get('width')),
 193                         'height': int_or_none(resource.get('height')),
 194                         'tbr': int_or_none(resource.get('bitrate')),
 195                     })
 196             elif format_id == 'hls':
 197                 formats.extend(self._extract_m3u8_formats(
 198                     resources.get('stream'), video_name, 'mp4', m3u8_id=format_id))
 199
 200         audio_download = talk_info.get('audioDownload')
 201         if audio_download:
 202             formats.append({
 203                 'url': audio_download,
 204                 'format_id': 'audio',
 205             })
 206
 207         self._sort_formats(formats)
 208
 209         video_id = compat_str(talk_info['id'])
 210
 211         thumbnail = talk_info['thumb']
 212         if not thumbnail.startswith('http'):
 213             thumbnail = 'http://' + thumbnail
 214         return {
 215             'id': video_id,
 216             'title': talk_info['title'].strip(),
 217             'uploader': talk_info['speaker'],
 218             'thumbnail': thumbnail,
 219             'description': self._og_search_description(webpage),
 220             'subtitles': self._get_subtitles(video_id, talk_info),
 221             'formats': formats,
 222             'duration': talk_info.get('duration'),
 223         }
 224
 225     def _get_subtitles(self, video_id, talk_info):
 226         languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
 227         if languages:
 228             sub_lang_list = {}
 229             for l in languages:
 230                 sub_lang_list[l] = [
 231                     {
 232                         'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
 233                         'ext': ext,
 234                     }
 235                     for ext in ['ted', 'srt']
 236                 ]
 237             return sub_lang_list
 238         else:
 239             return {}
 240
 241     def _watch_info(self, url, name):
 242         webpage = self._download_webpage(url, name)
 243
 244         config_json = self._html_search_regex(
 245             r'"pages\.jwplayer"\s*,\s*({.+?})\s*\)\s*</script>',
 246             webpage, 'config')
 247         config = json.loads(config_json)['config']
 248         video_url = config['video']['url']
 249         thumbnail = config.get('image', {}).get('url')
 250
 251         title = self._html_search_regex(
 252             r"(?s)<h1(?:\s+class='[^']+')?>(.+?)</h1>", webpage, 'title')
 253         description = self._html_search_regex(
 254             [
 255                 r'(?s)<h4 class="[^"]+" id="h3--about-this-talk">.*?</h4>(.*?)</div>',
 256                 r'(?s)<p><strong>About this talk:</strong>\s+(.*?)</p>',
 257             ],
 258             webpage, 'description', fatal=False)
 259
 260         return {
 261             'id': name,
 262             'url': video_url,
 263             'title': title,
 264             'thumbnail': thumbnail,
 265             'description': description,
 266         }