_ Git - youtube-dl/blob - youtube_dl/extractor/discovery.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..utils import (
   7     parse_duration,
   8     parse_iso8601,
   9 )
  10 from ..compat import (
  11     compat_str,
  12     compat_urlparse,
  13 )
  14
  15
  16 class DiscoveryIE(InfoExtractor):
  17     _VALID_URL = r'''(?x)https?://(?:www\.)?(?:
  18             discovery|
  19             investigationdiscovery|
  20             discoverylife|
  21             animalplanet|
  22             ahctv|
  23             destinationamerica|
  24             sciencechannel|
  25             tlc|
  26             velocity
  27         )\.com/(?:[^/]+/)*(?P<id>[^./?#]+)'''
  28     _TESTS = [{
  29         'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mission-impossible-outtakes.htm',
  30         'info_dict': {
  31             'id': '20769',
  32             'ext': 'mp4',
  33             'title': 'Mission Impossible Outtakes',
  34             'description': ('Watch Jamie Hyneman and Adam Savage practice being'
  35                             ' each other -- to the point of confusing Jamie\'s dog -- and '
  36                             'don\'t miss Adam moon-walking as Jamie ... behind Jamie\'s'
  37                             ' back.'),
  38             'duration': 156,
  39             'timestamp': 1302032462,
  40             'upload_date': '20110405',
  41         },
  42         'params': {
  43             'skip_download': True,  # requires ffmpeg
  44         }
  45     }, {
  46         'url': 'http://www.discovery.com/tv-shows/mythbusters/videos/mythbusters-the-simpsons',
  47         'info_dict': {
  48             'id': 'mythbusters-the-simpsons',
  49             'title': 'MythBusters: The Simpsons',
  50         },
  51         'playlist_mincount': 10,
  52     }, {
  53         'url': 'http://www.animalplanet.com/longfin-eels-maneaters/',
  54         'info_dict': {
  55             'id': '78326',
  56             'ext': 'mp4',
  57             'title': 'Longfin Eels: Maneaters?',
  58             'description': 'Jeremy Wade tests whether or not New Zealand\'s longfin eels are man-eaters by covering himself in fish guts and getting in the water with them.',
  59             'upload_date': '20140725',
  60             'timestamp': 1406246400,
  61             'duration': 116,
  62         },
  63     }]
  64
  65     def _real_extract(self, url):
  66         display_id = self._match_id(url)
  67         info = self._download_json(url + '?flat=1', display_id)
  68
  69         video_title = info.get('playlist_title') or info.get('video_title')
  70
  71         entries = []
  72
  73         for idx, video_info in enumerate(info['playlist']):
  74             m3u8_url = video_info['src']
  75             formats = m3u8_formats = self._extract_m3u8_formats(
  76                 m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls',
  77                 note='Download m3u8 information for video %d' % (idx + 1))
  78             qualities_basename = self._search_regex(
  79                 '/([^/]+)\.csmil/', m3u8_url, 'qualities basename', default=None)
  80             if qualities_basename:
  81                 m3u8_path = compat_urlparse.urlparse(m3u8_url).path
  82                 QUALITIES_RE = r'((,\d+k)+,?)'
  83                 qualities = self._search_regex(
  84                     QUALITIES_RE, qualities_basename,
  85                     'qualities', default=None)
  86                 if qualities:
  87                     qualities = list(map(lambda q: int(q[:-1]), qualities.strip(',').split(',')))
  88                     qualities.sort()
  89                     http_path = m3u8_path[1:].split('/', 1)[1]
  90                     http_template = re.sub(QUALITIES_RE, r'%dk', http_path)
  91                     http_template = http_template.replace('.csmil/master.m3u8', '')
  92                     http_template = compat_urlparse.urljoin(
  93                         'http://discsmil.edgesuite.net/', http_template)
  94                     if m3u8_formats:
  95                         self._sort_formats(m3u8_formats)
  96                         m3u8_formats = list(filter(
  97                             lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
  98                             m3u8_formats))
  99                     if len(qualities) == len(m3u8_formats):
 100                         for q, m3u8_format in zip(qualities, m3u8_formats):
 101                             f = m3u8_format.copy()
 102                             f.update({
 103                                 'url': http_template % q,
 104                                 'format_id': f['format_id'].replace('hls', 'http'),
 105                                 'protocol': 'http',
 106                             })
 107                             formats.append(f)
 108                     else:
 109                         for q in qualities:
 110                             formats.append({
 111                                 'url': http_template % q,
 112                                 'ext': 'mp4',
 113                                 'format_id': 'http-%d' % q,
 114                                 'tbr': q,
 115                             })
 116             self._sort_formats(formats)
 117
 118             subtitles = []
 119             caption_url = video_info.get('captionsUrl')
 120             if caption_url:
 121                 subtitles = {
 122                     'en': [{
 123                         'url': caption_url,
 124                     }]
 125                 }
 126
 127             entries.append({
 128                 'id': compat_str(video_info['id']),
 129                 'formats': formats,
 130                 'title': video_info['title'],
 131                 'description': video_info.get('description'),
 132                 'duration': parse_duration(video_info.get('video_length')),
 133                 'webpage_url': video_info.get('href') or video_info.get('url'),
 134                 'thumbnail': video_info.get('thumbnailURL'),
 135                 'alt_title': video_info.get('secondary_title'),
 136                 'timestamp': parse_iso8601(video_info.get('publishedDate')),
 137                 'subtitles': subtitles,
 138             })
 139
 140         return self.playlist_result(entries, display_id, video_title)