_ Git - youtube-dl/blob - youtube_dl/extractor/comcarcoff.py

   1 # encoding: utf-8
   2 from __future__ import unicode_literals
   3
   4 from .common import InfoExtractor
   5 from ..compat import compat_str
   6 from ..utils import (
   7     int_or_none,
   8     parse_duration,
   9     parse_iso8601,
  10 )
  11
  12
  13 class ComCarCoffIE(InfoExtractor):
  14     _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)'
  15     _TESTS = [{
  16         'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/',
  17         'info_dict': {
  18             'id': '2494164',
  19             'ext': 'mp4',
  20             'upload_date': '20141127',
  21             'timestamp': 1417107600,
  22             'duration': 1232,
  23             'title': 'Happy Thanksgiving Miranda',
  24             'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',
  25         },
  26         'params': {
  27             'skip_download': 'requires ffmpeg',
  28         }
  29     }]
  30
  31     def _real_extract(self, url):
  32         display_id = self._match_id(url)
  33         if not display_id:
  34             display_id = 'comediansincarsgettingcoffee.com'
  35         webpage = self._download_webpage(url, display_id)
  36
  37         full_data = self._parse_json(
  38             self._search_regex(
  39                 r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'),
  40             display_id)['videoData']
  41
  42         display_id = full_data['activeVideo']['video']
  43         video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id]
  44
  45         video_id = compat_str(video_data['mediaId'])
  46         title = video_data['title']
  47         formats = self._extract_m3u8_formats(
  48             video_data['mediaUrl'], video_id, 'mp4')
  49         self._sort_formats(formats)
  50
  51         thumbnails = [{
  52             'url': video_data['images']['thumb'],
  53         }, {
  54             'url': video_data['images']['poster'],
  55         }]
  56
  57         timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601(
  58             video_data.get('pubDate'))
  59         duration = int_or_none(video_data.get('durationSeconds')) or parse_duration(
  60             video_data.get('duration'))
  61
  62         return {
  63             'id': video_id,
  64             'display_id': display_id,
  65             'title': title,
  66             'description': video_data.get('description'),
  67             'timestamp': timestamp,
  68             'duration': duration,
  69             'thumbnails': thumbnails,
  70             'formats': formats,
  71             'season_number': int_or_none(video_data.get('season')),
  72             'episode_number': int_or_none(video_data.get('episode')),
  73             'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))),
  74         }