# Git - youtube-dl/blob - youtube_dl/extractor/vidbit.py

   1 from __future__ import unicode_literals
   2
   3 from .common import InfoExtractor
   4 from ..compat import compat_urlparse
   5 from ..utils import (
   6     int_or_none,
   7     js_to_json,
   8     remove_end,
   9     unified_strdate,
  10 )
  11
  12
  13 class VidbitIE(InfoExtractor):
  14     _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)'
  15     _TESTS = [{
  16         'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2',
  17         'md5': '1a34b7f14defe3b8fafca9796892924d',
  18         'info_dict': {
  19             'id': 'jkL2yDOEq2',
  20             'ext': 'mp4',
  21             'title': 'Intro to VidBit',
  22             'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7',
  23             'thumbnail': r're:https?://.*\.jpg$',
  24             'upload_date': '20160618',
  25             'view_count': int,
  26             'comment_count': int,
  27         }
  28     }, {
  29         'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0',
  30         'only_matching': True,
  31     }]
  32
  33     def _real_extract(self, url):
  34         video_id = self._match_id(url)
  35
  36         webpage = self._download_webpage(
  37             compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id)
  38
  39         video_url, title = [None] * 2
  40
  41         config = self._parse_json(self._search_regex(
  42             r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'),
  43             video_id, transform_source=js_to_json)
  44         if config:
  45             if config.get('file'):
  46                 video_url = compat_urlparse.urljoin(url, config['file'])
  47             title = config.get('title')
  48
  49         if not video_url:
  50             video_url = compat_urlparse.urljoin(url, self._search_regex(
  51                 r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
  52                 webpage, 'video URL', group='url'))
  53
  54         if not title:
  55             title = remove_end(
  56                 self._html_search_regex(
  57                     (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'),
  58                     webpage, 'title', default=None) or self._og_search_title(webpage),
  59                 ' - VidBit')
  60
  61         description = self._html_search_meta(
  62             ('description', 'og:description', 'twitter:description'),
  63             webpage, 'description')
  64
  65         upload_date = unified_strdate(self._html_search_meta(
  66             'datePublished', webpage, 'upload date'))
  67
  68         view_count = int_or_none(self._search_regex(
  69             r'<strong>(\d+)</strong> views',
  70             webpage, 'view count', fatal=False))
  71         comment_count = int_or_none(self._search_regex(
  72             r'id=["\']cmt_num["\'][^>]*>\((\d+)\)',
  73             webpage, 'comment count', fatal=False))
  74
  75         return {
  76             'id': video_id,
  77             'url': video_url,
  78             'title': title,
  79             'description': description,
  80             'thumbnail': self._og_search_thumbnail(webpage),
  81             'upload_date': upload_date,
  82             'view_count': view_count,
  83             'comment_count': comment_count,
  84         }