_ Git - youtube-dl/blob - youtube_dl/extractor/pinkbike.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..utils import (
   8     int_or_none,
   9     remove_end,
  10     remove_start
  11 )
  12
  13
  14 class PinkbikeIE(InfoExtractor):
  15     _VALID_URL = r'https?://(?:www\.)?pinkbike\.com/video/(?P<id>[0-9]+)'
  16     _TESTS = [{
  17         'url': 'http://www.pinkbike.com/video/402811/',
  18         'md5': '4814b8ca7651034cd87e3361d5c2155a',
  19         'info_dict': {
  20             'id': '402811',
  21             'ext': 'mp4',
  22             'title': 'Brandon Semenuk - RAW 100',
  23             'thumbnail': 're:^https?://.*\.jpg$',
  24             'location': 'Victoria, British Columbia, Canada',
  25             'uploader_id': 'revelco',
  26             'upload_date': '20150406',
  27             'description': 'Official release: www.redbull.ca/rupertwalker',
  28             'duration': 100
  29         }
  30     }, {
  31         'url': 'http://www.pinkbike.com/video/406629/',
  32         'md5': 'c7a3e19a2bd5cde5a1cda6b2b46caa74',
  33         'info_dict': {
  34             'id': '406629',
  35             'ext': 'mp4',
  36             'title': 'Chromag: Reece Wallace in Utah',
  37             'thumbnail': 're:^https?://.*\.jpg$',
  38             'location': 'Whistler, British Columbia, Canada',
  39             'uploader_id': 'Chromagbikes',
  40             'upload_date': '20150505',
  41             'description': 'Reece Wallace shredding Virgin, Utah. Video by Virtu Media.',
  42             'duration': 180
  43         }
  44     }]
  45
  46     def _real_extract(self, url):
  47         video_id = self._match_id(url)
  48         webpage = self._download_webpage(url, video_id)
  49
  50         title = self._html_search_regex(r'<title>(.*?)</title>', webpage, 'title')
  51         title = remove_end(title, ' Video - Pinkbike')
  52
  53         description = self._html_search_meta('description', webpage, 'description')
  54         description = remove_start(description, title + '. ')
  55
  56         duration = int_or_none(self._html_search_meta(
  57             'video:duration', webpage, 'duration'))
  58
  59         uploader_id = self._html_search_regex(r'un:\s*"(.*?)"', webpage, 'uploader_id')
  60
  61         upload_date = self._html_search_regex(
  62             r'class="fullTime"\s*title="([0-9]{4}(?:-[0-9]{2}){2})"',
  63             webpage, 'upload_date')
  64         upload_date = upload_date.replace('-', '')
  65
  66         location = self._html_search_regex(
  67             r'<dt>Location</dt>\n?\s*<dd>\n?(.*?)\s*<img',
  68             webpage, 'location')
  69
  70         formats = re.findall(
  71             r'<source data-quality=\\"([0-9]+)p\\" src=\\"(.*?)\\">',
  72             webpage)
  73
  74         formats = [{'url': fmt[1], 'height': int_or_none(fmt[0])} for fmt in formats]
  75
  76         return {
  77             'id': video_id,
  78             'title': title,
  79             'description': description,
  80             'duration': duration,
  81             'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'),
  82             'uploader_id': uploader_id,
  83             'upload_date': upload_date,
  84             'location': location,
  85             'formats': formats
  86         }