_ Git - youtube-dl/blob - youtube_dl/extractor/googledrive.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import (
   5     RegexNotFoundError,
   6     ExtractorError,
   7 )
   8
   9 class GoogleDriveEmbedIE(InfoExtractor):
  10     _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  11     _TEST = {
  12         'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview',
  13         'info_dict': {
  14             'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE',
  15             'ext': 'mp4',
  16             'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv',
  17         }
  18     }
  19
  20     @staticmethod
  21     def _extract_url(webpage):
  22         mobj = re.search(
  23             r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
  24             webpage)
  25         if mobj:
  26             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
  27
  28     def _real_extract(self, url):
  29         video_id = self._match_id(url)
  30         return {
  31             '_type': 'url',
  32             'ie_key': 'GoogleDrive',
  33             'url': 'https://drive.google.com/file/d/%s' % video_id
  34         }
  35
  36 class GoogleDriveIE(InfoExtractor):
  37     _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  38     _TEST = {
  39         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
  40         'info_dict': {
  41             'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  42             'ext': 'mp4',
  43             'title': 'Big Buck Bunny.mp4',
  44         }
  45     }
  46     _formats = {
  47         '5': {'ext': 'flv'},
  48         '6': {'ext': 'flv'},
  49         '13': {'ext': '3gp'},
  50         '17': {'ext': '3gp'},
  51         '18': {'ext': 'mp4'},
  52         '22': {'ext': 'mp4'},
  53         '34': {'ext': 'flv'},
  54         '35': {'ext': 'flv'},
  55         '36': {'ext': '3gp'},
  56         '37': {'ext': 'mp4'},
  57         '38': {'ext': 'mp4'},
  58         '43': {'ext': 'webm'},
  59         '44': {'ext': 'webm'},
  60         '45': {'ext': 'webm'},
  61         '46': {'ext': 'webm'},
  62         '59': {'ext': 'mp4'}
  63     }
  64
  65     def _real_extract(self, url):
  66         video_id = self._match_id(url)
  67         webpage = self._download_webpage(
  68             'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape'
  69         )
  70         try:
  71             title = self._html_search_regex(
  72                 r'"title"\s*,\s*"([^"]+)',
  73                 webpage,
  74                 'title'
  75             )
  76             fmt_stream_map = self._html_search_regex(
  77                 r'"fmt_stream_map"\s*,\s*"([^"]+)',
  78                 webpage,
  79                 'fmt_stream_map'
  80             )
  81             fmt_list = self._html_search_regex(
  82                 r'"fmt_list"\s*,\s*"([^"]+)',
  83                 webpage,
  84                 'fmt_list'
  85             )
  86 #                       timestamp = self._html_search_regex(
  87 #                               r'"timestamp"\s*,\s*"([^"]+)',
  88 #                               webpage,
  89 #                               'timestamp'
  90 #                       )
  91             length_seconds = self._html_search_regex(
  92                 r'"length_seconds"\s*,\s*"([^"]+)',
  93                 webpage,
  94                 'length_seconds'
  95             )
  96         except RegexNotFoundError:
  97             try:
  98                 reason = self._html_search_regex(
  99                     r'"reason","([^"]+)',
 100                     webpage,
 101                     'reason'
 102                 )
 103                 raise ExtractorError(reason)
 104                 return
 105             except RegexNotFoundError:
 106                 raise ExtractorError('not a video')
 107                 return
 108
 109         fmt_stream_map = fmt_stream_map.split(',')
 110         fmt_list = fmt_list.split(',')
 111         formats = []
 112         for i in range(len(fmt_stream_map)):
 113             fmt_id, fmt_url = fmt_stream_map[i].split('|')
 114             resolution = fmt_list[i].split('/')[1]
 115             width, height = resolution.split('x')
 116             formats.append({
 117                 'url': fmt_url,
 118                 'format_id': fmt_id,
 119                 'resolution': resolution,
 120                 'width': int(width),
 121                 'height': int(height),
 122                 'ext': self._formats[fmt_id]['ext']
 123             })
 124         self._sort_formats(formats)
 125
 126         return {
 127             'id': video_id,
 128             'title': title,
 129 #           'timestamp': int(timestamp),
 130             'duration': int(length_seconds),
 131             'formats': formats
 132         }