_ Git - youtube-dl/blob - youtube_dl/extractor/googledrive.py

   1 import re
   2
   3 from .common import InfoExtractor
   4 from ..utils import RegexNotFoundError
   5
   6 class GoogleDriveEmbedIE(InfoExtractor):
   7     _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
   8     _TEST = {
   9         'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview',
  10         'info_dict': {
  11             'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE',
  12             'ext': 'mp4',
  13             'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv',
  14         }
  15     }
  16
  17     @staticmethod
  18     def _extract_url(webpage):
  19         mobj = re.search(
  20             r'<iframe src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28})',
  21             webpage)
  22         if mobj:
  23             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
  24
  25     def _real_extract(self, url):
  26         video_id = self._match_id(url)
  27         return {
  28             '_type': 'url',
  29             'ie-key': 'GoogleDrive',
  30             'url': 'https://drive.google.com/file/d/%s' % video_id
  31         }
  32
  33 class GoogleDriveIE(InfoExtractor):
  34     _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28})'
  35     _TEST = {
  36         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
  37         'info_dict': {
  38             'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
  39             'ext': 'mp4',
  40             'title': 'Big Buck Bunny.mp4',
  41         }
  42     }
  43     _formats = {
  44         '5': {'ext': 'flv'},
  45         '6': {'ext': 'flv'},
  46         '13': {'ext': '3gp'},
  47         '17': {'ext': '3gp'},
  48         '18': {'ext': 'mp4'},
  49         '22': {'ext': 'mp4'},
  50         '34': {'ext': 'flv'},
  51         '35': {'ext': 'flv'},
  52         '36': {'ext': '3gp'},
  53         '37': {'ext': 'mp4'},
  54         '38': {'ext': 'mp4'},
  55         '43': {'ext': 'webm'},
  56         '44': {'ext': 'webm'},
  57         '45': {'ext': 'webm'},
  58         '46': {'ext': 'webm'},
  59         '59': {'ext': 'mp4'}
  60     }
  61
  62     def _real_extract(self, url):
  63         video_id = self._match_id(url)
  64         webpage = self._download_webpage(
  65             'http://docs.google.com/file/d/' + video_id, video_id, encoding='unicode_escape'
  66         )
  67         try:
  68             title = self._html_search_regex(
  69                 r'"title"\s+,\s+"[^"]+',
  70                 webpage,
  71                 'title'
  72             )
  73             fmt_stream_map = self._html_search_regex(
  74                 r'"fmt_stream_map"\s+,\s+"[^"]+',
  75                 webpage,
  76                 'fmt_stream_map'
  77             )
  78             fmt_list = self._html_search_regex(
  79                 r'"fmt_list"\s+,\s+"[^"]+',
  80                 webpage,
  81                 'fmt_list'
  82             )
  83 #                       timestamp = self._html_search_regex(
  84 #                               r'"timestamp"\s+,\s+"[^"]+',
  85 #                               webpage,
  86 #                               'timestamp'
  87 #                       )
  88             length_seconds = self._html_search_regex(
  89                 r'"length_seconds"\s+,\s+"[^"]+',
  90                 webpage,
  91                 'length_seconds'
  92             )
  93         except RegexNotFoundError:
  94             try:
  95                 reason = self._html_search_regex(
  96                     r'"reason","[^"]+',
  97                     webpage,
  98                     'reason'
  99                 )
 100                 self.report_warning(reason)
 101                 return
 102             except RegexNotFoundError:
 103                 self.report_warning('not a video')
 104                 return
 105
 106         fmt_stream_map = fmt_stream_map.split(',')
 107         fmt_list = fmt_list.split(',')
 108         formats = []
 109         for i in range(len(fmt_stream_map)):
 110             fmt_id, fmt_url = fmt_stream_map[i].split('|')
 111             resolution = fmt_list[i].split('/')[1]
 112             width, height = resolution.split('x')
 113             formats.append({
 114                 'url': fmt_url,
 115                 'format_id': fmt_id,
 116                 'resolution': resolution,
 117                 'width': int(width),
 118                 'height': int(height),
 119                 'ext': self._formats[fmt_id]['ext']
 120             })
 121         self._sort_formats(formats)
 122
 123         return {
 124             'id': video_id,
 125             'title': title,
 126 #           'timestamp': int(timestamp),
 127             'duration': int(length_seconds),
 128             'formats': formats
 129         }