1 from __future__ import unicode_literals
5 from .common import InfoExtractor
14 class GoogleDriveIE(InfoExtractor):
# Accepted URL shapes: docs/drive.google.com `uc?...id=<id>` and `file/d/<id>`,
# plus video.google.com `get_player?...docid=<id>`. The id group requires at
# least 28 characters drawn from [a-zA-Z0-9_-].
15 _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
17 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
18 'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
20 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
22 'title': 'Big Buck Bunny.mp4',
26 # video id is longer than 28 characters
27 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
28 'md5': 'c230c67252874fddd8170e3fd1a45886',
30 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
32 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
# Endpoint used both to download the caption listing XML and as the base of
# every individual caption URL built by this extractor.
55 _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
# Maps a caption type name to the XML tag that is scanned for that type.
# NOTE(review): only the 'automatic_captions' entry is visible in this excerpt.
56 _CAPTIONS_ENTRY_TAG = {
58 'automatic_captions': 'target',
# Caption format codes collected from the <format> elements of the captions XML.
60 _caption_formats_ext = []
# Finds an embedded Google Drive / Google Video player in a page and returns
# the canonical `https://drive.google.com/file/d/<id>` URL for it.
# NOTE(review): the call that applies the pattern to `webpage` and binds `mobj`,
# and the handling of a failed match, are not visible in this excerpt - confirm.
64 def _extract_url(webpage):
# Matches an <iframe> whose src is a video.google.com get_player URL (docid=)
# or a docs/drive.google.com file/d/ URL; the id needs at least 28 characters.
66 r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
69 return 'https://drive.google.com/file/d/%s' % mobj.group('id')
# Downloads the caption listing XML from _BASE_URL_CAPTIONS, stores it on
# self._captions_xml and records the caption format codes it advertises.
71 def _download_subtitles_xml(self, video_id, subtitles_id, hl):
# A truthy self._captions_xml means a listing is already stored.
# NOTE(review): the body of this check is not visible - presumably an early
# return so the XML is fetched only once; confirm. If so, verify the stored XML
# cannot be reused for a different video_id.
72 if self._captions_xml:
74 self._captions_xml = self._download_xml(
# NOTE(review): the query parameters are not visible in this excerpt.
75 self._BASE_URL_CAPTIONS, video_id, query={
# fatal=False - presumably a failed download produces a falsy result rather
# than an exception, which the truthiness check below relies on; confirm
# against _download_xml.
84 }, note='Downloading subtitles XML',
85 errnote='Unable to download subtitles XML', fatal=False)
86 if self._captions_xml:
87 for f in self._captions_xml.findall('format'):
# Keep only <format> entries that have a fmt_code and no truthy `default`
# attribute.
88 if f.attrib.get('fmt_code') and not f.attrib.get('default'):
# NOTE(review): _caption_formats_ext is initialised as a class-level list;
# unless it is rebound per instance elsewhere, append() mutates state shared
# by every instance and accumulates across calls - verify.
89 self._caption_formats_ext.append(f.attrib['fmt_code'])
# Builds, for every language entry of the given caption type found in
# self._captions_xml, a list of {'url', 'ext'} dicts - one per collected
# caption format - keyed by the entry's lang_code.
# NOTE(review): the initialisation of `captions`, most of the `query` dict and
# the return statement are not visible in this excerpt.
91 def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
92 origin_lang_code=None):
# Guard against a missing subtitles id or caption type (body not visible).
93 if not subtitles_id or not caption_type:
# caption_type selects the XML tag to scan through _CAPTIONS_ENTRY_TAG.
96 for caption_entry in self._captions_xml.findall(
97 self._CAPTIONS_ENTRY_TAG[caption_type]):
98 caption_lang_code = caption_entry.attrib.get('lang_code')
# Entries without a lang_code get special handling (body not visible -
# presumably skipped; confirm).
99 if not caption_lang_code:
101 caption_format_data = []
# One caption URL is produced per format code in _caption_formats_ext.
102 for caption_format in self._caption_formats_ext:
106 'fmt': caption_format,
# When origin_lang_code is given, 'lang' carries the origin language and the
# entry's own language is added as 'tlang' below; otherwise 'lang' is the
# entry's own language.
107 'lang': (caption_lang_code if origin_lang_code is None
108 else origin_lang_code),
113 if origin_lang_code is not None:
114 query.update({'tlang': caption_lang_code})
115 caption_format_data.append({
116 'url': update_url_query(self._BASE_URL_CAPTIONS, query),
# The format code doubles as the file extension of the caption entry.
117 'ext': caption_format,
119 captions[caption_lang_code] = caption_format_data
# Returns the caption entries of type 'subtitles' after making sure the
# caption listing XML has been requested.
# NOTE(review): the bodies of both guards below are not visible in this
# excerpt, so what is returned when they trigger cannot be confirmed here.
122 def _get_subtitles(self, video_id, subtitles_id, hl):
# Both the subtitles id and the `hl` value are required.
123 if not subtitles_id or not hl:
125 self._download_subtitles_xml(video_id, subtitles_id, hl)
# self._captions_xml is still falsy when no listing could be stored.
126 if not self._captions_xml:
128 return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
# Returns the caption entries of type 'automatic_captions', using the
# lang_code of a <track> element of the listing XML as the origin language.
# NOTE(review): the bodies of the guards below are not visible in this
# excerpt, so what is returned when they trigger cannot be confirmed here.
130 def _get_automatic_captions(self, video_id, subtitles_id, hl):
# Both the subtitles id and the `hl` value are required.
131 if not subtitles_id or not hl:
133 self._download_subtitles_xml(video_id, subtitles_id, hl)
# self._captions_xml is still falsy when no listing could be stored.
134 if not self._captions_xml:
# find('track') - presumably the first matching <track> child (ElementTree
# semantics); a check for a missing track is not visible here - confirm.
136 track = self._captions_xml.find('track')
139 origin_lang_code = track.attrib.get('lang_code')
# A track without a lang_code gets special handling (body not visible).
140 if not origin_lang_code:
# Passing origin_lang_code makes the helper request each entry's language
# as 'tlang' while keeping the origin language as 'lang'.
142 return self._get_captions_by_type(
143 video_id, subtitles_id, 'automatic_captions', origin_lang_code)
145 def _real_extract(self, url):
146 video_id = self._match_id(url)
147 webpage = self._download_webpage(
148 'http://docs.google.com/file/d/%s' % video_id, video_id)
150 reason = self._search_regex(
151 r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
153 raise ExtractorError(reason)
155 title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title')
156 duration = int_or_none(self._search_regex(
157 r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
159 fmt_stream_map = self._search_regex(
160 r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
161 'fmt stream map').split(',')
162 fmt_list = self._search_regex(
163 r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',')
168 r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
170 resolutions[mobj.group('format_id')] = (
171 int(mobj.group('width')), int(mobj.group('height')))
174 for fmt_stream in fmt_stream_map:
175 fmt_stream_split = fmt_stream.split('|')
176 if len(fmt_stream_split) < 2:
178 format_id, format_url = fmt_stream_split[:2]
180 'url': lowercase_escape(format_url),
181 'format_id': format_id,
182 'ext': self._FORMATS_EXT[format_id],
184 resolution = resolutions.get(format_id)
187 'width': resolution[0],
188 'height': resolution[1],
191 self._sort_formats(formats)
193 hl = self._search_regex(
194 r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
196 ttsurl = self._search_regex(
197 r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
199 # the video Id for subtitles will be the last value in the ttsurl
201 subtitles_id = ttsurl.encode('utf-8').decode(
202 'unicode_escape').split('=')[-1]
207 'thumbnail': self._og_search_thumbnail(webpage, default=None),
208 'duration': duration,
210 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
211 'automatic_captions': self.extract_automatic_captions(
212 video_id, subtitles_id, hl),