[googledrive] Add support for source format (closes #14046)
[youtube-dl] / youtube_dl / extractor / googledrive.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     determine_ext,
8     ExtractorError,
9     int_or_none,
10     lowercase_escape,
11     update_url_query,
12 )
13
14
class GoogleDriveIE(InfoExtractor):
    # Accepts docs/drive.google.com "uc?...id=" and "file/d/" links as well as
    # video.google.com "get_player?...docid=" links. The captured id must be at
    # least 28 characters long and consist of [a-zA-Z0-9_-] only.
    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
        'info_dict': {
            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
            'ext': 'mp4',
            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
        }
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'info_dict': {
            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
            'ext': 'mp4',
            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
            'duration': 189,
        },
        'only_matching': True
    }]
    # File extension for each numeric format id that can appear in the
    # "fmt_stream_map" value of the file page (looked up in _real_extract).
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    # Endpoint queried for the captions listing (type=list) and used as the
    # base of every caption track URL (type=track).
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML element name that holds the entries of each caption type inside the
    # captions listing.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # fmt_code values of the non-default <format> elements of the captions
    # listing; filled in by _download_subtitles_xml.
    _caption_formats_ext = []
    # Parsed captions listing set by _download_subtitles_xml; None until a
    # download has been attempted.
    _captions_xml = None
72
73     @staticmethod
74     def _extract_url(webpage):
75         mobj = re.search(
76             r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
77             webpage)
78         if mobj:
79             return 'https://drive.google.com/file/d/%s' % mobj.group('id')
80
81     def _download_subtitles_xml(self, video_id, subtitles_id, hl):
82         if self._captions_xml:
83             return
84         self._captions_xml = self._download_xml(
85             self._BASE_URL_CAPTIONS, video_id, query={
86                 'id': video_id,
87                 'vid': subtitles_id,
88                 'hl': hl,
89                 'v': video_id,
90                 'type': 'list',
91                 'tlangs': '1',
92                 'fmts': '1',
93                 'vssids': '1',
94             }, note='Downloading subtitles XML',
95             errnote='Unable to download subtitles XML', fatal=False)
96         if self._captions_xml:
97             for f in self._captions_xml.findall('format'):
98                 if f.attrib.get('fmt_code') and not f.attrib.get('default'):
99                     self._caption_formats_ext.append(f.attrib['fmt_code'])
100
101     def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
102                               origin_lang_code=None):
103         if not subtitles_id or not caption_type:
104             return
105         captions = {}
106         for caption_entry in self._captions_xml.findall(
107                 self._CAPTIONS_ENTRY_TAG[caption_type]):
108             caption_lang_code = caption_entry.attrib.get('lang_code')
109             if not caption_lang_code:
110                 continue
111             caption_format_data = []
112             for caption_format in self._caption_formats_ext:
113                 query = {
114                     'vid': subtitles_id,
115                     'v': video_id,
116                     'fmt': caption_format,
117                     'lang': (caption_lang_code if origin_lang_code is None
118                              else origin_lang_code),
119                     'type': 'track',
120                     'name': '',
121                     'kind': '',
122                 }
123                 if origin_lang_code is not None:
124                     query.update({'tlang': caption_lang_code})
125                 caption_format_data.append({
126                     'url': update_url_query(self._BASE_URL_CAPTIONS, query),
127                     'ext': caption_format,
128                 })
129             captions[caption_lang_code] = caption_format_data
130         return captions
131
132     def _get_subtitles(self, video_id, subtitles_id, hl):
133         if not subtitles_id or not hl:
134             return
135         self._download_subtitles_xml(video_id, subtitles_id, hl)
136         if not self._captions_xml:
137             return
138         return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
139
140     def _get_automatic_captions(self, video_id, subtitles_id, hl):
141         if not subtitles_id or not hl:
142             return
143         self._download_subtitles_xml(video_id, subtitles_id, hl)
144         if not self._captions_xml:
145             return
146         track = self._captions_xml.find('track')
147         if track is None:
148             return
149         origin_lang_code = track.attrib.get('lang_code')
150         if not origin_lang_code:
151             return
152         return self._get_captions_by_type(
153             video_id, subtitles_id, 'automatic_captions', origin_lang_code)
154
155     def _real_extract(self, url):
156         video_id = self._match_id(url)
157         webpage = self._download_webpage(
158             'http://docs.google.com/file/d/%s' % video_id, video_id)
159
160         title = self._search_regex(
161             r'"title"\s*,\s*"([^"]+)', webpage, 'title',
162             default=None) or self._og_search_title(webpage)
163         duration = int_or_none(self._search_regex(
164             r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
165             default=None))
166
167         formats = []
168         fmt_stream_map = self._search_regex(
169             r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
170             'fmt stream map', default='').split(',')
171         fmt_list = self._search_regex(
172             r'"fmt_list"\s*,\s*"([^"]+)', webpage,
173             'fmt_list', default='').split(',')
174         if fmt_stream_map and fmt_list:
175             resolutions = {}
176             for fmt in fmt_list:
177                 mobj = re.search(
178                     r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
179                 if mobj:
180                     resolutions[mobj.group('format_id')] = (
181                         int(mobj.group('width')), int(mobj.group('height')))
182
183             for fmt_stream in fmt_stream_map:
184                 fmt_stream_split = fmt_stream.split('|')
185                 if len(fmt_stream_split) < 2:
186                     continue
187                 format_id, format_url = fmt_stream_split[:2]
188                 f = {
189                     'url': lowercase_escape(format_url),
190                     'format_id': format_id,
191                     'ext': self._FORMATS_EXT[format_id],
192                 }
193                 resolution = resolutions.get(format_id)
194                 if resolution:
195                     f.update({
196                         'width': resolution[0],
197                         'height': resolution[1],
198                     })
199                 formats.append(f)
200
201         source_url = update_url_query(
202             'https://drive.google.com/uc', {
203                 'id': video_id,
204                 'export': 'download',
205             })
206         urlh = self._request_webpage(
207             source_url, video_id, note='Requesting source file',
208             errnote='Unable to request source file', fatal=False)
209         if urlh:
210             def add_source_format(src_url):
211                 formats.append({
212                     'url': src_url,
213                     'ext': determine_ext(title, 'mp4').lower(),
214                     'format_id': 'source',
215                     'quality': 1,
216                 })
217             if urlh.headers.get('Content-Disposition'):
218                 add_source_format(source_url)
219             else:
220                 confirmation_webpage = self._webpage_read_content(
221                     urlh, url, video_id, note='Downloading confirmation page',
222                     errnote='Unable to confirm download', fatal=False)
223                 if confirmation_webpage:
224                     confirm = self._search_regex(
225                         r'confirm=([^&"\']+)', confirmation_webpage,
226                         'confirmation code', fatal=False)
227                     if confirm:
228                         add_source_format(update_url_query(source_url, {
229                             'confirm': confirm,
230                         }))
231
232         if not formats:
233             reason = self._search_regex(
234                 r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
235             if reason:
236                 raise ExtractorError(reason, expected=True)
237
238         self._sort_formats(formats)
239
240         hl = self._search_regex(
241             r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
242         subtitles_id = None
243         ttsurl = self._search_regex(
244             r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
245         if ttsurl:
246             # the video Id for subtitles will be the last value in the ttsurl
247             # query string
248             subtitles_id = ttsurl.encode('utf-8').decode(
249                 'unicode_escape').split('=')[-1]
250
251         return {
252             'id': video_id,
253             'title': title,
254             'thumbnail': self._og_search_thumbnail(webpage, default=None),
255             'duration': duration,
256             'formats': formats,
257             'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
258             'automatic_captions': self.extract_automatic_captions(
259                 video_id, subtitles_id, hl),
260         }