[googledrive] Use redirect URLs for source format (closes #18877, closes #23919,...
[youtube-dl] / youtube_dl / extractor / googledrive.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     determine_ext,
8     ExtractorError,
9     int_or_none,
10     lowercase_escape,
11     update_url_query,
12 )
13
14
class GoogleDriveIE(InfoExtractor):
    """Information extractor for videos hosted on Google Drive / Google Docs.

    Formats come from two places: the transcoded streams listed in the
    "fmt_stream_map"/"fmt_list" values embedded in the file page, and the
    original ("source") file obtained through the ``uc?export=download``
    endpoint.  Subtitles and automatic captions are listed through the
    ``timedtext`` endpoint.
    """

    _VALID_URL = r'''(?x)
                        https?://
                            (?:
                                (?:docs|drive)\.google\.com/
                                (?:
                                    (?:uc|open)\?.*?id=|
                                    file/d/
                                )|
                                video\.google\.com/get_player\?.*?docid=
                            )
                            (?P<id>[a-zA-Z0-9_-]{28,})
                    '''
    _TESTS = [{
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'md5': '5c602afbbf2c1db91831f5d82f678554',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
            'duration': 45,
        }
    }, {
        # video can't be watched anonymously due to view count limit reached,
        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
        'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
        'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
        'info_dict': {
            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
            'ext': 'mp4',
            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
        }
    }, {
        # video id is longer than 28 characters
        'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
        'info_dict': {
            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
            'ext': 'mp4',
            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
            'duration': 189,
        },
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }, {
        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
        'only_matching': True,
    }]
    # Maps the numeric format id found in "fmt_stream_map" to a file extension.
    _FORMATS_EXT = {
        '5': 'flv',
        '6': 'flv',
        '13': '3gp',
        '17': '3gp',
        '18': 'mp4',
        '22': 'mp4',
        '34': 'flv',
        '35': 'flv',
        '36': '3gp',
        '37': 'mp4',
        '38': 'mp4',
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
        '59': 'mp4',
    }
    _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
    # XML tag that holds the entries for each caption type.
    _CAPTIONS_ENTRY_TAG = {
        'subtitles': 'track',
        'automatic_captions': 'target',
    }
    # Class-level defaults only.  _download_subtitles_xml always rebinds
    # these on the instance and never mutates the shared list in place.
    _caption_formats_ext = []
    _captions_xml = None
    # (video_id, subtitles_id, hl) the cached _captions_xml was fetched for.
    _captions_xml_key = None

    @staticmethod
    def _extract_url(webpage):
        """Return a Drive file URL for the first matching iframe in webpage.

        Looks for an <iframe> whose src points at a Google Drive/Docs file or
        at the video.google.com player.  Returns None when nothing matches.
        """
        mobj = re.search(
            r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')

    def _download_subtitles_xml(self, video_id, subtitles_id, hl):
        """Fetch the caption listing XML and cache it on the instance.

        The result is stored in self._captions_xml (None when unavailable)
        and the non-default caption format codes in
        self._caption_formats_ext.  The cache is keyed by the request
        parameters, so a later call for a different video never sees the
        listing of an earlier one; a failed download is retried on the next
        call.
        """
        request_key = (video_id, subtitles_id, hl)
        if (self._captions_xml is not None
                and self._captions_xml_key == request_key):
            return

        # Reset per-request state on the instance before downloading so that
        # nothing from a previous video (or another instance) leaks through.
        self._captions_xml = None
        self._captions_xml_key = request_key
        self._caption_formats_ext = []

        captions_xml = self._download_xml(
            self._BASE_URL_CAPTIONS, video_id, query={
                'id': video_id,
                'vid': subtitles_id,
                'hl': hl,
                'v': video_id,
                'type': 'list',
                'tlangs': '1',
                'fmts': '1',
                'vssids': '1',
            }, note='Downloading subtitles XML',
            errnote='Unable to download subtitles XML', fatal=False)
        # NOTE(review): with fatal=False the helper is expected to hand back
        # a non-element failure value (False/None) - confirm against
        # InfoExtractor._download_xml.  Compare by identity: an XML element
        # without children is falsy, so a plain truth test would discard a
        # valid but empty listing.
        if captions_xml is None or captions_xml is False:
            return

        self._captions_xml = captions_xml
        self._caption_formats_ext = [
            f.attrib['fmt_code'] for f in captions_xml.findall('format')
            if f.attrib.get('fmt_code') and not f.attrib.get('default')]

    def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
                              origin_lang_code=None):
        """Build a {lang_code: [{'url', 'ext'}, ...]} dict for one caption type.

        caption_type is a key of _CAPTIONS_ENTRY_TAG.  When origin_lang_code
        is given, each entry requests that language translated ('tlang') to
        the entry's language; otherwise the entry's own language is requested.
        Returns None when there is nothing to look up.
        """
        if not subtitles_id or not caption_type:
            return
        if self._captions_xml is None:
            return
        captions = {}
        for caption_entry in self._captions_xml.findall(
                self._CAPTIONS_ENTRY_TAG[caption_type]):
            caption_lang_code = caption_entry.attrib.get('lang_code')
            if not caption_lang_code:
                continue
            caption_format_data = []
            for caption_format in self._caption_formats_ext:
                query = {
                    'vid': subtitles_id,
                    'v': video_id,
                    'fmt': caption_format,
                    'lang': (caption_lang_code if origin_lang_code is None
                             else origin_lang_code),
                    'type': 'track',
                    'name': '',
                    'kind': '',
                }
                if origin_lang_code is not None:
                    query.update({'tlang': caption_lang_code})
                caption_format_data.append({
                    'url': update_url_query(self._BASE_URL_CAPTIONS, query),
                    'ext': caption_format,
                })
            captions[caption_lang_code] = caption_format_data
        return captions

    def _get_subtitles(self, video_id, subtitles_id, hl):
        """Return the subtitles dict for the video, or None if unavailable."""
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')

    def _get_automatic_captions(self, video_id, subtitles_id, hl):
        """Return the automatic captions dict, or None if unavailable.

        The language of the first <track> entry is used as the origin
        language from which the translated captions are requested.
        """
        if not subtitles_id or not hl:
            return
        self._download_subtitles_xml(video_id, subtitles_id, hl)
        if self._captions_xml is None:
            return
        track = self._captions_xml.find('track')
        if track is None:
            return
        origin_lang_code = track.attrib.get('lang_code')
        if not origin_lang_code:
            return
        return self._get_captions_by_type(
            video_id, subtitles_id, 'automatic_captions', origin_lang_code)

    def _extract_stream_formats(self, webpage):
        """Return format dicts parsed from the stream map embedded in webpage.

        "fmt_list" supplies the resolution per format id, "fmt_stream_map"
        supplies '<format id>|<url>' pairs.  Missing values yield an empty
        list.
        """
        resolutions = {}
        fmt_list = self._search_regex(
            r'"fmt_list"\s*,\s*"([^"]+)', webpage,
            'fmt_list', default='').split(',')
        for fmt in fmt_list:
            mobj = re.search(
                r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    int(mobj.group('width')), int(mobj.group('height')))

        formats = []
        fmt_stream_map = self._search_regex(
            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
            'fmt stream map', default='').split(',')
        for fmt_stream in fmt_stream_map:
            fmt_stream_split = fmt_stream.split('|')
            # Also skips the single empty string produced by ''.split(',')
            # when the page has no stream map at all.
            if len(fmt_stream_split) < 2:
                continue
            format_id, format_url = fmt_stream_split[:2]
            f = {
                'url': lowercase_escape(format_url),
                'format_id': format_id,
            }
            # An id missing from _FORMATS_EXT must not abort the whole
            # extraction with a KeyError; 'ext' is simply left out for it.
            ext = self._FORMATS_EXT.get(format_id)
            if ext:
                f['ext'] = ext
            resolution = resolutions.get(format_id)
            if resolution:
                f.update({
                    'width': resolution[0],
                    'height': resolution[1],
                })
            formats.append(f)
        return formats

    def _extract_source_format(self, url, video_id, title):
        """Return the format dict for the original uploaded file, or None.

        Requests the download endpoint; if the response carries no
        Content-Disposition header, the response body is searched for a
        'confirm' code and the request is repeated with it.  All requests
        are non-fatal.
        """
        source_url = update_url_query(
            'https://drive.google.com/uc', {
                'id': video_id,
                'export': 'download',
            })

        def request_source_file(source_url, kind):
            return self._request_webpage(
                source_url, video_id, note='Requesting %s file' % kind,
                errnote='Unable to request %s file' % kind, fatal=False)

        def build_source_format(urlh):
            return {
                # Use redirect URLs as download URLs in order to calculate
                # correct cookies in _calc_cookies.
                # Using original URLs may result in redirect loop due to
                # google.com's cookies mistakenly used for googleusercontent.com
                # redirect URLs (see #23919).
                'url': urlh.geturl(),
                'ext': determine_ext(title, 'mp4').lower(),
                'format_id': 'source',
                'quality': 1,
            }

        urlh = request_source_file(source_url, 'source')
        if not urlh:
            return None
        if urlh.headers.get('Content-Disposition'):
            return build_source_format(urlh)

        confirmation_webpage = self._webpage_read_content(
            urlh, url, video_id, note='Downloading confirmation page',
            errnote='Unable to confirm download', fatal=False)
        if not confirmation_webpage:
            return None
        confirm = self._search_regex(
            r'confirm=([^&"\']+)', confirmation_webpage,
            'confirmation code', fatal=False)
        if not confirm:
            return None
        confirmed_source_url = update_url_query(source_url, {
            'confirm': confirm,
        })
        urlh = request_source_file(confirmed_source_url, 'confirmed source')
        if urlh and urlh.headers.get('Content-Disposition'):
            return build_source_format(urlh)
        return None

    def _real_extract(self, url):
        """Extract title, duration, formats and captions for a Drive video.

        Raises ExtractorError with the reason given by the page when no
        format could be found and the page states one.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/%s' % video_id, video_id)

        title = self._search_regex(
            r'"title"\s*,\s*"([^"]+)', webpage, 'title',
            default=None) or self._og_search_title(webpage)
        duration = int_or_none(self._search_regex(
            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
            default=None))

        formats = self._extract_stream_formats(webpage)
        source_format = self._extract_source_format(url, video_id, title)
        if source_format:
            formats.append(source_format)

        if not formats:
            reason = self._search_regex(
                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
            if reason:
                raise ExtractorError(reason, expected=True)

        self._sort_formats(formats)

        hl = self._search_regex(
            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
        subtitles_id = None
        ttsurl = self._search_regex(
            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
        if ttsurl:
            # the video Id for subtitles will be the last value in the ttsurl
            # query string
            subtitles_id = ttsurl.encode('utf-8').decode(
                'unicode_escape').split('=')[-1]

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'formats': formats,
            'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
            'automatic_captions': self.extract_automatic_captions(
                video_id, subtitles_id, hl),
        }