[googledrive] Use redirect URLs for source format (closes #18877, closes #23919,...
[youtube-dl] / youtube_dl / extractor / googledrive.py
index 37d37390197b0393c1415bd86e9e30e2b93d7c5c..f2cc57e447660f2d047be5bc306aaa4397bbf6af 100644 (file)
@@ -13,7 +13,18 @@ from ..utils import (
 
 
 class GoogleDriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:docs|drive)\.google\.com/
+                                (?:
+                                    (?:uc|open)\?.*?id=|
+                                    file/d/
+                                )|
+                                video\.google\.com/get_player\?.*?docid=
+                            )
+                            (?P<id>[a-zA-Z0-9_-]{28,})
+                    '''
     _TESTS = [{
         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
         'md5': '5c602afbbf2c1db91831f5d82f678554',
@@ -25,7 +36,7 @@ class GoogleDriveIE(InfoExtractor):
         }
     }, {
         # video can't be watched anonymously due to view count limit reached,
-        # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046)
+        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
         'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
         'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
         'info_dict': {
@@ -42,7 +53,13 @@ class GoogleDriveIE(InfoExtractor):
             'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
             'duration': 189,
         },
-        'only_matching': True
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
     }]
     _FORMATS_EXT = {
         '5': 'flv',
@@ -203,19 +220,27 @@ class GoogleDriveIE(InfoExtractor):
                 'id': video_id,
                 'export': 'download',
             })
-        urlh = self._request_webpage(
-            source_url, video_id, note='Requesting source file',
-            errnote='Unable to request source file', fatal=False)
+
+        def request_source_file(source_url, kind):
+            return self._request_webpage(
+                source_url, video_id, note='Requesting %s file' % kind,
+                errnote='Unable to request %s file' % kind, fatal=False)
+        urlh = request_source_file(source_url, 'source')
         if urlh:
-            def add_source_format(src_url):
+            def add_source_format(urlh):
                 formats.append({
-                    'url': src_url,
+                    # Use redirect URLs as download URLs in order to calculate
+                    # correct cookies in _calc_cookies.
+                    # Using original URLs may result in a redirect loop due to
+                    # google.com's cookies being mistakenly used for
+                    # googleusercontent.com redirect URLs (see #23919).
+                    'url': urlh.geturl(),
                     'ext': determine_ext(title, 'mp4').lower(),
                     'format_id': 'source',
                     'quality': 1,
                 })
             if urlh.headers.get('Content-Disposition'):
-                add_source_format(source_url)
+                add_source_format(urlh)
             else:
                 confirmation_webpage = self._webpage_read_content(
                     urlh, url, video_id, note='Downloading confirmation page',
@@ -225,9 +250,12 @@ class GoogleDriveIE(InfoExtractor):
                         r'confirm=([^&"\']+)', confirmation_webpage,
                         'confirmation code', fatal=False)
                     if confirm:
-                        add_source_format(update_url_query(source_url, {
+                        confirmed_source_url = update_url_query(source_url, {
                             'confirm': confirm,
-                        }))
+                        })
+                        urlh = request_source_file(confirmed_source_url, 'confirmed source')
+                        if urlh and urlh.headers.get('Content-Disposition'):
+                            add_source_format(urlh)
 
         if not formats:
             reason = self._search_regex(