[googledrive] Use redirect URLs for source format (closes #18877, closes #23919,...

[youtube-dl] / youtube_dl / extractor / googledrive.py
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py

index 37d37390197b0393c1415bd86e9e30e2b93d7c5c..f2cc57e447660f2d047be5bc306aaa4397bbf6af 100644 (file)
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -13,7 +13,18 @@ from ..utils import (
  
  
  class GoogleDriveIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+    _VALID_URL = r'''(?x)
+                        https?://
+                            (?:
+                                (?:docs|drive)\.google\.com/
+                                (?:
+                                    (?:uc|open)\?.*?id=|
+                                    file/d/
+                                )|
+                                video\.google\.com/get_player\?.*?docid=
+                            )
+                            (?P<id>[a-zA-Z0-9_-]{28,})
+                    '''
      _TESTS = [{
          'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
          'md5': '5c602afbbf2c1db91831f5d82f678554',
@@ -25,7 +36,7 @@ class GoogleDriveIE(InfoExtractor):
          }
      }, {
          # video can't be watched anonymously due to view count limit reached,
-        # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046)
+        # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
          'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
          'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
          'info_dict': {
@@ -42,7 +53,13 @@ class GoogleDriveIE(InfoExtractor):
              'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
              'duration': 189,
          },
-        'only_matching': True
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
+    }, {
+        'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+        'only_matching': True,
      }]
      _FORMATS_EXT = {
          '5': 'flv',
@@ -203,19 +220,27 @@ class GoogleDriveIE(InfoExtractor):
                  'id': video_id,
                  'export': 'download',
              })
-        urlh = self._request_webpage(
-            source_url, video_id, note='Requesting source file',
-            errnote='Unable to request source file', fatal=False)
+
+        def request_source_file(source_url, kind):
+            return self._request_webpage(
+                source_url, video_id, note='Requesting %s file' % kind,
+                errnote='Unable to request %s file' % kind, fatal=False)
+        urlh = request_source_file(source_url, 'source')
          if urlh:
-            def add_source_format(src_url):
+            def add_source_format(urlh):
                  formats.append({
-                    'url': src_url,
+                    # Use the redirect URLs as download URLs so that the correct
+                    # cookies are calculated in _calc_cookies.
+                    # Using the original URLs may result in a redirect loop due to
+                    # google.com's cookies being mistakenly used for the
+                    # googleusercontent.com redirect URLs (see #23919).
+                    'url': urlh.geturl(),
                      'ext': determine_ext(title, 'mp4').lower(),
                      'format_id': 'source',
                      'quality': 1,
                  })
              if urlh.headers.get('Content-Disposition'):
-                add_source_format(source_url)
+                add_source_format(urlh)
              else:
                  confirmation_webpage = self._webpage_read_content(
                      urlh, url, video_id, note='Downloading confirmation page',
@@ -225,9 +250,12 @@ class GoogleDriveIE(InfoExtractor):
                          r'confirm=([^&"\']+)', confirmation_webpage,
                          'confirmation code', fatal=False)
                      if confirm:
-                        add_source_format(update_url_query(source_url, {
+                        confirmed_source_url = update_url_query(source_url, {
                              'confirm': confirm,
-                        }))
+                        })
+                        urlh = request_source_file(confirmed_source_url, 'confirmed source')
+                        if urlh and urlh.headers.get('Content-Disposition'):
+                            add_source_format(urlh)
  
          if not formats:
              reason = self._search_regex(