[googledrive] Fix extraction on Python 3.6
author Yen Chi Hsuan <yan12125@gmail.com>
Sun, 5 Feb 2017 13:41:08 +0000 (21:41 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Sun, 5 Feb 2017 13:41:08 +0000 (21:41 +0800)
Since Python 3.6, invalid escape sequences are deprecated. It's likely
that there are invalid escape sequences somewhere on the webpage, so
instead of unescaping the whole webpage, just unescape the URL.

See https://bugs.python.org/issue27364. That change was designed for
string literals, but it also affects the 'unicode_escape' encoding.
The code path is:

str.decode('unicode_escape')
    codecs.unicode_escape_decode()
        PyUnicode_DecodeUnicodeEscape()

ChangeLog
youtube_dl/extractor/googledrive.py

index 23a72955914ec8be2066d18d136f0fb6b7a953c1..a0025ab91d3b4a7b674fd011bfd095d7c9648016 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+version <unreleased>
+
+Extractors
+* [googledrive] Fix extraction on Python 3.6
+
+
 version 2017.02.04.1
 
 Extractors
index 766fc26d0f01145bdd2456a221940fa60ece6953..fec36cbbb7f43d2b8b37370aec270f543e8f257d 100644 (file)
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
     int_or_none,
+    lowercase_escape,
 )
 
 
@@ -13,12 +14,12 @@ class GoogleDriveIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
     _TESTS = [{
         'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
-        'md5': '881f7700aec4f538571fa1e0eed4a7b6',
+        'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
         'info_dict': {
             'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
             'ext': 'mp4',
             'title': 'Big Buck Bunny.mp4',
-            'duration': 46,
+            'duration': 45,
         }
     }, {
         # video id is longer than 28 characters
@@ -55,7 +56,7 @@ class GoogleDriveIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(
-            'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape')
+            'http://docs.google.com/file/d/%s' % video_id, video_id)
 
         reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
         if reason:
@@ -74,7 +75,7 @@ class GoogleDriveIE(InfoExtractor):
             resolution = fmt.split('/')[1]
             width, height = resolution.split('x')
             formats.append({
-                'url': fmt_url,
+                'url': lowercase_escape(fmt_url),
                 'format_id': fmt_id,
                 'resolution': resolution,
                 'width': int_or_none(width),