X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fgoogledrive.py;h=766fc26d0f01145bdd2456a221940fa60ece6953;hb=e2628fb6a028bd48f39ee556ca5ecb07aceba7f5;hp=e3d5c341892b25b97d42d520db5a154977cc9868;hpb=f120a7ab5e9c560a8114f9662e2f213243a945b0;p=youtube-dl diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py index e3d5c3418..766fc26d0 100644 --- a/youtube_dl/extractor/googledrive.py +++ b/youtube_dl/extractor/googledrive.py @@ -1,106 +1,92 @@ +from __future__ import unicode_literals + +import re + from .common import InfoExtractor -from ..utils import RegexNotFoundError +from ..utils import ( + ExtractorError, + int_or_none, +) + class GoogleDriveIE(InfoExtractor): - _VALID_URL = r'(?:https?://)?(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/))(?P.+?)(?:&|/|$)' - _TEST = { + _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P[a-zA-Z0-9_-]{28,})' + _TESTS = [{ 'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1', + 'md5': '881f7700aec4f538571fa1e0eed4a7b6', 'info_dict': { 'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ', 'ext': 'mp4', 'title': 'Big Buck Bunny.mp4', + 'duration': 46, } - } - _formats = { - '5': {'ext': 'flv'}, - '6': {'ext': 'flv'}, - '13': {'ext': '3gp'}, - '17': {'ext': '3gp'}, - '18': {'ext': 'mp4'}, - '22': {'ext': 'mp4'}, - '34': {'ext': 'flv'}, - '35': {'ext': 'flv'}, - '36': {'ext': '3gp'}, - '37': {'ext': 'mp4'}, - '38': {'ext': 'mp4'}, - '43': {'ext': 'webm'}, - '44': {'ext': 'webm'}, - '45': {'ext': 'webm'}, - '46': {'ext': 'webm'}, - '59': {'ext': 'mp4'} + }, { + # video id is longer than 28 characters + 'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', + 'only_matching': True, + }] + _FORMATS_EXT = { + '5': 'flv', + '6': 'flv', + '13': '3gp', + '17': '3gp', + '18': 'mp4', + '22': 'mp4', + '34': 'flv', + '35': 'flv', + '36': '3gp', + '37': 'mp4', + '38': 'mp4', + '43': 'webm', + '44': 'webm', + '45': 'webm', + '46': 'webm', + '59': 'mp4', } + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r']+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P[a-zA-Z0-9_-]{28,})', + webpage) + if mobj: + return 'https://drive.google.com/file/d/%s' % mobj.group('id') + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( - 'http://docs.google.com/file/d/'+video_id, video_id, encoding='unicode_escape' - ) - try: - title = self._html_search_regex( - r'"title","(?P.*?)"', - webpage, - 'title', - group='title' - ) - fmt_stream_map = self._html_search_regex( - r'"fmt_stream_map","(?P<fmt_stream_map>.*?)"', - webpage, - 'fmt_stream_map', - group='fmt_stream_map' - ) - fmt_list = self._html_search_regex( - r'"fmt_list","(?P<fmt_list>.*?)"', - webpage, - 'fmt_list', - group='fmt_list' - ) -# timestamp = self._html_search_regex( -# r'"timestamp","(?P<timestamp>.*?)"', -# webpage, -# 'timestamp', -# group='timestamp' -# ) - length_seconds = self._html_search_regex( - r'"length_seconds","(?P<length_seconds>.*?)"', - webpage, - 'length_seconds', - group='length_seconds' - ) - except RegexNotFoundError: - try: - reason = self._html_search_regex( - r'"reason","(?P<reason>.*?)"', - webpage, - 'reason', - group='reason' - ) - self.report_warning(reason) - return - except RegexNotFoundError: - self.report_warning('not a video') - return + 'http://docs.google.com/file/d/%s' % video_id, video_id, encoding='unicode_escape') + + reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) + if reason: + raise ExtractorError(reason) + + title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title') + duration = int_or_none(self._search_regex( + r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None)) + fmt_stream_map = self._search_regex( + r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',') + fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',') - fmt_stream_map = fmt_stream_map.split(',') - fmt_list = fmt_list.split(',') formats = [] - for i in range(len(fmt_stream_map)): - fmt_id, fmt_url = fmt_stream_map[i].split('|') - resolution = fmt_list[i].split('/')[1] + for fmt, fmt_stream in zip(fmt_list, fmt_stream_map): + fmt_id, fmt_url = fmt_stream.split('|') + resolution = fmt.split('/')[1] width, height = resolution.split('x') formats.append({ 'url': fmt_url, 'format_id': fmt_id, 'resolution': resolution, - 'width': int(width), - 'height': int(height), - 'ext': self._formats[fmt_id]['ext'] + 'width': int_or_none(width), + 'height': int_or_none(height), + 'ext': self._FORMATS_EXT[fmt_id], }) self._sort_formats(formats) return { 'id': video_id, 'title': title, -# 'timestamp': int(timestamp), - 'duration': int(length_seconds), - 'formats': formats + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'duration': duration, + 'formats': formats, }