fix embed regex
[youtube-dl] / youtube_dl / extractor / googledrive.py
1 import re
2
3 from .common import InfoExtractor
4 from ..utils import RegexNotFoundError
5
class GoogleDriveEmbedIE(InfoExtractor):
    """Resolve Google Drive embed/player URLs to the canonical file URL.

    Matches ``video.google.com/get_player?...docid=<id>`` and
    ``(docs|drive).google.com/file/d/<id>`` URLs, extracts the file id and
    returns a ``url``-type result pointing at
    ``https://drive.google.com/file/d/<id>``.
    """

    # ``{28,}`` instead of ``{28}``: the pattern is not anchored after the id,
    # so an exact count would silently cut a longer id down to its first 28
    # characters instead of capturing it whole.
    _VALID_URL = r'https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})'
    _TEST = {
        'url': 'https://docs.google.com/file/d/0B8KB9DRosYGKMXNoeWxqa3JYclE/preview',
        'info_dict': {
            'id': '0B8KB9DRosYGKMXNoeWxqa3JYclE',
            'ext': 'mp4',
            'title': 'Jimmy Fallon Sings Since You\'ve Been Gone.wmv',
        }
    }

    @staticmethod
    def _extract_url(webpage):
        """Find an embedded Google Drive player in *webpage*.

        Returns the canonical ``https://drive.google.com/file/d/<id>`` URL for
        the first matching ``<iframe>``, or ``None`` when there is none.
        """
        # ``[^>]+`` lets other attributes precede ``src``, and both quote
        # styles are accepted; the original only matched ``<iframe src="``.
        mobj = re.search(
            r'<iframe[^>]+src=["\']https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})',
            webpage)
        if mobj:
            return 'https://drive.google.com/file/d/%s' % mobj.group('id')
        return None

    def _real_extract(self, url):
        """Return a ``url``-type result delegating to the Drive extractor."""
        video_id = self._match_id(url)
        return {
            '_type': 'url',
            # The result key is ``ie_key`` (underscore); the previous
            # ``ie-key`` spelling is presumably ignored by the framework, so
            # the target extractor was never pinned -- confirm against
            # InfoExtractor's result-dict documentation.
            'ie_key': 'GoogleDrive',
            'url': 'https://drive.google.com/file/d/%s' % video_id,
        }
32
class GoogleDriveIE(InfoExtractor):
    """Extract the video streams of a file hosted on Google Drive.

    The file page is downloaded and the ``"key","value"`` pairs ``title``,
    ``fmt_stream_map``, ``fmt_list`` and ``length_seconds`` are read from it.
    """

    # ``{28,}`` instead of ``{28}``: the pattern is not anchored after the id,
    # so an exact count would silently truncate a longer id.
    _VALID_URL = r'https?://(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)(?P<id>[a-zA-Z0-9_-]{28,})'
    _TEST = {
        'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
        'info_dict': {
            'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
            'ext': 'mp4',
            'title': 'Big Buck Bunny.mp4',
        }
    }
    # File extension for each known stream format id.
    _formats = {
        '5': {'ext': 'flv'},
        '6': {'ext': 'flv'},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp'},
        '18': {'ext': 'mp4'},
        '22': {'ext': 'mp4'},
        '34': {'ext': 'flv'},
        '35': {'ext': 'flv'},
        '36': {'ext': '3gp'},
        '37': {'ext': 'mp4'},
        '38': {'ext': 'mp4'},
        '43': {'ext': 'webm'},
        '44': {'ext': 'webm'},
        '45': {'ext': 'webm'},
        '46': {'ext': 'webm'},
        '59': {'ext': 'mp4'}
    }

    def _search_field(self, name, webpage):
        """Return the value paired with *name* as ``"name","value"`` in *webpage*.

        Propagates ``RegexNotFoundError`` from ``_html_search_regex`` when the
        pair is absent.
        """
        return self._html_search_regex(
            r'"%s","(?P<%s>.*?)"' % (name, name), webpage, name, group=name)

    def _real_extract(self, url):
        """Build the info dict for the Drive file addressed by *url*.

        Returns a dict with ``id``, ``title``, ``duration`` and ``formats``.
        When the page lacks the stream metadata, a warning is reported (the
        page's own ``reason`` text if present, else ``'not a video'``) and
        ``None`` is returned.
        """
        video_id = self._match_id(url)
        # NOTE(review): 'unicode_escape' presumably undoes backslash escapes
        # in the embedded values -- confirm against a live page.
        webpage = self._download_webpage(
            'http://docs.google.com/file/d/' + video_id, video_id,
            encoding='unicode_escape')

        try:
            title = self._search_field('title', webpage)
            fmt_stream_map = self._search_field('fmt_stream_map', webpage)
            fmt_list = self._search_field('fmt_list', webpage)
            length_seconds = self._search_field('length_seconds', webpage)
        except RegexNotFoundError:
            # Best effort: surface whatever explanation the page offers.
            try:
                reason = self._search_field('reason', webpage)
            except RegexNotFoundError:
                reason = 'not a video'
            self.report_warning(reason)
            return None

        # Index fmt_list entries ("<format id>/<width>x<height>/...") by
        # format id so streams are paired by id rather than by position;
        # positional pairing breaks when the two lists differ in length or
        # order.
        resolutions = {}
        for fmt_entry in fmt_list.split(','):
            mobj = re.match(
                r'(?P<format_id>[^/]+)/(?P<resolution>(?P<width>\d+)x(?P<height>\d+))',
                fmt_entry)
            if mobj:
                resolutions[mobj.group('format_id')] = (
                    mobj.group('resolution'),
                    int(mobj.group('width')),
                    int(mobj.group('height')))

        formats = []
        for stream in fmt_stream_map.split(','):
            # Split on the first '|' only, so a URL containing '|' survives;
            # skip empty or malformed entries instead of failing to unpack.
            fmt_id, sep, fmt_url = stream.partition('|')
            if not sep or not fmt_url:
                continue
            fmt = {
                'url': fmt_url,
                'format_id': fmt_id,
            }
            size = resolutions.get(fmt_id)
            if size is not None:
                fmt['resolution'], fmt['width'], fmt['height'] = size
            # Unknown format ids no longer raise KeyError; 'ext' is simply
            # left unset for them.
            ext = self._formats.get(fmt_id, {}).get('ext')
            if ext:
                fmt['ext'] = ext
            formats.append(fmt)
        self._sort_formats(formats)

        # NOTE: a "timestamp" field lookup existed here only as disabled
        # code; it is not extracted.
        return {
            'id': video_id,
            'title': title,
            # A non-numeric length yields no duration instead of ValueError.
            'duration': int(length_seconds) if length_seconds.isdigit() else None,
            'formats': formats
        }