[vgtv] Add new extractor
[youtube-dl] / youtube_dl / extractor / gdcvault.py
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    compat_urllib_parse,
    compat_urllib_request,
    ExtractorError,
)
10
class GDCVaultIE(InfoExtractor):
    """Information extractor for talks hosted on gdcvault.com.

    The video page embeds a player iframe whose ``src`` points at the player
    root directory and names an XML description file.  That XML file lists
    either progressive mp4 renditions or a pair of rtmp (flv) streams.
    Pages that do not expose the iframe are assumed to require a login.
    """

    _VALID_URL = r'https?://(?:www\.)?gdcvault\.com/play/(?P<id>\d+)/(?P<name>(\w|-)+)'
    _TESTS = [
        {
            'url': 'http://www.gdcvault.com/play/1019721/Doki-Doki-Universe-Sweet-Simple',
            'md5': '7ce8388f544c88b7ac11c7ab1b593704',
            'info_dict': {
                'id': '1019721',
                'ext': 'mp4',
                'title': 'Doki-Doki Universe: Sweet, Simple and Genuine (GDC Next 10)'
            }
        },
        {
            'url': 'http://www.gdcvault.com/play/1015683/Embracing-the-Dark-Art-of',
            'info_dict': {
                'id': '1015683',
                'ext': 'flv',
                'title': 'Embracing the Dark Art of Mathematical Modeling in AI'
            },
            'params': {
                'skip_download': True,  # Requires rtmpdump
            }
        },
    ]

    # Captures everything in the player iframe's src that precedes "player.html".
    _XML_ROOT_RE = r'<iframe src="(?P<xml_root>.*?)player.html.*?".*?</iframe>'

    def _parse_mp4(self, xml_description):
        """Build the list of mp4 formats described by the XML document.

        The base URL is the scheme-and-host prefix of ``./metadata/mp4video``;
        each ``MBRVideo`` entry contributes the path that follows the
        ``mp4:`` prefix of its ``streamName``.

        Returns None when the document has no ``mp4video`` element or no
        http(s) base URL can be derived from it, so that the caller can fall
        back to the flv streams.  Otherwise returns a (possibly empty) list
        of format dicts.
        """
        mp4_video = xml_description.find('./metadata/mp4video')
        if mp4_video is None:
            return None

        root_match = re.match(r'(?P<root>https?://.*?/).*', mp4_video.text or '')
        if root_match is None:
            return None
        video_root = root_match.group('root')

        video_formats = []
        for mbr_video in xml_description.findall('./metadata/MBRVideos/MBRVideo'):
            path_match = re.match(
                r'mp4:(?P<path>.*)', mbr_video.findtext('streamName') or '')
            if path_match is None:
                # Without an "mp4:" stream name no download URL can be built.
                continue
            video_format = {'url': video_root + path_match.group('path')}
            try:
                video_format['vbr'] = int(mbr_video.findtext('bitrate'))
            except (TypeError, ValueError):
                # Missing or non-numeric bitrate: keep the format, omit 'vbr'.
                pass
            video_formats.append(video_format)
        return video_formats

    def _parse_flv(self, xml_description):
        """Build the two rtmp formats (slides and speaker) from the XML document.

        Reads ``akamaiHost``, ``slideVideo`` and ``speakerVideo`` under
        ``./metadata``; all three elements are expected to be present.
        """
        akamai_host = xml_description.find('./metadata/akamaiHost').text
        # (element name, format note, quality/preference, format id)
        streams = (
            ('slideVideo', 'slide deck video', -2, 'slides'),
            ('speakerVideo', 'speaker video', -1, 'speaker'),
        )
        video_formats = []
        for tag, note, rank, format_id in streams:
            stream_path = xml_description.find('./metadata/' + tag).text
            video_formats.append({
                'url': 'rtmp://' + akamai_host + '/' + stream_path,
                'format_note': note,
                'quality': rank,
                'preference': rank,
                'format_id': format_id,
            })
        return video_formats

    def _login(self, webpage_url, video_id):
        """Log in, fetch ``webpage_url`` while authenticated, then log out.

        Returns the authenticated page content, or None (after emitting a
        warning) when no username/password is available.
        """
        (username, password) = self._get_login_info()
        if username is None or password is None:
            self.report_warning('It looks like ' + webpage_url + ' requires a login. Try specifying a username and password and try again.')
            return None

        root_url = re.match(r'(?P<root_url>https?://.*?/).*', webpage_url).group('root_url')
        login_url = root_url + 'api/login.php'
        logout_url = root_url + 'logout'

        login_form = {
            'email': username,
            'password': password,
        }

        # urlencode() yields ASCII-only text; Python 3 requires the POST body
        # to be bytes, so encode it explicitly.
        request = compat_urllib_request.Request(
            login_url, compat_urllib_parse.urlencode(login_form).encode('ascii'))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        self._download_webpage(request, video_id, 'Logging in')
        try:
            return self._download_webpage(
                webpage_url, video_id, 'Getting authenticated video page')
        finally:
            # Log out even when fetching the page failed.
            self._download_webpage(logout_url, video_id, 'Logging out')

    def _real_extract(self, url):
        """Extract id, title and formats for the video at ``url``.

        Raises ExtractorError when the page requires authentication and the
        login could not be performed.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        webpage_url = 'http://www.gdcvault.com/play/' + video_id
        start_page = self._download_webpage(webpage_url, video_id)

        xml_root = self._html_search_regex(
            self._XML_ROOT_RE, start_page, 'xml root',
            default=None, fatal=False)

        if xml_root is None:
            # Probably need to authenticate
            authenticated_page = self._login(webpage_url, video_id)
            if authenticated_page is None:
                # Without the player iframe there is nothing to extract;
                # abort cleanly instead of continuing with None values.
                raise ExtractorError('Could not login.', expected=True)
            start_page = authenticated_page
            # Grab the url from the authenticated page
            xml_root = self._html_search_regex(
                self._XML_ROOT_RE, start_page, 'xml root')

        xml_name = self._html_search_regex(
            r'<iframe src=".*?\?xml=(?P<xml_file>.+?\.xml).*?".*?</iframe>',
            start_page, 'xml filename', default=None, fatal=False)
        if xml_name is None:
            # Fallback to the older format
            xml_name = self._html_search_regex(
                r'<iframe src=".*?\?xmlURL=xml/(?P<xml_file>.+?\.xml).*?".*?</iframe>',
                start_page, 'xml filename')

        xml_description_url = xml_root + 'xml/' + xml_name
        xml_description = self._download_xml(xml_description_url, video_id)

        video_title = xml_description.find('./metadata/title').text
        video_formats = self._parse_mp4(xml_description)
        if video_formats is None:
            video_formats = self._parse_flv(xml_description)

        return {
            'id': video_id,
            'title': video_title,
            'formats': video_formats,
        }