[niconico] Fix extraction and update tests (closes #5511)
[youtube-dl] / youtube_dl / extractor / niconico.py
1 # encoding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5 import json
6
7 from .common import InfoExtractor
8 from ..compat import (
9     compat_urllib_parse,
10     compat_urllib_request,
11     compat_urlparse,
12 )
13 from ..utils import (
14     ExtractorError,
15     int_or_none,
16     parse_duration,
17     parse_iso8601,
18     xpath_text,
19     determine_ext,
20 )
21
22
class NiconicoIE(InfoExtractor):
    """Extractor for single videos on nicovideo.jp (``/watch/<id>`` URLs).

    When login credentials are available the extractor logs in during
    initialization and asks the ``getflv`` API for the media URL; otherwise
    it falls back to the external (embedded) player endpoint.  Metadata is
    read from the ``getthumbinfo`` XML document in both cases.
    """
    IE_NAME = 'niconico'
    IE_DESC = 'ニコニコ動画'

    _TESTS = [{
        'url': 'http://www.nicovideo.jp/watch/sm22312215',
        'md5': 'd1a75c0823e2f629128c43e1212760f9',
        'info_dict': {
            'id': 'sm22312215',
            'ext': 'mp4',
            'title': 'Big Buck Bunny',
            'uploader': 'takuya0301',
            'uploader_id': '2698420',
            'upload_date': '20131123',
            'timestamp': 1385182762,
            'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
            'duration': 33,
        },
        'params': {
            'username': 'ydl.niconico@gmail.com',
            'password': 'youtube-dl',
        },
    }, {
        'url': 'http://www.nicovideo.jp/watch/nm14296458',
        'md5': '8db08e0158457cf852a31519fceea5bc',
        'info_dict': {
            'id': 'nm14296458',
            'ext': 'swf',
            'title': '【鏡音リン】Dance on media【オリジナル】take2!',
            'description': 'md5:689f066d74610b3b22e0f1739add0f58',
            'uploader': 'りょうた',
            'uploader_id': '18822557',
            'upload_date': '20110429',
            'timestamp': 1304065916,
            'duration': 209,
        },
        'params': {
            'username': 'ydl.niconico@gmail.com',
            'password': 'youtube-dl',
        },
    }, {
        # video exists but is marked as "deleted"
        'url': 'http://www.nicovideo.jp/watch/sm10000',
        'md5': '38e53c9aad548f3ecf01ca7680b59b08',
        'info_dict': {
            'id': 'sm10000',
            'ext': 'unknown_video',
            'description': 'deleted',
            'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
        },
        'params': {
            'username': 'ydl.niconico@gmail.com',
            'password': 'youtube-dl',
        }
    }]

    _VALID_URL = r'https?://(?:www\.|secure\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
    _NETRC_MACHINE = 'niconico'
    # Determine whether the downloader used authentication to download video
    _AUTHENTICATED = False

    def _real_initialize(self):
        """Attempt to log in before any extraction takes place."""
        self._login()

    def _login(self):
        """Log in with the configured credentials, if there are any.

        Returns True when no credentials are configured or when the login
        succeeded, and False when the site answered with its login error
        page.  A failed login only produces a warning, so extraction can
        still continue through the unauthenticated code path.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if not username:
            return True

        # Log in
        login_form_strs = {
            'mail': username,
            'password': password,
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8')
        request = compat_urllib_request.Request(
            'https://secure.nicovideo.jp/secure/login', login_data)
        login_results = self._download_webpage(
            request, None, note='Logging in', errnote='Unable to log in')
        if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
            self._downloader.report_warning('unable to log in: bad username or password')
            return False
        # Successful login
        self._AUTHENTICATED = True
        return True

    def _download_flv_info(self, video_id):
        """Download the raw flv info string for *video_id*.

        The returned text is query-string encoded; the caller parses it.
        The authenticated API is used when a login succeeded, otherwise the
        external player endpoint is queried with its ``thumbPlayKey``.
        """
        if self._AUTHENTICATED:
            return self._download_webpage(
                'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
                video_id, 'Downloading flv info')

        # Get external player info
        ext_player_info = self._download_webpage(
            'http://ext.nicovideo.jp/thumb_watch/' + video_id, video_id)
        thumb_play_key = self._search_regex(
            r'\'thumbPlayKey\'\s*:\s*\'(.*?)\'', ext_player_info, 'thumbPlayKey')

        # The POST body has to be bytes: Python 3's urllib refuses text
        # data.  This mirrors how the login form is encoded in _login().
        flv_info_data = compat_urllib_parse.urlencode({
            'k': thumb_play_key,
            'v': video_id
        }).encode('utf-8')
        flv_info_request = compat_urllib_request.Request(
            'http://ext.nicovideo.jp/thumb_watch', flv_info_data,
            {'Content-Type': 'application/x-www-form-urlencoded'})
        return self._download_webpage(
            flv_info_request, video_id,
            note='Downloading flv info', errnote='Unable to download flv info')

    def _real_extract(self, url):
        """Build the info dict for the video at *url*.

        Raises ExtractorError when the flv info contains no media URL; the
        error is flagged as expected when the flv info marks the video as
        deleted.  Fields whose value is None are left out of the result.
        """
        video_id = self._match_id(url)

        # Get video webpage. We are not actually interested in it for normal
        # cases, but need the cookies in order to be able to download the
        # info webpage
        webpage = self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id)

        video_info = self._download_xml(
            'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
            note='Downloading video info page')

        flv_info = compat_urlparse.parse_qs(self._download_flv_info(video_id))
        if 'url' not in flv_info:
            if 'deleted' in flv_info:
                raise ExtractorError('The video has been deleted.',
                                     expected=True)
            raise ExtractorError('Unable to find video URL')

        video_real_url = flv_info['url'][0]

        # Start extracting information
        title = xpath_text(video_info, './/title')
        if not title:
            # The info XML carries no title; fall back to the watch page
            title = self._html_search_regex(
                r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
                webpage, 'video title')

        extension = xpath_text(video_info, './/movie_type')
        if not extension:
            extension = determine_ext(video_real_url)
        video_format = extension.upper()
        thumbnail = xpath_text(video_info, './/thumbnail_url')
        description = xpath_text(video_info, './/description')
        timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve'))
        view_count = int_or_none(xpath_text(video_info, './/view_counter'))
        comment_count = int_or_none(xpath_text(video_info, './/comment_num'))
        duration = parse_duration(xpath_text(video_info, './/length'))
        webpage_url = xpath_text(video_info, './/watch_url')

        # Channel information takes precedence over user information.
        # xpath_text is used so that a missing name element yields None
        # instead of raising AttributeError on ``None.text``.
        if video_info.find('.//ch_id') is not None:
            uploader_id = xpath_text(video_info, './/ch_id')
            uploader = xpath_text(video_info, './/ch_name')
        elif video_info.find('.//user_id') is not None:
            uploader_id = xpath_text(video_info, './/user_id')
            uploader = xpath_text(video_info, './/user_nickname')
        else:
            uploader_id = uploader = None

        ret = {
            'id': video_id,
            'url': video_real_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description,
            'uploader': uploader,
            'timestamp': timestamp,
            'uploader_id': uploader_id,
            'view_count': view_count,
            'comment_count': comment_count,
            'duration': duration,
            'webpage_url': webpage_url,
        }
        # Drop the fields that could not be determined
        return dict((k, v) for k, v in ret.items() if v is not None)
204
205
class NiconicoPlaylistIE(InfoExtractor):
    """Extractor for nicovideo.jp ``/mylist/<id>`` pages.

    Every item of the list is returned as a URL entry that is delegated to
    NiconicoIE.
    """
    _VALID_URL = r'https?://www\.nicovideo\.jp/mylist/(?P<id>\d+)'

    _TEST = {
        'url': 'http://www.nicovideo.jp/mylist/27411728',
        'info_dict': {
            'id': '27411728',
            'title': 'AKB48のオールナイトニッポン',
        },
        'playlist_mincount': 225,
    }

    def _real_extract(self, url):
        """Return a playlist dict holding one URL entry per mylist item."""
        mylist_id = self._match_id(url)
        page = self._download_webpage(url, mylist_id)

        # The item list is embedded in the page as the JSON array argument
        # of a ``Mylist.preload(<number>, [...]);`` call.
        preload_json = self._search_regex(
            r'Mylist\.preload\(\d+, (\[.*\])\);', page, 'entries')

        playlist_entries = []
        for item in json.loads(preload_json):
            playlist_entries.append({
                '_type': 'url',
                'ie_key': NiconicoIE.ie_key(),
                'url': 'http://www.nicovideo.jp/watch/%s' % item['item_data']['video_id'],
            })

        playlist_title = self._search_regex(r'\s+name: "(.*?)"', page, 'title')

        return {
            '_type': 'playlist',
            'title': playlist_title,
            'id': mylist_id,
            'entries': playlist_entries,
        }