[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / yandexmusic.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5 import hashlib
6
7 from .common import InfoExtractor
8 from ..compat import compat_str
9 from ..utils import (
10     ExtractorError,
11     int_or_none,
12     float_or_none,
13     try_get,
14 )
15
16
class YandexMusicBaseIE(InfoExtractor):
    """Common API-error and CAPTCHA handling for the Yandex.Music extractors."""

    # Russian-language notice ("...requests from your IP address look
    # automated"); finding it in a downloaded page is treated as a CAPTCHA wall.
    _CAPTCHA_MARKER = 'Нам очень жаль, но запросы, поступившие с вашего IP-адреса, похожи на автоматические.'

    @staticmethod
    def _handle_error(response):
        """Raise ExtractorError when a decoded JSON response reports a failure.

        Only dict responses are inspected; anything else (lists, False, None)
        passes through silently. A truthy 'error' value is raised as-is, and a
        response of type 'captcha' or carrying a 'captcha' key raises the
        CAPTCHA error.
        """
        if isinstance(response, dict):
            error = response.get('error')
            if error:
                raise ExtractorError(error, expected=True)
            if response.get('type') == 'captcha' or 'captcha' in response:
                YandexMusicBaseIE._raise_captcha()

    @staticmethod
    def _raise_captcha():
        """Raise an expected ExtractorError explaining how to get past the CAPTCHA."""
        raise ExtractorError(
            'YandexMusic has considered youtube-dl requests automated and '
            'asks you to solve a CAPTCHA. You can either wait for some '
            'time until unblocked and optionally use --sleep-interval '
            'in future or alternatively you can go to https://music.yandex.ru/ '
            'solve CAPTCHA, then export cookies and pass cookie file to '
            'youtube-dl with --cookies',
            expected=True)

    def _download_webpage_handle(self, *args, **kwargs):
        """Download a page via the parent class and reject CAPTCHA pages.

        All arguments are forwarded unchanged and the parent's result is
        returned unchanged. Raises ExtractorError (via _raise_captcha) when
        the downloaded text contains the CAPTCHA notice.
        """
        result = super(YandexMusicBaseIE, self)._download_webpage_handle(*args, **kwargs)
        # The parent yields a (content, url handle) pair, or False when a
        # non-fatal download fails. The text must be unpacked before searching:
        # `in` on the pair only compares whole elements, and `in` on False
        # raises TypeError.
        if isinstance(result, tuple) and result:
            webpage = result[0]
            if isinstance(webpage, compat_str) and self._CAPTCHA_MARKER in webpage:
                self._raise_captcha()
        return result

    def _download_json(self, *args, **kwargs):
        """Download and decode JSON via the parent class, then check it for API errors.

        Returns whatever the parent returns; raises ExtractorError when
        _handle_error finds an error or CAPTCHA marker in the response.
        """
        response = super(YandexMusicBaseIE, self)._download_json(*args, **kwargs)
        self._handle_error(response)
        return response
48
49
class YandexMusicTrackIE(YandexMusicBaseIE):
    """Extractor for a single Yandex.Music track addressed by album and track id."""

    IE_NAME = 'yandexmusic:track'
    IE_DESC = 'Яндекс.Музыка - Трек'
    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<album_id>\d+)/track/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://music.yandex.ru/album/540508/track/4878838',
        'md5': 'f496818aa2f60b6c0062980d2e00dc20',
        'info_dict': {
            'id': '4878838',
            'ext': 'mp3',
            'title': 'Carlo Ambrosio & Fabio Di Bari - Gypsy Eyes 1',
            'filesize': 4628061,
            'duration': 193.04,
            'track': 'Gypsy Eyes 1',
            'album': 'Gypsy Soul',
            'album_artist': 'Carlo Ambrosio',
            'artist': 'Carlo Ambrosio & Fabio Di Bari',
            'release_year': 2009,
        },
        'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        # multiple disks
        'url': 'http://music.yandex.ru/album/3840501/track/705105',
        'md5': 'ebe7b4e2ac7ac03fe11c19727ca6153e',
        'info_dict': {
            'id': '705105',
            'ext': 'mp3',
            'title': 'Hooverphonic - Sometimes',
            'filesize': 5743386,
            'duration': 239.27,
            'track': 'Sometimes',
            'album': 'The Best of Hooverphonic',
            'album_artist': 'Hooverphonic',
            'artist': 'Hooverphonic',
            'release_year': 2016,
            'genre': 'pop',
            'disc_number': 2,
            'track_number': 9,
        },
        'skip': 'Travis CI servers blocked by YandexMusic',
    }]

    def _real_extract(self, url):
        """Build the info dict for one track.

        Downloads the track metadata, then the download descriptor and the
        storage location it points to, and derives the signed mp3 URL from
        them. Album and artist fields are filled in only when the metadata
        provides them in the expected shape.
        """
        mobj = re.match(self._VALID_URL, url)
        album_id, track_id = mobj.group('album_id'), mobj.group('id')

        track = self._download_json(
            'http://music.yandex.ru/handlers/track.jsx?track=%s:%s' % (track_id, album_id),
            track_id, 'Downloading track JSON')['track']
        track_title = track['title']

        download_data = self._download_json(
            'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id),
            track_id, 'Downloading track location url JSON',
            headers={'X-Retpath-Y': url})

        fd_data = self._download_json(
            download_data['src'], track_id,
            'Downloading track location JSON',
            query={'format': 'json'})
        # Path component of the media URL: md5 over a fixed salt, the storage
        # path without its first character, and the 's' value of the reply.
        key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + fd_data['path'][1:] + fd_data['s']).encode('utf-8')).hexdigest()
        storage = track['storageDir'].split('.')
        # No trailing whitespace here: a stray space would become part of the
        # track-id query value.
        f_url = 'http://%s/get-mp3/%s/%s?track-id=%s' % (fd_data['host'], key, fd_data['ts'] + fd_data['path'], storage[1])

        thumbnail = None
        # The cover is optional metadata: tolerate a missing, empty or
        # malformed 'albums' list instead of failing the whole extraction.
        cover_uri = try_get(track, lambda x: x['albums'][0]['coverUri'], compat_str)
        if cover_uri:
            thumbnail = cover_uri.replace('%%', 'orig')
            if not thumbnail.startswith('http'):
                thumbnail = 'http://' + thumbnail

        track_info = {
            'id': track_id,
            'ext': 'mp3',
            'url': f_url,
            'filesize': int_or_none(track.get('fileSize')),
            'duration': float_or_none(track.get('durationMs'), 1000),
            'thumbnail': thumbnail,
            'track': track_title,
            'acodec': download_data.get('codec'),
            'abr': int_or_none(download_data.get('bitrate')),
        }

        def extract_artist_name(artist):
            """Return the artist name followed by any 'decomposed' parts.

            'decomposed' entries may be dicts carrying a 'name' or plain
            strings; all parts are concatenated without a separator.
            """
            decomposed = artist.get('decomposed')
            if not isinstance(decomposed, list):
                return artist['name']
            parts = [artist['name']]
            for element in decomposed:
                if isinstance(element, dict) and element.get('name'):
                    parts.append(element['name'])
                elif isinstance(element, compat_str):
                    parts.append(element)
            return ''.join(parts)

        def extract_artist(artist_list):
            """Join the names of all named artists with ', '; None if there are none."""
            if artist_list and isinstance(artist_list, list):
                artists_names = [
                    extract_artist_name(a) for a in artist_list
                    if isinstance(a, dict) and a.get('name')]
                if artists_names:
                    return ', '.join(artists_names)

        albums = track.get('albums')
        if albums and isinstance(albums, list):
            # Only the first album entry is used for album-level metadata.
            album = albums[0]
            if isinstance(album, dict):
                year = album.get('year')
                disc_number = int_or_none(try_get(
                    album, lambda x: x['trackPosition']['volume']))
                track_number = int_or_none(try_get(
                    album, lambda x: x['trackPosition']['index']))
                track_info.update({
                    'album': album.get('title'),
                    'album_artist': extract_artist(album.get('artists')),
                    'release_year': int_or_none(year),
                    'genre': album.get('genre'),
                    'disc_number': disc_number,
                    'track_number': track_number,
                })

        track_artist = extract_artist(track.get('artists'))
        if track_artist:
            track_info.update({
                'artist': track_artist,
                'title': '%s - %s' % (track_artist, track_title),
            })
        else:
            track_info['title'] = track_title

        return track_info
180
181
class YandexMusicPlaylistBaseIE(YandexMusicBaseIE):
    """Base for extractors that turn lists of track dicts into playlist entries."""

    def _build_playlist(self, tracks):
        """Return one url_result per track that has a non-empty 'albums' list.

        Each entry points at the track page under the track's first album;
        tracks without usable album information are left out.
        """
        entries = []
        for item in tracks:
            album_list = item.get('albums')
            if not album_list or not isinstance(album_list, list):
                continue
            track_url = 'http://music.yandex.ru/album/%s/track/%s' % (
                album_list[0]['id'], item['id'])
            entries.append(self.url_result(track_url))
        return entries
188
189
class YandexMusicAlbumIE(YandexMusicPlaylistBaseIE):
    """Extractor that exposes a Yandex.Music album as a playlist of its tracks."""

    IE_NAME = 'yandexmusic:album'
    IE_DESC = 'Яндекс.Музыка - Альбом'
    _VALID_URL = r'https?://music\.yandex\.(?:ru|kz|ua|by)/album/(?P<id>\d+)/?(\?|$)'

    _TESTS = [{
        'url': 'http://music.yandex.ru/album/540508',
        'info_dict': {
            'id': '540508',
            'title': 'Carlo Ambrosio - Gypsy Soul (2009)',
        },
        'playlist_count': 50,
        'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        'url': 'https://music.yandex.ru/album/3840501',
        'info_dict': {
            'id': '3840501',
            'title': 'Hooverphonic - The Best of Hooverphonic (2016)',
        },
        'playlist_count': 33,
        'skip': 'Travis CI servers blocked by YandexMusic',
    }]

    def _real_extract(self, url):
        """Return a playlist result holding every track of the album.

        The playlist title is '<first artist> - <album title> (<year>)'; the
        artist prefix and the year suffix are each omitted when the album
        JSON does not provide them.
        """
        album_id = self._match_id(url)

        album = self._download_json(
            'http://music.yandex.ru/handlers/album.jsx?album=%s' % album_id,
            album_id, 'Downloading album JSON')

        # 'volumes' is a list of discs, each a list of track dicts; flatten it.
        entries = self._build_playlist([track for volume in album['volumes'] for track in volume])

        title = album['title']
        # The artist is decoration for the title only, so an album without an
        # artist entry must not abort the extraction.
        artist = try_get(album, lambda x: x['artists'][0]['name'], compat_str)
        if artist:
            title = '%s - %s' % (artist, title)
        year = album.get('year')
        if year:
            title += ' (%s)' % year

        return self.playlist_result(entries, compat_str(album['id']), title)
228
229
class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
    """Extractor for user playlists on Yandex.Music."""

    IE_NAME = 'yandexmusic:playlist'
    IE_DESC = 'Яндекс.Музыка - Плейлист'
    _VALID_URL = r'https?://music\.yandex\.(?P<tld>ru|kz|ua|by)/users/(?P<user>[^/]+)/playlists/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://music.yandex.ru/users/music.partners/playlists/1245',
        'info_dict': {
            'id': '1245',
            'title': 'Что слушают Enter Shikari',
            'description': 'md5:3b9f27b0efbe53f2ee1e844d07155cc9',
        },
        'playlist_count': 6,
        'skip': 'Travis CI servers blocked by YandexMusic',
    }, {
        # playlist exceeding the limit of 150 tracks shipped with webpage (see
        # https://github.com/ytdl-org/youtube-dl/issues/6666)
        'url': 'https://music.yandex.ru/users/ya.playlist/playlists/1036',
        'info_dict': {
            'id': '1036',
            'title': 'Музыка 90-х',
        },
        'playlist_mincount': 300,
        'skip': 'Travis CI servers blocked by YandexMusic',
    }]

    def _real_extract(self, url):
        """Return a playlist result for the playlist addressed by ``url``.

        Fetches the playlist JSON from the same top-level domain as the URL
        and, when it ships fewer track objects than track ids, makes one
        best-effort request for the tracks that are missing.
        """
        mobj = re.match(self._VALID_URL, url)
        tld = mobj.group('tld')
        user = mobj.group('user')
        playlist_id = mobj.group('id')

        # This request is mandatory: its result is subscripted right away, so
        # it is left fatal and a failed download surfaces as a regular
        # extractor error instead of a TypeError on False['playlist'].
        playlist = self._download_json(
            'https://music.yandex.%s/handlers/playlist.jsx' % tld,
            playlist_id, 'Downloading playlist JSON',
            headers={
                'Referer': url,
                'X-Requested-With': 'XMLHttpRequest',
                'X-Retpath-Y': url,
            },
            query={
                'owner': user,
                'kinds': playlist_id,
                'light': 'true',
                'lang': tld,
                'external-domain': 'music.yandex.%s' % tld,
                'overembed': 'false',
            })['playlist']

        tracks = playlist['tracks']
        track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]

        # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
        # missing tracks should be retrieved manually.
        if len(tracks) < len(track_ids):
            present_track_ids = set(
                compat_str(track['id'])
                for track in tracks if track.get('id'))
            missing_track_ids = [
                track_id for track_id in track_ids
                if track_id not in present_track_ids]
            # Skip the request when nothing is actually missing (for example
            # when the id list merely contains duplicates).
            if missing_track_ids:
                # Best effort: a failure here still leaves the tracks that
                # were shipped with the playlist itself.
                missing_tracks = self._download_json(
                    'https://music.yandex.%s/handlers/track-entries.jsx' % tld,
                    playlist_id, 'Downloading missing tracks JSON',
                    fatal=False,
                    headers={
                        'Referer': url,
                        'X-Requested-With': 'XMLHttpRequest',
                    },
                    query={
                        'entries': ','.join(missing_track_ids),
                        'lang': tld,
                        'external-domain': 'music.yandex.%s' % tld,
                        'overembed': 'false',
                        'strict': 'true',
                    })
                # Only a list of track dicts can be merged; extending with a
                # dict would add its keys as bogus tracks.
                if missing_tracks and isinstance(missing_tracks, list):
                    tracks.extend(missing_tracks)

        return self.playlist_result(
            self._build_playlist(tracks),
            compat_str(playlist_id),
            playlist.get('title'), playlist.get('description'))