[freesound] Fix and extend extraction (closes #11602)
[youtube-dl] / youtube_dl / extractor / freesound.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     determine_ext,
8     float_or_none,
9     get_element_by_class,
10     get_element_by_id,
11     int_or_none,
12     parse_filesize,
13     unified_strdate,
14 )
15
16
class FreesoundIE(InfoExtractor):
    """Extractor for single sound pages under freesound.org/people/<user>/sounds/<id>."""

    # The uploader path segment is matched but not captured, and the id stops
    # at '/', '?', '#' or '&' so a query string or fragment never ends up in
    # the extracted id.
    _VALID_URL = r'https?://(?:www\.)?freesound\.org/people/[^/]+/sounds/(?P<id>[^/?#&]+)'
    _TEST = {
        'url': 'http://www.freesound.org/people/miklovan/sounds/194503/',
        'md5': '12280ceb42c81f19a515c745eae07650',
        'info_dict': {
            'id': '194503',
            'ext': 'mp3',
            'title': 'gulls in the city.wav',
            'uploader': 'miklovan',
            'description': 'the sounds of seagulls in the city',
        }
    }

    def _real_extract(self, url):
        """Download the sound page at `url` and build its info dict.

        The audio URL and the title are looked up with the helpers' default
        failure behaviour; every other lookup passes fatal=False or tolerates
        a missing element, so absent metadata yields None (or an empty list
        for tags) instead of an exception.
        """
        mobj = re.match(self._VALID_URL, url)
        music_id = mobj.group('id')
        webpage = self._download_webpage(url, music_id)

        audio_url = self._og_search_property('audio', webpage, 'song url')
        title = self._og_search_property('audio:title', webpage, 'song title')
        duration = float_or_none(get_element_by_class('duration', webpage), scale=1000)
        release_date = get_element_by_id('sound_date', webpage)

        # Both element lookups may return None; fall back to an empty string
        # so the regex searches below simply find nothing instead of raising.
        tags_html = get_element_by_class('tags', webpage) or ''
        sound_info = get_element_by_id('sound_information_box', webpage) or ''

        description = self._html_search_regex(
            r'<div id="sound_description">(.*?)</div>', webpage, 'description',
            fatal=False, flags=re.DOTALL)

        download_count = int_or_none(self._html_search_regex(
            r'Downloaded.*>(\d+)<', webpage, 'downloaded', fatal=False))

        # Non-greedy captures: stop at the first closing </dd> even when
        # several <dt>/<dd> pairs share one line.
        filesize = float_or_none(parse_filesize(self._search_regex(
            r'Filesize</dt><dd>(.*?)</dd>', sound_info, 'file size (approx)', fatal=False)))

        if release_date:
            # Drop English ordinal suffixes that directly follow a digit
            # ("1st", "2nd", "3rd", "15th") before parsing the date.
            release_date = unified_strdate(
                re.sub(r'(?<=\d)(?:st|nd|rd|th)\b', '', release_date))

        bitdepth = self._html_search_regex(
            r'Bitdepth</dt><dd>(.*?)</dd>', sound_info, 'Bitdepth', fatal=False)

        channels = self._html_search_regex(
            r'Channels</dt><dd>(.*?)</dd>', sound_info, 'Channels info', fatal=False)

        # Only join the parts that were actually found, so a missing value
        # does not show up as the literal text "None".
        format_note = ' '.join(
            part for part in (determine_ext(audio_url), bitdepth, channels) if part)

        formats = [{
            'url': audio_url,
            'id': music_id,
            'format_id': self._og_search_property('audio:type', webpage, 'audio format', fatal=False),
            'format_note': format_note,
            'filesize_approx': filesize,
            'asr': int_or_none(self._html_search_regex(
                r'Samplerate</dt><dd>(\d+).*?</dd>',
                sound_info, 'samplerate', fatal=False)),
        }]

        # Take one tag per <a>...</a> element regardless of how the markup is
        # split across lines, and skip anchors that yield no text.
        tags = []
        for anchor in re.findall(r'(?s)<a\b[^>]*>.*?</a>', tags_html):
            tag = self._html_search_regex(
                r'(?s)<a\b[^>]*>(.*?)</a>', anchor, 'tag', fatal=False)
            if tag:
                tags.append(tag)

        return {
            'id': music_id,
            'title': title,
            'uploader': self._og_search_property('audio:artist', webpage, 'music uploader', fatal=False),
            'description': description,
            'duration': duration,
            'tags': tags,
            'formats': formats,
            'release_date': release_date,
            # NOTE(review): this stores the scraped download count under
            # 'likes_count'; key kept unchanged for compatibility - confirm
            # which info-dict field was intended.
            'likes_count': download_count,
        }