Clarify that download rate is in bytes per second
[youtube-dl] / youtube_dl / extractor / mixcloud.py
1 import json
2 import re
3 import socket
4
5 from .common import InfoExtractor
6 from ..utils import (
7     compat_http_client,
8     compat_str,
9     compat_urllib_error,
10     compat_urllib_request,
11
12     ExtractorError,
13 )
14
15
class MixcloudIE(InfoExtractor):
    """Information extractor for mixcloud.com cloudcast pages.

    The extractor matches URLs of the form ``mixcloud.com/<uploader>/<name>``,
    downloads the JSON description of the cloudcast and returns the first
    reachable audio URL listed in its 'audio_formats' section.

    It is flagged as not working (``_WORKING = False``).
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    @staticmethod
    def _to_text(value):
        """Return *value* decoded as UTF-8 if it is a byte string, else unchanged.

        Calling ``.decode`` unconditionally fails on Python 3 text strings and
        on non-ASCII Python 2 unicode strings, so only real byte strings are
        decoded.
        """
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    @staticmethod
    def _bitrate_sort_key(bitrate):
        """Sort key ordering bitrates numerically when they look like numbers.

        JSON object keys are always strings, and plain string comparison
        would rank '64' above '128'. Non-numeric keys sort below numeric
        ones and are compared as text among themselves.
        """
        try:
            return (1, int(bitrate), u'')
        except (TypeError, ValueError):
            return (0, 0, compat_str(bitrate))

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json.

        jsonData -- mapping of format name to either a URL list or a
                    mapping of bitrate to URL list
        fmt      -- format name to look up in jsonData
        bitrate  -- wanted bitrate; None, 'best' or an unknown value selects
                    the highest available one

        Returns the URL list of the selected bitrate, the entry itself when
        it carries no bitrate information, or an empty list when the
        bitrate mapping is empty. Raises KeyError if fmt is not in jsonData.
        """
        entry = jsonData[fmt]
        if not isinstance(entry, dict):
            # we have no bitrate info: the entry itself is the url list
            return entry
        if not entry:
            return []
        if bitrate is None or bitrate == 'best' or bitrate not in entry:
            bitrate = max(entry, key=self._bitrate_sort_key) # select highest
        return entry[bitrate]

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened."""
        for url in url_list or ():
            try:
                # Only reachability matters here; close the response at once
                # so the connection is not leaked.
                compat_urllib_request.urlopen(url).close()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue
            return url

        return None

    def _print_formats(self, formats):
        """Print one line per available format/bitrate with its file extension."""
        print('Available formats:')
        for fmt in formats:
            entry = formats[fmt]
            if isinstance(entry, dict):
                for bitrate in entry:
                    urls = entry[bitrate]
                    if not urls:
                        continue # nothing to derive an extension from
                    print('%s\t%s\t[%s]' % (fmt, bitrate, urls[0].split('.')[-1]))
            elif entry:
                # we have no bitrate info
                print('%s\t%s\t[%s]' % (fmt, '??', entry[0].split('.')[-1]))

    def _real_extract(self, url):
        """Extract the cloudcast information for *url*.

        Returns a one-element list holding the info dictionary, or None when
        the 'listformats' downloader parameter is set (the formats are then
        only printed).

        Raises ExtractorError if the URL is invalid, the JSON description
        cannot be retrieved, the requested format is not available, or no
        listed media URL can be opened.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = self._to_text(mobj.group(1))
        name = self._to_text(mobj.group(2))
        file_id = uploader + u'-' + name

        # construct API request from the matched groups, so that it does not
        # depend on the page url ending with a slash
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/%s/%s.json' % (uploader, name)
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            response = compat_urllib_request.urlopen(request)
            try:
                raw_json = response.read()
            finally:
                response.close()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON (decoded first, the response body is a byte string)
        json_data = json.loads(self._to_text(raw_json))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        media_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for candidate in formats:
                media_url = self.check_urls(self.get_urls(formats, candidate))
                if media_url is not None:
                    format_param = candidate
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            media_url = self.check_urls(self.get_urls(formats, req_format))
            format_param = req_format

        if media_url is None:
            # Either no format is listed or none of the listed urls answered.
            raise ExtractorError(u'Unable to find a working media URL')

        media_url = self._to_text(media_url)
        return [{
            'id': file_id,
            'url': media_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': media_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]