[newgrounds:playlist] Add extractor (closes #10611)
[youtube-dl] / youtube_dl / extractor / newgrounds.py
1 from __future__ import unicode_literals
2
3 import re
4
5 from .common import InfoExtractor
6 from ..utils import (
7     extract_attributes,
8     int_or_none,
9 )
10
11
class NewgroundsIE(InfoExtractor):
    """Extractor for a single Newgrounds media page.

    Handles the ``audio/listen/<id>`` and ``portal/view/<id>`` URLs matched
    by ``_VALID_URL`` and produces one info dict per page.
    """

    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:audio/listen|portal/view)/(?P<id>[0-9]+)'
    _TESTS = [{
        'url': 'https://www.newgrounds.com/audio/listen/549479',
        'md5': 'fe6033d297591288fa1c1f780386f07a',
        'info_dict': {
            'id': '549479',
            'ext': 'mp3',
            'title': 'B7 - BusMode',
            'uploader': 'Burn7',
        }
    }, {
        'url': 'https://www.newgrounds.com/portal/view/673111',
        'md5': '3394735822aab2478c31b1004fe5e5bc',
        'info_dict': {
            'id': '673111',
            'ext': 'mp4',
            'title': 'Dancin',
            'uploader': 'Squirrelman82',
        },
    }, {
        # source format unavailable, additional mp4 formats
        'url': 'http://www.newgrounds.com/portal/view/689400',
        'info_dict': {
            'id': '689400',
            'ext': 'mp4',
            'title': 'ZTV News Episode 8',
            'uploader': 'BennettTheSage',
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for the media page at *url*.

        The page is downloaded once; the title, the source media URL, the
        optional ``max_resolution`` value and the uploader are all scraped
        from it with regular expressions.  The uploader lookup is passed
        ``fatal=False``; the title and URL lookups use the helper defaults.

        Returns a dict with the keys ``id``, ``title``, ``uploader`` and
        ``formats``.
        """
        media_id = self._match_id(url)

        webpage = self._download_webpage(url, media_id)

        title = self._html_search_regex(
            r'<title>([^>]+)</title>', webpage, 'title')

        # The capture group keeps the surrounding double quotes so that the
        # value can be decoded as a JSON string literal by _parse_json.
        # The field is named 'url' (it used to be an empty string) so that a
        # failed match yields a meaningful error message.
        video_url = self._parse_json(self._search_regex(
            r'"url"\s*:\s*("[^"]+"),', webpage, 'url'), media_id)

        # The URL found in the page is always offered as the 'source' format.
        formats = [{
            'url': video_url,
            'format_id': 'source',
            'quality': 1,
        }]

        max_resolution = int_or_none(self._search_regex(
            r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
            default=None))
        if max_resolution:
            # Additional mp4 URLs are derived from the source URL by
            # replacing everything after its last '.' with '<height>p.mp4'.
            url_base = video_url.rpartition('.')[0]
            for resolution in (360, 720, 1080):
                # Heights are listed in ascending order, so stop at the
                # first one above the advertised maximum.
                if resolution > max_resolution:
                    break
                formats.append({
                    'url': '%s.%dp.mp4' % (url_base, resolution),
                    'format_id': '%dp' % resolution,
                    'height': resolution,
                })

        # NOTE(review): presumably _check_formats drops formats whose URL is
        # not reachable (cf. the "source format unavailable" test above) --
        # confirm against InfoExtractor.
        self._check_formats(formats, media_id)
        self._sort_formats(formats)

        uploader = self._html_search_regex(
            r'(?:Author|Writer)\s*<a[^>]+>([^<]+)', webpage, 'uploader',
            fatal=False)

        return {
            'id': media_id,
            'title': title,
            'uploader': uploader,
            'formats': formats,
        }
90
91
class NewgroundsPlaylistIE(InfoExtractor):
    """Playlist extractor for Newgrounds collection and search pages."""

    _VALID_URL = r'https?://(?:www\.)?newgrounds\.com/(?:collection|[^/]+/search/[^/]+)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.newgrounds.com/collection/cats',
        'info_dict': {
            'id': 'cats',
            'title': 'Cats',
        },
        'playlist_mincount': 46,
    }, {
        'url': 'http://www.newgrounds.com/portal/search/author/ZONE-SAMA',
        'info_dict': {
            'id': 'ZONE-SAMA',
            'title': 'Portal Search: ZONE-SAMA',
        },
        'playlist_mincount': 47,
    }, {
        'url': 'http://www.newgrounds.com/audio/search/title/cats',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a playlist result for the listing page at *url*.

        Each anchor that links to a ``portal/view/<id>`` or
        ``audio/listen/<id>`` path and whose ``class`` attribute equals
        ``item-portalsubmission`` or ``item-audiosubmission`` becomes one
        ``url_result`` entry delegated to ``NewgroundsIE``.
        """
        list_id = self._match_id(url)
        page = self._download_webpage(url, list_id)

        # The title is optional: default=None is passed to the lookup.
        playlist_title = self._search_regex(
            r'<title>([^>]+)</title>', page, 'title', default=None)

        # Cut the left menu: keep only what follows the "column wide" div,
        # with the full page supplied as the default.
        content = self._search_regex(
            r'(?s)<div[^>]+\bclass=["\']column wide(.+)',
            page, 'wide column', default=page)

        submission_classes = ('item-portalsubmission', 'item-audiosubmission')
        anchors = re.findall(
            r'(<a[^>]+\bhref=["\']/?((?:portal/view|audio/listen)/(\d+))[^>]+>)',
            content)

        # Each match is (whole <a ...> tag, media path, numeric media id).
        entries = [
            self.url_result(
                'https://www.newgrounds.com/%s' % media_path,
                ie=NewgroundsIE.ie_key(), video_id=media_id)
            for anchor_tag, media_path, media_id in anchors
            if extract_attributes(anchor_tag).get('class') in submission_classes
        ]

        return self.playlist_result(entries, list_id, playlist_title)