[peertube] Detect embed URLs in generic extraction (closes #21666)
[youtube-dl] / youtube_dl / extractor / peertube.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import compat_str
8 from ..utils import (
9     int_or_none,
10     parse_resolution,
11     try_get,
12     unified_timestamp,
13     url_or_none,
14     urljoin,
15 )
16
17
class PeerTubeIE(InfoExtractor):
    """Extractor for videos hosted on PeerTube instances.

    Handles watch, embed and API URLs of the instances listed in
    _INSTANCES_RE, as well as the internal ``peertube:<host>:<uuid>``
    form, which allows addressing a video on an arbitrary host.
    """

    # Alternation of known instance host names. It is written for verbose
    # mode, so every pattern embedding it must be compiled with (?x).
    _INSTANCES_RE = r'''(?:
                            # Taken from https://instances.joinpeertube.org/instances
                            tube\.openalgeria\.org|
                            peertube\.pointsecu\.fr|
                            peertube\.nogafa\.org|
                            peertube\.pl|
                            megatube\.lilomoino\.fr|
                            peertube\.tamanoir\.foucry\.net|
                            peertube\.inapurna\.org|
                            peertube\.netzspielplatz\.de|
                            video\.deadsuperhero\.com|
                            peertube\.devosi\.org|
                            peertube\.1312\.media|
                            tube\.worldofhauru\.xyz|
                            tube\.bootlicker\.party|
                            skeptikon\.fr|
                            peertube\.geekshell\.fr|
                            tube\.opportunis\.me|
                            peertube\.peshane\.net|
                            video\.blueline\.mg|
                            tube\.homecomputing\.fr|
                            videos\.cloudfrancois\.fr|
                            peertube\.viviers-fibre\.net|
                            tube\.ouahpiti\.info|
                            video\.tedomum\.net|
                            video\.g3l\.org|
                            fontube\.fr|
                            peertube\.gaialabs\.ch|
                            peertube\.extremely\.online|
                            peertube\.public-infrastructure\.eu|
                            tube\.kher\.nl|
                            peertube\.qtg\.fr|
                            tube\.22decembre\.eu|
                            facegirl\.me|
                            video\.migennes\.net|
                            janny\.moe|
                            tube\.p2p\.legal|
                            video\.atlanti\.se|
                            troll\.tv|
                            peertube\.geekael\.fr|
                            vid\.leotindall\.com|
                            video\.anormallostpod\.ovh|
                            p-tube\.h3z\.jp|
                            tube\.darfweb\.eu|
                            videos\.iut-orsay\.fr|
                            peertube\.solidev\.net|
                            videos\.symphonie-of-code\.fr|
                            testtube\.ortg\.de|
                            videos\.cemea\.org|
                            peertube\.gwendalavir\.eu|
                            video\.passageenseine\.fr|
                            videos\.festivalparminous\.org|
                            peertube\.touhoppai\.moe|
                            peertube\.duckdns\.org|
                            sikke\.fi|
                            peertube\.mastodon\.host|
                            firedragonvideos\.com|
                            vidz\.dou\.bet|
                            peertube\.koehn\.com|
                            peer\.hostux\.social|
                            share\.tube|
                            peertube\.walkingmountains\.fr|
                            medias\.libox\.fr|
                            peertube\.moe|
                            peertube\.xyz|
                            jp\.peertube\.network|
                            videos\.benpro\.fr|
                            tube\.otter\.sh|
                            peertube\.angristan\.xyz|
                            peertube\.parleur\.net|
                            peer\.ecutsa\.fr|
                            peertube\.heraut\.eu|
                            peertube\.tifox\.fr|
                            peertube\.maly\.io|
                            vod\.mochi\.academy|
                            exode\.me|
                            coste\.video|
                            tube\.aquilenet\.fr|
                            peertube\.gegeweb\.eu|
                            framatube\.org|
                            thinkerview\.video|
                            tube\.conferences-gesticulees\.net|
                            peertube\.datagueule\.tv|
                            video\.lqdn\.fr|
                            meilleurtube\.delire\.party|
                            tube\.mochi\.academy|
                            peertube\.dav\.li|
                            media\.zat\.im|
                            pytu\.be|
                            peertube\.valvin\.fr|
                            peertube\.nsa\.ovh|
                            video\.colibris-outilslibres\.org|
                            video\.hispagatos\.org|
                            tube\.svnet\.fr|
                            peertube\.video|
                            videos\.lecygnenoir\.info|
                            peertube3\.cpy\.re|
                            peertube2\.cpy\.re|
                            videos\.tcit\.fr|
                            peertube\.cpy\.re
                        )'''
    # Video ids are UUIDs (hex digits in 8-4-4-4-12 groups)
    _UUID_RE = r'[\da-fA-F]{8}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{4}-[\da-fA-F]{12}'
    # Either the internal peertube:<host>:<uuid> form (any host) or an
    # http(s) watch/embed/API URL on one of the known instances
    _VALID_URL = r'''(?x)
                    (?:
                        peertube:(?P<host>[^:]+):|
                        https?://(?P<host_2>%s)/(?:videos/(?:watch|embed)|api/v\d/videos)/
                    )
                    (?P<id>%s)
                    ''' % (_INSTANCES_RE, _UUID_RE)
    _TESTS = [{
        'url': 'https://peertube.moe/videos/watch/2790feb0-8120-4e63-9af3-c943c69f5e6c',
        'md5': '80f24ff364cc9d333529506a263e7feb',
        'info_dict': {
            'id': '2790feb0-8120-4e63-9af3-c943c69f5e6c',
            'ext': 'mp4',
            'title': 'wow',
            'description': 'wow such video, so gif',
            'thumbnail': r're:https?://.*\.(?:jpg|png)',
            'timestamp': 1519297480,
            'upload_date': '20180222',
            'uploader': 'Luclu7',
            'uploader_id': '7fc42640-efdb-4505-a45d-a15b1a5496f1',
            'uploader_url': 'https://peertube.nsa.ovh/accounts/luclu7',
            'license': 'Unknown',
            'duration': 3,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'tags': list,
            'categories': list,
        }
    }, {
        'url': 'https://peertube.tamanoir.foucry.net/videos/watch/0b04f13d-1e18-4f1d-814e-4979aa7c9c44',
        'only_matching': True,
    }, {
        # nsfw
        'url': 'https://tube.22decembre.eu/videos/watch/9bb88cd3-9959-46d9-9ab9-33d2bb704c39',
        'only_matching': True,
    }, {
        'url': 'https://tube.22decembre.eu/videos/embed/fed67262-6edb-4d1c-833b-daa9085c71d7',
        'only_matching': True,
    }, {
        'url': 'https://tube.openalgeria.org/api/v1/videos/c1875674-97d0-4c94-a058-3f7e64c962e8',
        'only_matching': True,
    }, {
        'url': 'peertube:video.blender.org:b37a5b9f-e6b5-415c-b700-04a5cd6ec205',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_peertube_url(webpage, source_url):
        """Recognize a PeerTube video page hosted on an unlisted instance.

        source_url must look like a watch/embed URL (any host) and webpage
        must contain one of the marker strings checked below.

        Returns a 'peertube:<host>:<id>' URL, or None when either check
        fails.
        """
        mobj = re.match(
            r'https?://(?P<host>[^/]+)/videos/(?:watch|embed)/(?P<id>%s)'
            % PeerTubeIE._UUID_RE, source_url)
        if mobj and any(p in webpage for p in (
                '<title>PeerTube<',
                'There will be other non JS-based clients to access PeerTube',
                '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')):
            return 'peertube:%s:%s' % mobj.group('host', 'id')
        return None

    @staticmethod
    def _extract_urls(webpage, source_url):
        """Collect PeerTube video URLs referenced by webpage.

        Embed iframes pointing at a known instance take precedence; when
        there are none, the page itself is probed via
        _extract_peertube_url.

        Returns a (possibly empty) list of URLs.
        """
        # _INSTANCES_RE is non-capturing, so findall yields the single
        # "url" group as plain strings
        entries = re.findall(
            r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)'''
            % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage)
        if not entries:
            peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url)
            if peertube_url:
                entries = [peertube_url]
        return entries

    def _real_extract(self, url):
        """Fetch the video's metadata from the instance API and build the info dict."""
        mobj = re.match(self._VALID_URL, url)
        # "host" is set for the peertube: form, "host_2" for http(s) URLs
        host = mobj.group('host') or mobj.group('host_2')
        video_id = mobj.group('id')

        video = self._download_json(
            'https://%s/api/v1/videos/%s' % (host, video_id), video_id)

        title = video['name']

        formats = []
        for file_ in video['files']:
            if not isinstance(file_, dict):
                continue
            file_url = url_or_none(file_.get('fileUrl'))
            if not file_url:
                continue
            file_size = int_or_none(file_.get('size'))
            # The resolution label doubles as the format id and as the
            # input for deriving the format's resolution fields
            format_id = try_get(
                file_, lambda x: x['resolution']['label'], compat_str)
            f = parse_resolution(format_id)
            f.update({
                'url': file_url,
                'format_id': format_id,
                'filesize': file_size,
            })
            formats.append(f)
        self._sort_formats(formats)

        def account_data(field):
            # String-valued field of the owning account, None if absent
            return try_get(video, lambda x: x['account'][field], compat_str)

        category = try_get(video, lambda x: x['category']['label'], compat_str)
        categories = [category] if category else None

        nsfw = video.get('nsfw')
        # Only trust an actual boolean; "nsfw is bool" compared the value
        # with the bool type itself and therefore never matched
        if isinstance(nsfw, bool):
            age_limit = 18 if nsfw else 0
        else:
            age_limit = None

        # Resolve the thumbnail path against an explicit https URL: the
        # input may be of the peertube:<host>:<id> form, which is not a
        # usable base for joining a path
        webpage_url = 'https://%s/videos/watch/%s' % (host, video_id)

        return {
            'id': video_id,
            'title': title,
            'description': video.get('description'),
            'thumbnail': urljoin(webpage_url, video.get('thumbnailPath')),
            'timestamp': unified_timestamp(video.get('publishedAt')),
            'uploader': account_data('displayName'),
            'uploader_id': account_data('uuid'),
            'uploader_url': account_data('url'),
            'license': try_get(
                video, lambda x: x['licence']['label'], compat_str),
            'duration': int_or_none(video.get('duration')),
            'view_count': int_or_none(video.get('views')),
            'like_count': int_or_none(video.get('likes')),
            'dislike_count': int_or_none(video.get('dislikes')),
            'age_limit': age_limit,
            'tags': try_get(video, lambda x: x['tags'], list),
            'categories': categories,
            'formats': formats,
        }