[pornhub] Extract upload date (closes #17574)
[youtube-dl] / youtube_dl / extractor / pornhub.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import functools
5 import itertools
6 import operator
7 import re
8
9 from .common import InfoExtractor
10 from ..compat import (
11     compat_HTTPError,
12     compat_str,
13 )
14 from ..utils import (
15     ExtractorError,
16     int_or_none,
17     js_to_json,
18     orderedSet,
19     remove_quotes,
20     str_to_int,
21     url_or_none,
22 )
23
24
class PornHubIE(InfoExtractor):
    """Extractor for single video pages on pornhub.com and thumbzilla.com.

    The video id (the ``viewkey`` / embed / thumbzilla path component) is
    taken from the URL; the page is always re-fetched from
    ``www.pornhub.com/view_video.php`` regardless of which URL form matched.
    """
    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'upload_date': '20130628',
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'uploader': 'Unknown',
            'upload_date': '20150213',
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # subtitles
        'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
        'info_dict': {
            'id': 'ph5af5fef7c2aa7',
            'ext': 'mp4',
            'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
            'uploader': 'BFFs',
            'duration': 622,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
            'subtitles': {
                'en': [{
                    "ext": 'srt'
                }]
            },
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the list of pornhub.com embed URLs found in <iframe> tags of *webpage*."""
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
            webpage)

    @staticmethod
    def _split_csv(value):
        """Split a comma-separated string into a list, dropping empty items.

        Anything that is not a text string (e.g. a missing or null value)
        yields an empty list, so callers never get ``['']`` or an
        AttributeError from calling ``.split`` on a non-string.
        """
        if not isinstance(value, compat_str):
            return []
        return [item for item in value.split(',') if item]

    def _extract_count(self, pattern, webpage, name):
        """Search *webpage* for *pattern* and convert the match with str_to_int.

        The search is non-fatal; *name* is only used to label the field
        ('<name> count') in diagnostics.
        """
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        """Build the info dict (metadata, formats, subtitles) for one video.

        Raises ExtractorError (expected=True) when the page carries a
        "removed" / user-message section instead of a playable video.
        """
        video_id = self._match_id(url)

        self._set_cookie('pornhub.com', 'age_verified', '1')

        def dl_webpage(platform):
            # Set the 'platform' cookie, then fetch the canonical video page.
            self._set_cookie('pornhub.com', 'platform', platform)
            return self._download_webpage(
                'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
                video_id, 'Downloading %s webpage' % platform)

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
        if error_msg:
            # Collapse runs of whitespace so the message fits on one line.
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        # (url, height-or-None) pairs in discovery order; the set is used
        # purely for de-duplication across the different sources below.
        video_urls = []
        video_urls_set = set()
        subtitles = {}

        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
            if subtitle_url:
                subtitles.setdefault('en', []).append({
                    'url': subtitle_url,
                    'ext': 'srt',
                })
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
            media_definitions = flashvars.get('mediaDefinitions')
            if isinstance(media_definitions, list):
                for definition in media_definitions:
                    if not isinstance(definition, dict):
                        continue
                    video_url = definition.get('videoUrl')
                    if not video_url or not isinstance(video_url, compat_str):
                        continue
                    if video_url in video_urls_set:
                        continue
                    video_urls_set.add(video_url)
                    video_urls.append(
                        (video_url, int_or_none(definition.get('quality'))))
        else:
            thumbnail, duration = [None] * 2

        if not video_urls:
            # Fallback: the 'tv' page variant builds the media URL in inline
            # JavaScript as a concatenation of string variables.
            tv_webpage = dl_webpage('tv')

            assignments = self._search_regex(
                r'(var.+?mediastring.+?)</script>', tv_webpage,
                'encoded url').split(';')

            js_vars = {}

            def parse_js_value(inp):
                # Drop /* ... */ comments, then resolve a '+'-joined
                # expression of quoted literals and known variable names.
                inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
                if '+' in inp:
                    inps = inp.split('+')
                    return functools.reduce(
                        operator.concat, map(parse_js_value, inps))
                inp = inp.strip()
                if inp in js_vars:
                    return js_vars[inp]
                return remove_quotes(inp)

            for assn in assignments:
                assn = assn.strip()
                if not assn:
                    continue
                assn = re.sub(r'var\s+', '', assn)
                if '=' not in assn:
                    # Not an assignment; unpacking below would fail.
                    continue
                vname, value = assn.split('=', 1)
                # Strip the name so "a = ..." is stored under "a", matching
                # the stripped lookups done in parse_js_value and below.
                js_vars[vname.strip()] = parse_js_value(value)

            video_url = js_vars['mediastring']
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        # Direct download buttons on the page are additional format sources.
        for mobj in re.finditer(
                r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
                webpage):
            video_url = mobj.group('url')
            if video_url not in video_urls_set:
                video_urls.append((video_url, None))
                video_urls_set.add(video_url)

        upload_date = None
        formats = []
        for video_url, height in video_urls:
            if not upload_date:
                # The first URL with a /YYYYMM/DD/ path segment provides
                # the upload date.
                upload_date = self._search_regex(
                    r'/(\d{6}/\d{2})/', video_url, 'upload date', default=None)
                if upload_date:
                    upload_date = upload_date.replace('/', '')
            tbr = None
            # e.g. "720P_1500K" embedded in the URL gives height and bitrate.
            mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
            if mobj:
                if not height:
                    height = int(mobj.group('height'))
                tbr = int(mobj.group('tbr'))
            formats.append({
                'url': video_url,
                'format_id': '%dp' % height if height else None,
                'height': height,
                'tbr': tbr,
            })
        self._sort_formats(formats)

        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        page_params = self._parse_json(self._search_regex(
            r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
            webpage, 'page parameters', group='data', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        tags = categories = None
        if page_params:
            # Both fields are comma-separated strings; empty or missing
            # values become [] rather than [''].
            tags = self._split_csv(page_params.get('tags'))
            categories = self._split_csv(page_params.get('categories'))

        return {
            'id': video_id,
            'uploader': video_uploader,
            'upload_date': upload_date,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
            'tags': tags,
            'categories': categories,
            'subtitles': subtitles,
        }
303
304
class PornHubPlaylistBaseIE(InfoExtractor):
    """Shared implementation for PornHub extractors that return video lists."""

    def _extract_entries(self, webpage):
        """Return a url_result entry for each distinct video link in *webpage*.

        Links are (relative URL, title) pairs; duplicates are removed while
        keeping first-seen order.
        """
        # Only process container div with main playlist content skipping
        # drop-down menu that uses similar pattern for videos (see
        # https://github.com/rg3/youtube-dl/issues/11594).
        container = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
            'container', default=webpage)

        return [
            self.url_result(
                'http://www.pornhub.com/%s' % video_url,
                PornHubIE.ie_key(), video_title=title)
            for video_url, title in orderedSet(re.findall(
                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
                container))
        ]

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result.

        Title and description come from the page's embedded playlist JSON
        when available; the title falls back to a non-fatal regex search.
        """
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = self._extract_entries(webpage)

        # The JSON is parsed non-fatally, so a malformed object does not
        # raise here; fall back to an empty dict so the .get() calls below
        # cannot fail on a missing (None) result.
        playlist = self._parse_json(
            self._search_regex(
                r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
                'playlist', default='{}'),
            playlist_id, fatal=False) or {}
        title = playlist.get('title') or self._search_regex(
            r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)

        return self.playlist_result(
            entries, playlist_id, title, playlist.get('description'))
340
341
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    # Matches /playlist/<digits> on pornhub.com or any subdomain; the
    # digits are captured as the `id` group. No methods are defined here,
    # so all extraction behaviour is inherited from PornHubPlaylistBaseIE.
    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/4667351',
        'info_dict': {
            'id': '4667351',
            'title': 'Nataly Hot',
        },
        'playlist_mincount': 2,
    }, {
        # Localized subdomain: URL matching only.
        'url': 'https://de.pornhub.com/playlist/4667351',
        'only_matching': True,
    }]
355
356
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    """Extractor for the paginated video listings of users, channels, models and pornstars."""
    _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
    _TESTS = [{
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'info_dict': {
            'id': 'zoe_ph',
        },
        'playlist_mincount': 171,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }, {
        # default sorting as Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos',
        'info_dict': {
            'id': 'povd',
        },
        'playlist_mincount': 293,
    }, {
        # Top Rated Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
        'only_matching': True,
    }, {
        # Most Recent Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
        'only_matching': True,
    }, {
        # Most Viewed Videos
        'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
        'only_matching': True,
    }, {
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/model/jayndrea/videos/upload',
        'only_matching': True,
    }, {
        'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Collect entries from successive listing pages into one playlist.

        Pages are requested with ``page=1, 2, ...`` until a request fails
        with HTTP 404 or a page contains no entries. Any other download
        error is re-raised.
        """
        user_id = self._match_id(url)

        collected = []
        page_num = 1
        while True:
            note = 'Downloading page %d' % page_num
            try:
                webpage = self._download_webpage(
                    url, user_id, note, query={'page': page_num})
            except ExtractorError as err:
                cause = err.cause
                # Only a 404 marks the end of the listing.
                if not (isinstance(cause, compat_HTTPError) and cause.code == 404):
                    raise
                break
            found = self._extract_entries(webpage)
            if not found:
                break
            collected.extend(found)
            page_num += 1

        return self.playlist_result(collected, user_id)
415
416         return self.playlist_result(entries, user_id)