[pornhub] Add more tests with removed videos
youtube-dl: youtube_dl/extractor/pornhub.py
# coding: utf-8
from __future__ import unicode_literals

import itertools
import os
import re

from .common import InfoExtractor
from ..compat import (
    compat_HTTPError,
    compat_urllib_parse_unquote,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlparse,
)
from ..utils import (
    ExtractorError,
    int_or_none,
    orderedSet,
    sanitized_Request,
    str_to_int,
)
from ..aes import (
    aes_decrypt_text
)


class PornHubIE(InfoExtractor):
    _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'uploader': 'cj397186295',
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }]

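    # Locates an embedded PornHub player iframe in an arbitrary webpage and
    # returns its URL (used for embed detection by other extractors).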
    @classmethod
    def _extract_url(cls, webpage):
        mobj = re.search(
            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
        if mobj:
            return mobj.group('url')

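    # Counts are rendered with separators (e.g. "1,234"); str_to_int strips
    # them before converting.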
    def _extract_count(self, pattern, webpage, name):
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _real_extract(self, url):
        video_id = self._match_id(url)

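        # Request the page with the age gate pre-confirmed via the
        # age_verified cookie.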
        req = sanitized_Request(
            'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

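        # Removed videos (copyright takedowns, uploader deletions, etc.) still
        # return a page containing a "removed" notice; surface that message as
        # an expected error.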
        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\']).*?\bremoved\b.*?\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
        if error_msg:
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

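        # The flashvars_<id> JSON blob carries player metadata such as the
        # thumbnail URL and the duration.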
        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
        else:
            # title is already extracted above; only reset thumbnail and duration
            thumbnail, duration = [None] * 2

        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

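        # Direct media URLs live in player_quality_* variables; when the page
        # marks them as encrypted they are AES-encrypted with the unquoted
        # video_title as the key.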
        video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
        if webpage.find('"encrypted":true') != -1:
            password = compat_urllib_parse_unquote_plus(
                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))

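        # The URL path contains a segment like "480P_600K_..."; its first two
        # "_"-separated parts give the height and the bitrate (tbr).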
        formats = []
        for video_url in video_urls:
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            format = path.split('/')[5].split('_')[:2]
            format = '-'.join(format)

            m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
            if m is None:
                height = None
                tbr = None
            else:
                height = int(m.group('height'))
                tbr = int(m.group('tbr'))

            formats.append({
                'url': video_url,
                'ext': extension,
                'format': format,
                'format_id': format,
                'tbr': tbr,
                'height': height,
            })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'uploader': video_uploader,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'formats': formats,
            'age_limit': 18,
        }


class PornHubPlaylistBaseIE(InfoExtractor):
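    # Both playlist and user video pages link entries as view_video.php
    # anchors whose title attribute carries the video title.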
    def _extract_entries(self, webpage):
        return [
            self.url_result(
                'http://www.pornhub.com/%s' % video_url,
                PornHubIE.ie_key(), video_title=title)
            for video_url, title in orderedSet(re.findall(
                r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
                webpage))
        ]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)

        webpage = self._download_webpage(url, playlist_id)

        entries = self._extract_entries(webpage)

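        # Playlist title and description come from an inline playlistObject
        # JSON blob.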
        playlist = self._parse_json(
            self._search_regex(
                r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
            playlist_id)

        return self.playlist_result(
            entries, playlist_id, playlist.get('title'), playlist.get('description'))


class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]


class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
    _TESTS = [{
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'info_dict': {
            'id': 'zoe_ph',
        },
        'playlist_mincount': 171,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        user_id = self._match_id(url)

        entries = []
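        # Walk the paginated listing via the page query parameter until the
        # site returns 404 or a page with no video links.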
        for page_num in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    url, user_id, 'Downloading page %d' % page_num,
                    query={'page': page_num})
            except ExtractorError as e:
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
                    break
                # re-raise anything other than a 404 so real errors are not
                # silently swallowed with a stale webpage reused below
                raise
            page_entries = self._extract_entries(webpage)
            if not page_entries:
                break
            entries.extend(page_entries)

        return self.playlist_result(entries, user_id)