[pornhub] Decode obfuscated video URL (closes #12470)
[youtube-dl] / youtube_dl / extractor / pornhub.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import functools
5 import itertools
6 import operator
7 # import os
8 import re
9
10 from .common import InfoExtractor
11 from ..compat import (
12     compat_HTTPError,
13     # compat_urllib_parse_unquote,
14     # compat_urllib_parse_unquote_plus,
15     # compat_urllib_parse_urlparse,
16 )
17 from ..utils import (
18     ExtractorError,
19     int_or_none,
20     js_to_json,
21     orderedSet,
22     # sanitized_Request,
23     str_to_int,
24 )
25 # from ..aes import (
26 #     aes_decrypt_text
27 # )
28
29
class PornHubIE(InfoExtractor):
    """Extractor for single PornHub / Thumbzilla video pages and embeds."""

    IE_DESC = 'PornHub and Thumbzilla'
    _VALID_URL = r'''(?x)
                    https?://
                        (?:
                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
                            (?:www\.)?thumbzilla\.com/video/
                        )
                        (?P<id>[\da-z]+)
                    '''
    _TESTS = [{
        'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
        'md5': '1e19b41231a02eba417839222ac9d58e',
        'info_dict': {
            'id': '648719015',
            'ext': 'mp4',
            'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
            'uploader': 'Babes',
            'duration': 361,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
    }, {
        # non-ASCII title
        'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
        'info_dict': {
            'id': '1331683002',
            'ext': 'mp4',
            'title': '重庆婷婷女王足交',
            'uploader': 'cj397186295',
            'duration': 1753,
            'view_count': int,
            'like_count': int,
            'dislike_count': int,
            'comment_count': int,
            'age_limit': 18,
            'tags': list,
            'categories': list,
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
        'only_matching': True,
    }, {
        # removed at the request of cam4.com
        'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
        'only_matching': True,
    }, {
        # removed at the request of the copyright owner
        'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
        'only_matching': True,
    }, {
        # removed by uploader
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
        'only_matching': True,
    }, {
        # private video
        'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
        'only_matching': True,
    }, {
        'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the PornHub embed URLs found in <iframe> tags of webpage."""
        return re.findall(
            r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
            webpage)

    def _extract_count(self, pattern, webpage, name):
        """Search webpage for pattern and convert the captured text with
        str_to_int; the search is non-fatal."""
        return str_to_int(self._search_regex(
            pattern, webpage, '%s count' % name, fatal=False))

    def _decode_media_url(self, tv_webpage, video_id):
        """Rebuild the media URL from the obfuscated script on the TV page.

        The page assembles the URL with a chain of statements of the form
        ``var name = "piece" + /* noise */ other_name;`` ending in an
        assignment to ``mediastring``. The statements are evaluated in order
        with a tiny interpreter that only understands string literals,
        previously assigned variables and ``+`` concatenation.

        Raises ExtractorError when the script contains anything else or
        never assigns ``mediastring``.
        """
        script = self._search_regex(
            r'(var.*mediastring.*)</script>', tv_webpage, 'encoded url')
        # Drop the /* ... */ noise up front. The non-greedy form also copes
        # with comments that contain a '*' themselves.
        script = re.sub(r'(?s)/\*.*?\*/', '', script)

        js_vars = {}

        def parse_js_value(expr):
            if '+' in expr:
                return functools.reduce(
                    operator.concat, map(parse_js_value, expr.split('+')))

            expr = expr.strip()
            if expr in js_vars:
                return js_vars[expr]

            # Anything that is not a known variable must be a quoted string
            if len(expr) >= 2 and expr[0] == expr[-1] and expr[0] in '"\'':
                return expr[1:-1]
            raise ExtractorError(
                'Unable to decode JS value: %s' % expr, video_id=video_id)

        for statement in script.split(';'):
            statement = statement.strip()
            if not statement:
                continue

            # Whitespace around the name and '=' is tolerated so the stored
            # key always matches the stripped name used for lookups.
            mobj = re.match(
                r'(?s)(?:var\s+)?(?P<name>[\w$]+)\s*=(?P<value>.+)', statement)
            if not mobj:
                raise ExtractorError(
                    'Unable to parse JS statement: %s' % statement,
                    video_id=video_id)

            js_vars[mobj.group('name')] = parse_js_value(mobj.group('value'))

        if 'mediastring' not in js_vars:
            raise ExtractorError(
                'Unable to find mediastring assignment', video_id=video_id)
        return js_vars['mediastring']

    def _real_extract(self, url):
        """Download the desktop and TV variants of the video page and build
        the info dict: the media URL comes from the TV page, the remaining
        metadata from the desktop page.

        Raises ExtractorError (expected) when the site reports the video as
        removed or otherwise unavailable.
        """
        video_id = self._match_id(url)

        def dl_webpage(platform):
            # The platform cookie selects which page variant is served
            return self._download_webpage(
                'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
                video_id, headers={
                    'Cookie': 'age_verified=1; platform=%s' % platform,
                })

        webpage = dl_webpage('pc')

        error_msg = self._html_search_regex(
            r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
            webpage, 'error message', default=None, group='error')
        if error_msg:
            error_msg = re.sub(r'\s+', ' ', error_msg)
            raise ExtractorError(
                'PornHub said: %s' % error_msg,
                expected=True, video_id=video_id)

        tv_webpage = dl_webpage('tv')

        video_url = self._decode_media_url(tv_webpage, video_id)

        title = self._search_regex(
            r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)

        # video_title from flashvars contains whitespace instead of non-ASCII (see
        # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
        # on that anymore.
        title = title or self._html_search_meta(
            'twitter:title', webpage, default=None) or self._search_regex(
            (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
             r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
             r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
            webpage, 'title', group='title')

        flashvars = self._parse_json(
            self._search_regex(
                r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
            video_id)
        if flashvars:
            thumbnail = flashvars.get('image_url')
            duration = int_or_none(flashvars.get('video_duration'))
        else:
            # Only the flashvars-derived fields are unavailable here; the
            # title was obtained independently above and must be kept.
            thumbnail, duration = None, None

        video_uploader = self._html_search_regex(
            r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
            webpage, 'uploader', fatal=False)

        view_count = self._extract_count(
            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
        like_count = self._extract_count(
            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
        dislike_count = self._extract_count(
            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
        comment_count = self._extract_count(
            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')

        page_params = self._parse_json(self._search_regex(
            r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
            webpage, 'page parameters', group='data', default='{}'),
            video_id, transform_source=js_to_json, fatal=False)
        tags = categories = None
        if page_params:
            tags = page_params.get('tags', '').split(',')
            categories = page_params.get('categories', '').split(',')

        return {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'view_count': view_count,
            'like_count': like_count,
            'dislike_count': dislike_count,
            'comment_count': comment_count,
            'age_limit': 18,
            'tags': tags,
            'categories': categories,
        }
228
229
class PornHubPlaylistBaseIE(InfoExtractor):
    """Shared logic for PornHub pages that list several videos."""

    def _extract_entries(self, webpage):
        """Return one url_result per distinct (link, title) pair found in
        webpage, in order of first appearance."""
        link_title_pairs = re.findall(
            r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
            webpage)

        results = []
        for relative_link, link_title in orderedSet(link_title_pairs):
            results.append(self.url_result(
                'http://www.pornhub.com/%s' % relative_link,
                PornHubIE.ie_key(), video_title=link_title))
        return results

    def _real_extract(self, url):
        """Download the playlist page and return a playlist result with the
        title and description read from its ``playlistObject`` JSON."""
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)

        # Only process container div with main playlist content skipping
        # drop-down menu that uses similar pattern for videos (see
        # https://github.com/rg3/youtube-dl/issues/11594).
        main_content = self._search_regex(
            r'(?s)(<div[^>]+class=["\']container.+)', page,
            'container', default=page)
        entries = self._extract_entries(main_content)

        playlist_json = self._search_regex(
            r'playlistObject\s*=\s*({.+?});', page, 'playlist')
        playlist = self._parse_json(playlist_json, playlist_id)

        playlist_title = playlist.get('title')
        playlist_description = playlist.get('description')
        return self.playlist_result(
            entries, playlist_id, playlist_title, playlist_description)
262
263
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    """Extractor for numeric PornHub playlist URLs; all extraction logic is
    inherited from PornHubPlaylistBaseIE."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'http://www.pornhub.com/playlist/4667351',
            'info_dict': {'id': '4667351', 'title': 'Nataly Hot'},
            'playlist_mincount': 2,
        },
    ]
274
275
class PornHubUserVideosIE(PornHubPlaylistBaseIE):
    """Extractor for the paginated video listing of a PornHub user."""

    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
    _TESTS = [{
        'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
        'info_dict': {
            'id': 'zoe_ph',
        },
        'playlist_mincount': 171,
    }, {
        'url': 'http://www.pornhub.com/users/rushandlia/videos',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Walk the listing page by page (``?page=N``, starting at 1) and
        collect the entries of every page.

        Paging stops at the first page that returns HTTP 404 or yields no
        entries. Any other download error is propagated to the caller.
        """
        user_id = self._match_id(url)

        entries = []
        for page_num in itertools.count(1):
            try:
                webpage = self._download_webpage(
                    url, user_id, 'Downloading page %d' % page_num,
                    query={'page': page_num})
            except ExtractorError as e:
                # A 404 marks the end of the listing
                if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
                    break
                # Anything else is a real failure: re-raise instead of
                # falling through with a missing or stale webpage, which
                # would crash with NameError on the first page or re-add
                # the previous page's entries on later ones.
                raise
            page_entries = self._extract_entries(webpage)
            if not page_entries:
                break
            entries.extend(page_entries)

        return self.playlist_result(entries, user_id)
305
306         return self.playlist_result(entries, user_id)