_ Git - youtube-dl/blob - youtube_dl/extractor/pornhub.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import itertools
   6 import operator
   7 import re
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_HTTPError,
  12     compat_str,
  13     compat_urllib_request,
  14 )
  15 from .openload import PhantomJSwrapper
  16 from ..utils import (
  17     ExtractorError,
  18     int_or_none,
  19     js_to_json,
  20     orderedSet,
  21     remove_quotes,
  22     str_to_int,
  23     url_or_none,
  24 )
  25
  26
  27 class PornHubIE(InfoExtractor):
  28     IE_DESC = 'PornHub and Thumbzilla'
  29     _VALID_URL = r'''(?x)
  30                     https?://
  31                         (?:
  32                             (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  33                             (?:www\.)?thumbzilla\.com/video/
  34                         )
  35                         (?P<id>[\da-z]+)
  36                     '''
  37     _TESTS = [{
  38         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  39         'md5': '1e19b41231a02eba417839222ac9d58e',
  40         'info_dict': {
  41             'id': '648719015',
  42             'ext': 'mp4',
  43             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  44             'uploader': 'Babes',
  45             'upload_date': '20130628',
  46             'duration': 361,
  47             'view_count': int,
  48             'like_count': int,
  49             'dislike_count': int,
  50             'comment_count': int,
  51             'age_limit': 18,
  52             'tags': list,
  53             'categories': list,
  54         },
  55     }, {
  56         # non-ASCII title
  57         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  58         'info_dict': {
  59             'id': '1331683002',
  60             'ext': 'mp4',
  61             'title': '重庆婷婷女王足交',
  62             'uploader': 'Unknown',
  63             'upload_date': '20150213',
  64             'duration': 1753,
  65             'view_count': int,
  66             'like_count': int,
  67             'dislike_count': int,
  68             'comment_count': int,
  69             'age_limit': 18,
  70             'tags': list,
  71             'categories': list,
  72         },
  73         'params': {
  74             'skip_download': True,
  75         },
  76     }, {
  77         # subtitles
  78         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  79         'info_dict': {
  80             'id': 'ph5af5fef7c2aa7',
  81             'ext': 'mp4',
  82             'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  83             'uploader': 'BFFs',
  84             'duration': 622,
  85             'view_count': int,
  86             'like_count': int,
  87             'dislike_count': int,
  88             'comment_count': int,
  89             'age_limit': 18,
  90             'tags': list,
  91             'categories': list,
  92             'subtitles': {
  93                 'en': [{
  94                     "ext": 'srt'
  95                 }]
  96             },
  97         },
  98         'params': {
  99             'skip_download': True,
 100         },
 101     }, {
 102         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
 103         'only_matching': True,
 104     }, {
 105         # removed at the request of cam4.com
 106         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
 107         'only_matching': True,
 108     }, {
 109         # removed at the request of the copyright owner
 110         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
 111         'only_matching': True,
 112     }, {
 113         # removed by uploader
 114         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
 115         'only_matching': True,
 116     }, {
 117         # private video
 118         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
 119         'only_matching': True,
 120     }, {
 121         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
 122         'only_matching': True,
 123     }, {
 124         'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
 125         'only_matching': True,
 126     }, {
 127         'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
 128         'only_matching': True,
 129     }]
 130
 131     def _download_webpage_handle(self, *args, **kwargs):
 132         def dl(*args, **kwargs):
 133             return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)
 134
 135         webpage, urlh = dl(*args, **kwargs)
 136
 137         if any(re.search(p, webpage) for p in (
 138                 r'<body\b[^>]+\bonload=["\']go\(\)',
 139                 r'document\.cookie\s*=\s*["\']RNKEY=',
 140                 r'document\.location\.reload\(true\)')):
 141             url_or_request = args[0]
 142             url = (url_or_request.get_full_url()
 143                    if isinstance(url_or_request, compat_urllib_request.Request)
 144                    else url_or_request)
 145             phantom = PhantomJSwrapper(self, required_version='2.0')
 146             phantom.get(url, html=webpage)
 147             webpage, urlh = dl(*args, **kwargs)
 148
 149         return webpage, urlh
 150
 151     @staticmethod
 152     def _extract_urls(webpage):
 153         return re.findall(
 154             r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
 155             webpage)
 156
 157     def _extract_count(self, pattern, webpage, name):
 158         return str_to_int(self._search_regex(
 159             pattern, webpage, '%s count' % name, fatal=False))
 160
 161     def _real_extract(self, url):
 162         mobj = re.match(self._VALID_URL, url)
 163         host = mobj.group('host') or 'pornhub.com'
 164         video_id = mobj.group('id')
 165
 166         self._set_cookie(host, 'age_verified', '1')
 167
 168         def dl_webpage(platform):
 169             self._set_cookie(host, 'platform', platform)
 170             return self._download_webpage(
 171                 'http://www.%s/view_video.php?viewkey=%s' % (host, video_id),
 172                 video_id, 'Downloading %s webpage' % platform)
 173
 174         webpage = dl_webpage('pc')
 175
 176         error_msg = self._html_search_regex(
 177             r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
 178             webpage, 'error message', default=None, group='error')
 179         if error_msg:
 180             error_msg = re.sub(r'\s+', ' ', error_msg)
 181             raise ExtractorError(
 182                 'PornHub said: %s' % error_msg,
 183                 expected=True, video_id=video_id)
 184
 185         # video_title from flashvars contains whitespace instead of non-ASCII (see
 186         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
 187         # on that anymore.
 188         title = self._html_search_meta(
 189             'twitter:title', webpage, default=None) or self._search_regex(
 190             (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
 191              r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
 192              r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
 193             webpage, 'title', group='title')
 194
 195         video_urls = []
 196         video_urls_set = set()
 197         subtitles = {}
 198
 199         flashvars = self._parse_json(
 200             self._search_regex(
 201                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
 202             video_id)
 203         if flashvars:
 204             subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
 205             if subtitle_url:
 206                 subtitles.setdefault('en', []).append({
 207                     'url': subtitle_url,
 208                     'ext': 'srt',
 209                 })
 210             thumbnail = flashvars.get('image_url')
 211             duration = int_or_none(flashvars.get('video_duration'))
 212             media_definitions = flashvars.get('mediaDefinitions')
 213             if isinstance(media_definitions, list):
 214                 for definition in media_definitions:
 215                     if not isinstance(definition, dict):
 216                         continue
 217                     video_url = definition.get('videoUrl')
 218                     if not video_url or not isinstance(video_url, compat_str):
 219                         continue
 220                     if video_url in video_urls_set:
 221                         continue
 222                     video_urls_set.add(video_url)
 223                     video_urls.append(
 224                         (video_url, int_or_none(definition.get('quality'))))
 225         else:
 226             thumbnail, duration = [None] * 2
 227
 228         if not video_urls:
 229             tv_webpage = dl_webpage('tv')
 230
 231             assignments = self._search_regex(
 232                 r'(var.+?mediastring.+?)</script>', tv_webpage,
 233                 'encoded url').split(';')
 234
 235             js_vars = {}
 236
 237             def parse_js_value(inp):
 238                 inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
 239                 if '+' in inp:
 240                     inps = inp.split('+')
 241                     return functools.reduce(
 242                         operator.concat, map(parse_js_value, inps))
 243                 inp = inp.strip()
 244                 if inp in js_vars:
 245                     return js_vars[inp]
 246                 return remove_quotes(inp)
 247
 248             for assn in assignments:
 249                 assn = assn.strip()
 250                 if not assn:
 251                     continue
 252                 assn = re.sub(r'var\s+', '', assn)
 253                 vname, value = assn.split('=', 1)
 254                 js_vars[vname] = parse_js_value(value)
 255
 256             video_url = js_vars['mediastring']
 257             if video_url not in video_urls_set:
 258                 video_urls.append((video_url, None))
 259                 video_urls_set.add(video_url)
 260
 261         for mobj in re.finditer(
 262                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
 263                 webpage):
 264             video_url = mobj.group('url')
 265             if video_url not in video_urls_set:
 266                 video_urls.append((video_url, None))
 267                 video_urls_set.add(video_url)
 268
 269         upload_date = None
 270         formats = []
 271         for video_url, height in video_urls:
 272             if not upload_date:
 273                 upload_date = self._search_regex(
 274                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
 275                 if upload_date:
 276                     upload_date = upload_date.replace('/', '')
 277             tbr = None
 278             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
 279             if mobj:
 280                 if not height:
 281                     height = int(mobj.group('height'))
 282                 tbr = int(mobj.group('tbr'))
 283             formats.append({
 284                 'url': video_url,
 285                 'format_id': '%dp' % height if height else None,
 286                 'height': height,
 287                 'tbr': tbr,
 288             })
 289         self._sort_formats(formats)
 290
 291         video_uploader = self._html_search_regex(
 292             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
 293             webpage, 'uploader', fatal=False)
 294
 295         view_count = self._extract_count(
 296             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
 297         like_count = self._extract_count(
 298             r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
 299         dislike_count = self._extract_count(
 300             r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
 301         comment_count = self._extract_count(
 302             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 303
 304         page_params = self._parse_json(self._search_regex(
 305             r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
 306             webpage, 'page parameters', group='data', default='{}'),
 307             video_id, transform_source=js_to_json, fatal=False)
 308         tags = categories = None
 309         if page_params:
 310             tags = page_params.get('tags', '').split(',')
 311             categories = page_params.get('categories', '').split(',')
 312
 313         return {
 314             'id': video_id,
 315             'uploader': video_uploader,
 316             'upload_date': upload_date,
 317             'title': title,
 318             'thumbnail': thumbnail,
 319             'duration': duration,
 320             'view_count': view_count,
 321             'like_count': like_count,
 322             'dislike_count': dislike_count,
 323             'comment_count': comment_count,
 324             'formats': formats,
 325             'age_limit': 18,
 326             'tags': tags,
 327             'categories': categories,
 328             'subtitles': subtitles,
 329         }
 330
 331
 332 class PornHubPlaylistBaseIE(InfoExtractor):
 333     def _extract_entries(self, webpage, host):
 334         # Only process container div with main playlist content skipping
 335         # drop-down menu that uses similar pattern for videos (see
 336         # https://github.com/rg3/youtube-dl/issues/11594).
 337         container = self._search_regex(
 338             r'(?s)(<div[^>]+class=["\']container.+)', webpage,
 339             'container', default=webpage)
 340
 341         return [
 342             self.url_result(
 343                 'http://www.%s/%s' % (host, video_url),
 344                 PornHubIE.ie_key(), video_title=title)
 345             for video_url, title in orderedSet(re.findall(
 346                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 347                 container))
 348         ]
 349
 350     def _real_extract(self, url):
 351         mobj = re.match(self._VALID_URL, url)
 352         host = mobj.group('host')
 353         playlist_id = mobj.group('id')
 354
 355         webpage = self._download_webpage(url, playlist_id)
 356
 357         entries = self._extract_entries(webpage, host)
 358
 359         playlist = self._parse_json(
 360             self._search_regex(
 361                 r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
 362                 'playlist', default='{}'),
 363             playlist_id, fatal=False)
 364         title = playlist.get('title') or self._search_regex(
 365             r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
 366
 367         return self.playlist_result(
 368             entries, playlist_id, title, playlist.get('description'))
 369
 370
 371 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
 372     _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/playlist/(?P<id>\d+)'
 373     _TESTS = [{
 374         'url': 'http://www.pornhub.com/playlist/4667351',
 375         'info_dict': {
 376             'id': '4667351',
 377             'title': 'Nataly Hot',
 378         },
 379         'playlist_mincount': 2,
 380     }, {
 381         'url': 'https://de.pornhub.com/playlist/4667351',
 382         'only_matching': True,
 383     }]
 384
 385
 386 class PornHubUserVideosIE(PornHubPlaylistBaseIE):
 387     _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
 388     _TESTS = [{
 389         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 390         'info_dict': {
 391             'id': 'zoe_ph',
 392         },
 393         'playlist_mincount': 171,
 394     }, {
 395         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 396         'only_matching': True,
 397     }, {
 398         # default sorting as Top Rated Videos
 399         'url': 'https://www.pornhub.com/channels/povd/videos',
 400         'info_dict': {
 401             'id': 'povd',
 402         },
 403         'playlist_mincount': 293,
 404     }, {
 405         # Top Rated Videos
 406         'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
 407         'only_matching': True,
 408     }, {
 409         # Most Recent Videos
 410         'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
 411         'only_matching': True,
 412     }, {
 413         # Most Viewed Videos
 414         'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
 415         'only_matching': True,
 416     }, {
 417         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 418         'only_matching': True,
 419     }, {
 420         'url': 'https://www.pornhub.com/model/jayndrea/videos/upload',
 421         'only_matching': True,
 422     }, {
 423         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
 424         'only_matching': True,
 425     }]
 426
 427     def _real_extract(self, url):
 428         mobj = re.match(self._VALID_URL, url)
 429         host = mobj.group('host')
 430         user_id = mobj.group('id')
 431
 432         entries = []
 433         for page_num in itertools.count(1):
 434             try:
 435                 webpage = self._download_webpage(
 436                     url, user_id, 'Downloading page %d' % page_num,
 437                     query={'page': page_num})
 438             except ExtractorError as e:
 439                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
 440                     break
 441                 raise
 442             page_entries = self._extract_entries(webpage, host)
 443             if not page_entries:
 444                 break
 445             entries.extend(page_entries)
 446
 447         return self.playlist_result(entries, user_id)