_ Git - youtube-dl/blob - youtube_dl/extractor/pornhub.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import itertools
   6 import operator
   7 import re
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_HTTPError,
  12     compat_str,
  13 )
  14 from ..utils import (
  15     ExtractorError,
  16     int_or_none,
  17     js_to_json,
  18     orderedSet,
  19     remove_quotes,
  20     str_to_int,
  21     url_or_none,
  22 )
  23
  24
  25 class PornHubIE(InfoExtractor):
  26     IE_DESC = 'PornHub and Thumbzilla'
  27     _VALID_URL = r'''(?x)
  28                     https?://
  29                         (?:
  30                             (?:[^/]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  31                             (?:www\.)?thumbzilla\.com/video/
  32                         )
  33                         (?P<id>[\da-z]+)
  34                     '''
  35     _TESTS = [{
  36         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  37         'md5': '1e19b41231a02eba417839222ac9d58e',
  38         'info_dict': {
  39             'id': '648719015',
  40             'ext': 'mp4',
  41             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  42             'uploader': 'Babes',
  43             'duration': 361,
  44             'view_count': int,
  45             'like_count': int,
  46             'dislike_count': int,
  47             'comment_count': int,
  48             'age_limit': 18,
  49             'tags': list,
  50             'categories': list,
  51         },
  52     }, {
  53         # non-ASCII title
  54         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  55         'info_dict': {
  56             'id': '1331683002',
  57             'ext': 'mp4',
  58             'title': '重庆婷婷女王足交',
  59             'uploader': 'Unknown',
  60             'duration': 1753,
  61             'view_count': int,
  62             'like_count': int,
  63             'dislike_count': int,
  64             'comment_count': int,
  65             'age_limit': 18,
  66             'tags': list,
  67             'categories': list,
  68         },
  69         'params': {
  70             'skip_download': True,
  71         },
  72     }, {
  73         # subtitles
  74         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  75         'info_dict': {
  76             'id': 'ph5af5fef7c2aa7',
  77             'ext': 'mp4',
  78             'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  79             'uploader': 'BFFs',
  80             'duration': 622,
  81             'view_count': int,
  82             'like_count': int,
  83             'dislike_count': int,
  84             'comment_count': int,
  85             'age_limit': 18,
  86             'tags': list,
  87             'categories': list,
  88             'subtitles': {
  89                 'en': [{
  90                     "ext": 'srt'
  91                 }]
  92             },
  93         },
  94         'params': {
  95             'skip_download': True,
  96         },
  97     }, {
  98         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
  99         'only_matching': True,
 100     }, {
 101         # removed at the request of cam4.com
 102         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
 103         'only_matching': True,
 104     }, {
 105         # removed at the request of the copyright owner
 106         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
 107         'only_matching': True,
 108     }, {
 109         # removed by uploader
 110         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
 111         'only_matching': True,
 112     }, {
 113         # private video
 114         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
 115         'only_matching': True,
 116     }, {
 117         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
 118         'only_matching': True,
 119     }, {
 120         'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
 121         'only_matching': True,
 122     }]
 123
 124     @staticmethod
 125     def _extract_urls(webpage):
 126         return re.findall(
 127             r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
 128             webpage)
 129
 130     def _extract_count(self, pattern, webpage, name):
 131         return str_to_int(self._search_regex(
 132             pattern, webpage, '%s count' % name, fatal=False))
 133
 134     def _real_extract(self, url):
 135         video_id = self._match_id(url)
 136
 137         self._set_cookie('pornhub.com', 'age_verified', '1')
 138
 139         def dl_webpage(platform):
 140             self._set_cookie('pornhub.com', 'platform', platform)
 141             return self._download_webpage(
 142                 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
 143                 video_id, 'Downloading %s webpage' % platform)
 144
 145         webpage = dl_webpage('pc')
 146
 147         error_msg = self._html_search_regex(
 148             r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
 149             webpage, 'error message', default=None, group='error')
 150         if error_msg:
 151             error_msg = re.sub(r'\s+', ' ', error_msg)
 152             raise ExtractorError(
 153                 'PornHub said: %s' % error_msg,
 154                 expected=True, video_id=video_id)
 155
 156         # video_title from flashvars contains whitespace instead of non-ASCII (see
 157         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
 158         # on that anymore.
 159         title = self._html_search_meta(
 160             'twitter:title', webpage, default=None) or self._search_regex(
 161             (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
 162              r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
 163              r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
 164             webpage, 'title', group='title')
 165
 166         video_urls = []
 167         video_urls_set = set()
 168         subtitles = {}
 169
 170         flashvars = self._parse_json(
 171             self._search_regex(
 172                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
 173             video_id)
 174         if flashvars:
 175             subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
 176             if subtitle_url:
 177                 subtitles.setdefault('en', []).append({
 178                     'url': subtitle_url,
 179                     'ext': 'srt',
 180                 })
 181             thumbnail = flashvars.get('image_url')
 182             duration = int_or_none(flashvars.get('video_duration'))
 183             media_definitions = flashvars.get('mediaDefinitions')
 184             if isinstance(media_definitions, list):
 185                 for definition in media_definitions:
 186                     if not isinstance(definition, dict):
 187                         continue
 188                     video_url = definition.get('videoUrl')
 189                     if not video_url or not isinstance(video_url, compat_str):
 190                         continue
 191                     if video_url in video_urls_set:
 192                         continue
 193                     video_urls_set.add(video_url)
 194                     video_urls.append(
 195                         (video_url, int_or_none(definition.get('quality'))))
 196         else:
 197             thumbnail, duration = [None] * 2
 198
 199         if not video_urls:
 200             tv_webpage = dl_webpage('tv')
 201
 202             assignments = self._search_regex(
 203                 r'(var.+?mediastring.+?)</script>', tv_webpage,
 204                 'encoded url').split(';')
 205
 206             js_vars = {}
 207
 208             def parse_js_value(inp):
 209                 inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
 210                 if '+' in inp:
 211                     inps = inp.split('+')
 212                     return functools.reduce(
 213                         operator.concat, map(parse_js_value, inps))
 214                 inp = inp.strip()
 215                 if inp in js_vars:
 216                     return js_vars[inp]
 217                 return remove_quotes(inp)
 218
 219             for assn in assignments:
 220                 assn = assn.strip()
 221                 if not assn:
 222                     continue
 223                 assn = re.sub(r'var\s+', '', assn)
 224                 vname, value = assn.split('=', 1)
 225                 js_vars[vname] = parse_js_value(value)
 226
 227             video_url = js_vars['mediastring']
 228             if video_url not in video_urls_set:
 229                 video_urls.append((video_url, None))
 230                 video_urls_set.add(video_url)
 231
 232         for mobj in re.finditer(
 233                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
 234                 webpage):
 235             video_url = mobj.group('url')
 236             if video_url not in video_urls_set:
 237                 video_urls.append((video_url, None))
 238                 video_urls_set.add(video_url)
 239
 240         formats = []
 241         for video_url, height in video_urls:
 242             tbr = None
 243             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
 244             if mobj:
 245                 if not height:
 246                     height = int(mobj.group('height'))
 247                 tbr = int(mobj.group('tbr'))
 248             formats.append({
 249                 'url': video_url,
 250                 'format_id': '%dp' % height if height else None,
 251                 'height': height,
 252                 'tbr': tbr,
 253             })
 254         self._sort_formats(formats)
 255
 256         video_uploader = self._html_search_regex(
 257             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
 258             webpage, 'uploader', fatal=False)
 259
 260         view_count = self._extract_count(
 261             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
 262         like_count = self._extract_count(
 263             r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
 264         dislike_count = self._extract_count(
 265             r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
 266         comment_count = self._extract_count(
 267             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 268
 269         page_params = self._parse_json(self._search_regex(
 270             r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
 271             webpage, 'page parameters', group='data', default='{}'),
 272             video_id, transform_source=js_to_json, fatal=False)
 273         tags = categories = None
 274         if page_params:
 275             tags = page_params.get('tags', '').split(',')
 276             categories = page_params.get('categories', '').split(',')
 277
 278         return {
 279             'id': video_id,
 280             'uploader': video_uploader,
 281             'title': title,
 282             'thumbnail': thumbnail,
 283             'duration': duration,
 284             'view_count': view_count,
 285             'like_count': like_count,
 286             'dislike_count': dislike_count,
 287             'comment_count': comment_count,
 288             'formats': formats,
 289             'age_limit': 18,
 290             'tags': tags,
 291             'categories': categories,
 292             'subtitles': subtitles,
 293         }
 294
 295
 296 class PornHubPlaylistBaseIE(InfoExtractor):
 297     def _extract_entries(self, webpage):
 298         # Only process container div with main playlist content skipping
 299         # drop-down menu that uses similar pattern for videos (see
 300         # https://github.com/rg3/youtube-dl/issues/11594).
 301         container = self._search_regex(
 302             r'(?s)(<div[^>]+class=["\']container.+)', webpage,
 303             'container', default=webpage)
 304
 305         return [
 306             self.url_result(
 307                 'http://www.pornhub.com/%s' % video_url,
 308                 PornHubIE.ie_key(), video_title=title)
 309             for video_url, title in orderedSet(re.findall(
 310                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 311                 container))
 312         ]
 313
 314     def _real_extract(self, url):
 315         playlist_id = self._match_id(url)
 316
 317         webpage = self._download_webpage(url, playlist_id)
 318
 319         entries = self._extract_entries(webpage)
 320
 321         playlist = self._parse_json(
 322             self._search_regex(
 323                 r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
 324                 'playlist', default='{}'),
 325             playlist_id, fatal=False)
 326         title = playlist.get('title') or self._search_regex(
 327             r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
 328
 329         return self.playlist_result(
 330             entries, playlist_id, title, playlist.get('description'))
 331
 332
 333 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
 334     _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/playlist/(?P<id>\d+)'
 335     _TESTS = [{
 336         'url': 'http://www.pornhub.com/playlist/4667351',
 337         'info_dict': {
 338             'id': '4667351',
 339             'title': 'Nataly Hot',
 340         },
 341         'playlist_mincount': 2,
 342     }, {
 343         'url': 'https://de.pornhub.com/playlist/4667351',
 344         'only_matching': True,
 345     }]
 346
 347
 348 class PornHubUserVideosIE(PornHubPlaylistBaseIE):
 349     _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.com/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
 350     _TESTS = [{
 351         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 352         'info_dict': {
 353             'id': 'zoe_ph',
 354         },
 355         'playlist_mincount': 171,
 356     }, {
 357         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 358         'only_matching': True,
 359     }, {
 360         # default sorting as Top Rated Videos
 361         'url': 'https://www.pornhub.com/channels/povd/videos',
 362         'info_dict': {
 363             'id': 'povd',
 364         },
 365         'playlist_mincount': 293,
 366     }, {
 367         # Top Rated Videos
 368         'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
 369         'only_matching': True,
 370     }, {
 371         # Most Recent Videos
 372         'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
 373         'only_matching': True,
 374     }, {
 375         # Most Viewed Videos
 376         'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
 377         'only_matching': True,
 378     }, {
 379         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 380         'only_matching': True,
 381     }, {
 382         'url': 'https://www.pornhub.com/model/jayndrea/videos/upload',
 383         'only_matching': True,
 384     }, {
 385         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
 386         'only_matching': True,
 387     }]
 388
 389     def _real_extract(self, url):
 390         user_id = self._match_id(url)
 391
 392         entries = []
 393         for page_num in itertools.count(1):
 394             try:
 395                 webpage = self._download_webpage(
 396                     url, user_id, 'Downloading page %d' % page_num,
 397                     query={'page': page_num})
 398             except ExtractorError as e:
 399                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
 400                     break
 401                 raise
 402             page_entries = self._extract_entries(webpage)
 403             if not page_entries:
 404                 break
 405             entries.extend(page_entries)
 406
 407         return self.playlist_result(entries, user_id)