_ Git - youtube-dl/blob - youtube_dl/extractor/pornhub.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import functools
   5 import itertools
   6 import operator
   7 import re
   8
   9 from .common import InfoExtractor
  10 from ..compat import (
  11     compat_HTTPError,
  12     compat_str,
  13 )
  14 from ..utils import (
  15     ExtractorError,
  16     int_or_none,
  17     js_to_json,
  18     orderedSet,
  19     remove_quotes,
  20     str_to_int,
  21     url_or_none,
  22 )
  23
  24
  25 class PornHubIE(InfoExtractor):
  26     IE_DESC = 'PornHub and Thumbzilla'
  27     _VALID_URL = r'''(?x)
  28                     https?://
  29                         (?:
  30                             (?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  31                             (?:www\.)?thumbzilla\.com/video/
  32                         )
  33                         (?P<id>[\da-z]+)
  34                     '''
  35     _TESTS = [{
  36         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  37         'md5': '1e19b41231a02eba417839222ac9d58e',
  38         'info_dict': {
  39             'id': '648719015',
  40             'ext': 'mp4',
  41             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  42             'uploader': 'Babes',
  43             'upload_date': '20130628',
  44             'duration': 361,
  45             'view_count': int,
  46             'like_count': int,
  47             'dislike_count': int,
  48             'comment_count': int,
  49             'age_limit': 18,
  50             'tags': list,
  51             'categories': list,
  52         },
  53     }, {
  54         # non-ASCII title
  55         'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  56         'info_dict': {
  57             'id': '1331683002',
  58             'ext': 'mp4',
  59             'title': '重庆婷婷女王足交',
  60             'uploader': 'Unknown',
  61             'upload_date': '20150213',
  62             'duration': 1753,
  63             'view_count': int,
  64             'like_count': int,
  65             'dislike_count': int,
  66             'comment_count': int,
  67             'age_limit': 18,
  68             'tags': list,
  69             'categories': list,
  70         },
  71         'params': {
  72             'skip_download': True,
  73         },
  74     }, {
  75         # subtitles
  76         'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  77         'info_dict': {
  78             'id': 'ph5af5fef7c2aa7',
  79             'ext': 'mp4',
  80             'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  81             'uploader': 'BFFs',
  82             'duration': 622,
  83             'view_count': int,
  84             'like_count': int,
  85             'dislike_count': int,
  86             'comment_count': int,
  87             'age_limit': 18,
  88             'tags': list,
  89             'categories': list,
  90             'subtitles': {
  91                 'en': [{
  92                     "ext": 'srt'
  93                 }]
  94             },
  95         },
  96         'params': {
  97             'skip_download': True,
  98         },
  99     }, {
 100         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
 101         'only_matching': True,
 102     }, {
 103         # removed at the request of cam4.com
 104         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
 105         'only_matching': True,
 106     }, {
 107         # removed at the request of the copyright owner
 108         'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
 109         'only_matching': True,
 110     }, {
 111         # removed by uploader
 112         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
 113         'only_matching': True,
 114     }, {
 115         # private video
 116         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
 117         'only_matching': True,
 118     }, {
 119         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
 120         'only_matching': True,
 121     }, {
 122         'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
 123         'only_matching': True,
 124     }, {
 125         'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
 126         'only_matching': True,
 127     }]
 128
 129     @staticmethod
 130     def _extract_urls(webpage):
 131         return re.findall(
 132             r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/[\da-z]+)',
 133             webpage)
 134
 135     def _extract_count(self, pattern, webpage, name):
 136         return str_to_int(self._search_regex(
 137             pattern, webpage, '%s count' % name, fatal=False))
 138
 139     def _real_extract(self, url):
 140         video_id = self._match_id(url)
 141
 142         self._set_cookie('pornhub.com', 'age_verified', '1')
 143
 144         def dl_webpage(platform):
 145             self._set_cookie('pornhub.com', 'platform', platform)
 146             return self._download_webpage(
 147                 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id,
 148                 video_id, 'Downloading %s webpage' % platform)
 149
 150         webpage = dl_webpage('pc')
 151
 152         error_msg = self._html_search_regex(
 153             r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
 154             webpage, 'error message', default=None, group='error')
 155         if error_msg:
 156             error_msg = re.sub(r'\s+', ' ', error_msg)
 157             raise ExtractorError(
 158                 'PornHub said: %s' % error_msg,
 159                 expected=True, video_id=video_id)
 160
 161         # video_title from flashvars contains whitespace instead of non-ASCII (see
 162         # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
 163         # on that anymore.
 164         title = self._html_search_meta(
 165             'twitter:title', webpage, default=None) or self._search_regex(
 166             (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
 167              r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
 168              r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
 169             webpage, 'title', group='title')
 170
 171         video_urls = []
 172         video_urls_set = set()
 173         subtitles = {}
 174
 175         flashvars = self._parse_json(
 176             self._search_regex(
 177                 r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
 178             video_id)
 179         if flashvars:
 180             subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
 181             if subtitle_url:
 182                 subtitles.setdefault('en', []).append({
 183                     'url': subtitle_url,
 184                     'ext': 'srt',
 185                 })
 186             thumbnail = flashvars.get('image_url')
 187             duration = int_or_none(flashvars.get('video_duration'))
 188             media_definitions = flashvars.get('mediaDefinitions')
 189             if isinstance(media_definitions, list):
 190                 for definition in media_definitions:
 191                     if not isinstance(definition, dict):
 192                         continue
 193                     video_url = definition.get('videoUrl')
 194                     if not video_url or not isinstance(video_url, compat_str):
 195                         continue
 196                     if video_url in video_urls_set:
 197                         continue
 198                     video_urls_set.add(video_url)
 199                     video_urls.append(
 200                         (video_url, int_or_none(definition.get('quality'))))
 201         else:
 202             thumbnail, duration = [None] * 2
 203
 204         if not video_urls:
 205             tv_webpage = dl_webpage('tv')
 206
 207             assignments = self._search_regex(
 208                 r'(var.+?mediastring.+?)</script>', tv_webpage,
 209                 'encoded url').split(';')
 210
 211             js_vars = {}
 212
 213             def parse_js_value(inp):
 214                 inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
 215                 if '+' in inp:
 216                     inps = inp.split('+')
 217                     return functools.reduce(
 218                         operator.concat, map(parse_js_value, inps))
 219                 inp = inp.strip()
 220                 if inp in js_vars:
 221                     return js_vars[inp]
 222                 return remove_quotes(inp)
 223
 224             for assn in assignments:
 225                 assn = assn.strip()
 226                 if not assn:
 227                     continue
 228                 assn = re.sub(r'var\s+', '', assn)
 229                 vname, value = assn.split('=', 1)
 230                 js_vars[vname] = parse_js_value(value)
 231
 232             video_url = js_vars['mediastring']
 233             if video_url not in video_urls_set:
 234                 video_urls.append((video_url, None))
 235                 video_urls_set.add(video_url)
 236
 237         for mobj in re.finditer(
 238                 r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
 239                 webpage):
 240             video_url = mobj.group('url')
 241             if video_url not in video_urls_set:
 242                 video_urls.append((video_url, None))
 243                 video_urls_set.add(video_url)
 244
 245         upload_date = None
 246         formats = []
 247         for video_url, height in video_urls:
 248             if not upload_date:
 249                 upload_date = self._search_regex(
 250                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
 251                 if upload_date:
 252                     upload_date = upload_date.replace('/', '')
 253             tbr = None
 254             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
 255             if mobj:
 256                 if not height:
 257                     height = int(mobj.group('height'))
 258                 tbr = int(mobj.group('tbr'))
 259             formats.append({
 260                 'url': video_url,
 261                 'format_id': '%dp' % height if height else None,
 262                 'height': height,
 263                 'tbr': tbr,
 264             })
 265         self._sort_formats(formats)
 266
 267         video_uploader = self._html_search_regex(
 268             r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
 269             webpage, 'uploader', fatal=False)
 270
 271         view_count = self._extract_count(
 272             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
 273         like_count = self._extract_count(
 274             r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
 275         dislike_count = self._extract_count(
 276             r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
 277         comment_count = self._extract_count(
 278             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 279
 280         page_params = self._parse_json(self._search_regex(
 281             r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
 282             webpage, 'page parameters', group='data', default='{}'),
 283             video_id, transform_source=js_to_json, fatal=False)
 284         tags = categories = None
 285         if page_params:
 286             tags = page_params.get('tags', '').split(',')
 287             categories = page_params.get('categories', '').split(',')
 288
 289         return {
 290             'id': video_id,
 291             'uploader': video_uploader,
 292             'upload_date': upload_date,
 293             'title': title,
 294             'thumbnail': thumbnail,
 295             'duration': duration,
 296             'view_count': view_count,
 297             'like_count': like_count,
 298             'dislike_count': dislike_count,
 299             'comment_count': comment_count,
 300             'formats': formats,
 301             'age_limit': 18,
 302             'tags': tags,
 303             'categories': categories,
 304             'subtitles': subtitles,
 305         }
 306
 307
 308 class PornHubPlaylistBaseIE(InfoExtractor):
 309     def _extract_entries(self, webpage):
 310         # Only process container div with main playlist content skipping
 311         # drop-down menu that uses similar pattern for videos (see
 312         # https://github.com/rg3/youtube-dl/issues/11594).
 313         container = self._search_regex(
 314             r'(?s)(<div[^>]+class=["\']container.+)', webpage,
 315             'container', default=webpage)
 316
 317         return [
 318             self.url_result(
 319                 'http://www.pornhub.com/%s' % video_url,
 320                 PornHubIE.ie_key(), video_title=title)
 321             for video_url, title in orderedSet(re.findall(
 322                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 323                 container))
 324         ]
 325
 326     def _real_extract(self, url):
 327         playlist_id = self._match_id(url)
 328
 329         webpage = self._download_webpage(url, playlist_id)
 330
 331         entries = self._extract_entries(webpage)
 332
 333         playlist = self._parse_json(
 334             self._search_regex(
 335                 r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
 336                 'playlist', default='{}'),
 337             playlist_id, fatal=False)
 338         title = playlist.get('title') or self._search_regex(
 339             r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
 340
 341         return self.playlist_result(
 342             entries, playlist_id, title, playlist.get('description'))
 343
 344
 345 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
 346     _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/playlist/(?P<id>\d+)'
 347     _TESTS = [{
 348         'url': 'http://www.pornhub.com/playlist/4667351',
 349         'info_dict': {
 350             'id': '4667351',
 351             'title': 'Nataly Hot',
 352         },
 353         'playlist_mincount': 2,
 354     }, {
 355         'url': 'https://de.pornhub.com/playlist/4667351',
 356         'only_matching': True,
 357     }]
 358
 359
 360 class PornHubUserVideosIE(PornHubPlaylistBaseIE):
 361     _VALID_URL = r'https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos'
 362     _TESTS = [{
 363         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 364         'info_dict': {
 365             'id': 'zoe_ph',
 366         },
 367         'playlist_mincount': 171,
 368     }, {
 369         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 370         'only_matching': True,
 371     }, {
 372         # default sorting as Top Rated Videos
 373         'url': 'https://www.pornhub.com/channels/povd/videos',
 374         'info_dict': {
 375             'id': 'povd',
 376         },
 377         'playlist_mincount': 293,
 378     }, {
 379         # Top Rated Videos
 380         'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
 381         'only_matching': True,
 382     }, {
 383         # Most Recent Videos
 384         'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
 385         'only_matching': True,
 386     }, {
 387         # Most Viewed Videos
 388         'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
 389         'only_matching': True,
 390     }, {
 391         'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
 392         'only_matching': True,
 393     }, {
 394         'url': 'https://www.pornhub.com/model/jayndrea/videos/upload',
 395         'only_matching': True,
 396     }, {
 397         'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
 398         'only_matching': True,
 399     }]
 400
 401     def _real_extract(self, url):
 402         user_id = self._match_id(url)
 403
 404         entries = []
 405         for page_num in itertools.count(1):
 406             try:
 407                 webpage = self._download_webpage(
 408                     url, user_id, 'Downloading page %d' % page_num,
 409                     query={'page': page_num})
 410             except ExtractorError as e:
 411                 if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
 412                     break
 413                 raise
 414             page_entries = self._extract_entries(webpage)
 415             if not page_entries:
 416                 break
 417             entries.extend(page_entries)
 418
 419         return self.playlist_result(entries, user_id)