_ Git - youtube-dl/blob - youtube_dl/extractor/pornhub.py

   1 from __future__ import unicode_literals
   2
   3 import os
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_urllib_parse_unquote,
   9     compat_urllib_parse_unquote_plus,
  10     compat_urllib_parse_urlparse,
  11 )
  12 from ..utils import (
  13     ExtractorError,
  14     int_or_none,
  15     orderedSet,
  16     sanitized_Request,
  17     str_to_int,
  18 )
  19 from ..aes import (
  20     aes_decrypt_text
  21 )
  22
  23
  24 class PornHubIE(InfoExtractor):
  25     _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)'
  26     _TESTS = [{
  27         'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  28         'md5': '1e19b41231a02eba417839222ac9d58e',
  29         'info_dict': {
  30             'id': '648719015',
  31             'ext': 'mp4',
  32             'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  33             'uploader': 'Babes',
  34             'duration': 361,
  35             'view_count': int,
  36             'like_count': int,
  37             'dislike_count': int,
  38             'comment_count': int,
  39             'age_limit': 18,
  40         }
  41     }, {
  42         'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
  43         'only_matching': True,
  44     }, {
  45         'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
  46         'only_matching': True,
  47     }]
  48
  49     @classmethod
  50     def _extract_url(cls, webpage):
  51         mobj = re.search(
  52             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?pornhub\.com/embed/\d+)\1', webpage)
  53         if mobj:
  54             return mobj.group('url')
  55
  56     def _extract_count(self, pattern, webpage, name):
  57         return str_to_int(self._search_regex(
  58             pattern, webpage, '%s count' % name, fatal=False))
  59
  60     def _real_extract(self, url):
  61         video_id = self._match_id(url)
  62
  63         req = sanitized_Request(
  64             'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id)
  65         req.add_header('Cookie', 'age_verified=1')
  66         webpage = self._download_webpage(req, video_id)
  67
  68         error_msg = self._html_search_regex(
  69             r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>',
  70             webpage, 'error message', default=None)
  71         if error_msg:
  72             error_msg = re.sub(r'\s+', ' ', error_msg)
  73             raise ExtractorError(
  74                 'PornHub said: %s' % error_msg,
  75                 expected=True, video_id=video_id)
  76
  77         flashvars = self._parse_json(
  78             self._search_regex(
  79                 r'var\s+flashv1ars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
  80             video_id)
  81         if flashvars:
  82             video_title = flashvars.get('video_title')
  83             thumbnail = flashvars.get('image_url')
  84             duration = int_or_none(flashvars.get('video_duration'))
  85         else:
  86             video_title, thumbnail, duration = [None] * 3
  87
  88         if not video_title:
  89             video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
  90
  91         video_uploader = self._html_search_regex(
  92             r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
  93             webpage, 'uploader', fatal=False)
  94
  95         view_count = self._extract_count(
  96             r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
  97         like_count = self._extract_count(
  98             r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
  99         dislike_count = self._extract_count(
 100             r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
 101         comment_count = self._extract_count(
 102             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 103
 104         video_urls = list(map(compat_urllib_parse_unquote, re.findall(r"player_quality_[0-9]{3}p\s*=\s*'([^']+)'", webpage)))
 105         if webpage.find('"encrypted":true') != -1:
 106             password = compat_urllib_parse_unquote_plus(
 107                 self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
 108             video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
 109
 110         formats = []
 111         for video_url in video_urls:
 112             path = compat_urllib_parse_urlparse(video_url).path
 113             extension = os.path.splitext(path)[1][1:]
 114             format = path.split('/')[5].split('_')[:2]
 115             format = '-'.join(format)
 116
 117             m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
 118             if m is None:
 119                 height = None
 120                 tbr = None
 121             else:
 122                 height = int(m.group('height'))
 123                 tbr = int(m.group('tbr'))
 124
 125             formats.append({
 126                 'url': video_url,
 127                 'ext': extension,
 128                 'format': format,
 129                 'format_id': format,
 130                 'tbr': tbr,
 131                 'height': height,
 132             })
 133         self._sort_formats(formats)
 134
 135         return {
 136             'id': video_id,
 137             'uploader': video_uploader,
 138             'title': video_title,
 139             'thumbnail': thumbnail,
 140             'duration': duration,
 141             'view_count': view_count,
 142             'like_count': like_count,
 143             'dislike_count': dislike_count,
 144             'comment_count': comment_count,
 145             'formats': formats,
 146             'age_limit': 18,
 147         }
 148
 149
 150 class PornHubPlaylistBaseIE(InfoExtractor):
 151     def _extract_entries(self, webpage):
 152         return [
 153             self.url_result(
 154                 'http://www.pornhub.com/%s' % video_url,
 155                 PornHubIE.ie_key(), video_title=title)
 156             for video_url, title in orderedSet(re.findall(
 157                 r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
 158                 webpage))
 159         ]
 160
 161     def _real_extract(self, url):
 162         playlist_id = self._match_id(url)
 163
 164         webpage = self._download_webpage(url, playlist_id)
 165
 166         entries = self._extract_entries(webpage)
 167
 168         playlist = self._parse_json(
 169             self._search_regex(
 170                 r'playlistObject\s*=\s*({.+?});', webpage, 'playlist'),
 171             playlist_id)
 172
 173         return self.playlist_result(
 174             entries, playlist_id, playlist.get('title'), playlist.get('description'))
 175
 176
class PornHubPlaylistIE(PornHubPlaylistBaseIE):
    """Extractor for PornHub playlist pages.

    Defines no methods of its own; extraction is inherited from
    PornHubPlaylistBaseIE.
    """

    # Matches http(s)://[www.]pornhub.com/playlist/<digits>; the digits are
    # captured as the playlist id.
    _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
    # Test fixture: expects the id and title below and at least 35 entries.
    _TESTS = [{
        'url': 'http://www.pornhub.com/playlist/6201671',
        'info_dict': {
            'id': '6201671',
            'title': 'P0p4',
        },
        'playlist_mincount': 35,
    }]
 187
 188
 189 class PornHubUserVideosIE(PornHubPlaylistBaseIE):
 190     _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos'
 191     _TESTS = [{
 192         'url': 'http://www.pornhub.com/users/rushandlia/videos',
 193         'info_dict': {
 194             'id': 'rushandlia',
 195         },
 196         'playlist_mincount': 13,
 197     }]
 198
 199     def _real_extract(self, url):
 200         user_id = self._match_id(url)
 201
 202         webpage = self._download_webpage(url, user_id)
 203
 204         return self.playlist_result(self._extract_entries(webpage), user_id)