X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fpornhub.py;h=b25f1f193fc7b1590d57966e7c819d16a4b43cd0;hb=9cafc3fd8b54b9b91a145cddf9e4db0bd59e1b5f;hp=7a2737032ff27a73825a7787feece5d52a92507d;hpb=9a372f14b422de15acf91e25a90375688b2ba3fa;p=youtube-dl diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 7a2737032..b25f1f193 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -1,7 +1,9 @@ # coding: utf-8 from __future__ import unicode_literals +import functools import itertools +import operator # import os import re @@ -18,6 +20,7 @@ from ..utils import ( js_to_json, orderedSet, # sanitized_Request, + remove_quotes, str_to_int, ) # from ..aes import ( @@ -129,9 +132,32 @@ class PornHubIE(InfoExtractor): tv_webpage = dl_webpage('tv') - video_url = self._search_regex( - r']+\bsrc=(["\'])(?P(?:https?:)?//.+?)\1', tv_webpage, - 'video url', group='url') + assignments = self._search_regex( + r'(var.+?mediastring.+?)', tv_webpage, + 'encoded url').split(';') + + js_vars = {} + + def parse_js_value(inp): + inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp) + if '+' in inp: + inps = inp.split('+') + return functools.reduce( + operator.concat, map(parse_js_value, inps)) + inp = inp.strip() + if inp in js_vars: + return js_vars[inp] + return remove_quotes(inp) + + for assn in assignments: + assn = assn.strip() + if not assn: + continue + assn = re.sub(r'var\s+', '', assn) + vname, value = assn.split('=', 1) + js_vars[vname] = parse_js_value(value) + + video_url = js_vars['mediastring'] title = self._search_regex( r'

([^>]+)

', tv_webpage, 'title', default=None) @@ -169,50 +195,6 @@ class PornHubIE(InfoExtractor): comment_count = self._extract_count( r'All Comments\s*\(([\d,.]+)\)', webpage, 'comment') - """ - video_variables = {} - for video_variablename, quote, video_variable in re.findall( - r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage): - video_variables[video_variablename] = video_variable - - video_urls = [] - for encoded_video_url in re.findall( - r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage): - for varname, varval in video_variables.items(): - encoded_video_url = encoded_video_url.replace(varname, varval) - video_urls.append(re.sub(r'[\s+]', '', encoded_video_url)) - - if webpage.find('"encrypted":true') != -1: - password = compat_urllib_parse_unquote_plus( - self._search_regex(r'"video_title":"([^"]+)', webpage, 'password')) - video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls)) - - formats = [] - for video_url in video_urls: - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) - - m = re.match(r'^(?P[0-9]+)[pP]-(?P[0-9]+)[kK]$', format) - if m is None: - height = None - tbr = None - else: - height = int(m.group('height')) - tbr = int(m.group('tbr')) - - formats.append({ - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'tbr': tbr, - 'height': height, - }) - self._sort_formats(formats) - """ - page_params = self._parse_json(self._search_regex( r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P{[^}]+})', webpage, 'page parameters', group='data', default='{}'),