[pornhub] Extend _VALID_URL (closes #12996)
[youtube-dl] / youtube_dl / extractor / pornhub.py
index 7a2737032ff27a73825a7787feece5d52a92507d..1dcc8df00b4c2eb2da3b2719a35d874eb290c4ed 100644 (file)
@@ -1,7 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import functools
 import itertools
+import operator
 # import os
 import re
 
@@ -18,6 +20,7 @@ from ..utils import (
     js_to_json,
     orderedSet,
     # sanitized_Request,
+    remove_quotes,
     str_to_int,
 )
 # from ..aes import (
@@ -30,7 +33,7 @@ class PornHubIE(InfoExtractor):
     _VALID_URL = r'''(?x)
                     https?://
                         (?:
-                            (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+                            (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
                             (?:www\.)?thumbzilla\.com/video/
                         )
                         (?P<id>[\da-z]+)
@@ -94,6 +97,9 @@ class PornHubIE(InfoExtractor):
     }, {
         'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
         'only_matching': True,
+    }, {
+        'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -129,9 +135,32 @@ class PornHubIE(InfoExtractor):
 
         tv_webpage = dl_webpage('tv')
 
-        video_url = self._search_regex(
-            r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
-            'video url', group='url')
+        assignments = self._search_regex(
+            r'(var.+?mediastring.+?)</script>', tv_webpage,
+            'encoded url').split(';')
+
+        js_vars = {}
+
+        def parse_js_value(inp):
+            inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
+            if '+' in inp:
+                inps = inp.split('+')
+                return functools.reduce(
+                    operator.concat, map(parse_js_value, inps))
+            inp = inp.strip()
+            if inp in js_vars:
+                return js_vars[inp]
+            return remove_quotes(inp)
+
+        for assn in assignments:
+            assn = assn.strip()
+            if not assn:
+                continue
+            assn = re.sub(r'var\s+', '', assn)
+            vname, value = assn.split('=', 1)
+            js_vars[vname] = parse_js_value(value)
+
+        video_url = js_vars['mediastring']
 
         title = self._search_regex(
             r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
@@ -169,50 +198,6 @@ class PornHubIE(InfoExtractor):
         comment_count = self._extract_count(
             r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
-        """
-        video_variables = {}
-        for video_variablename, quote, video_variable in re.findall(
-                r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage):
-            video_variables[video_variablename] = video_variable
-
-        video_urls = []
-        for encoded_video_url in re.findall(
-                r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage):
-            for varname, varval in video_variables.items():
-                encoded_video_url = encoded_video_url.replace(varname, varval)
-            video_urls.append(re.sub(r'[\s+]', '', encoded_video_url))
-
-        if webpage.find('"encrypted":true') != -1:
-            password = compat_urllib_parse_unquote_plus(
-                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
-            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
-
-        formats = []
-        for video_url in video_urls:
-            path = compat_urllib_parse_urlparse(video_url).path
-            extension = os.path.splitext(path)[1][1:]
-            format = path.split('/')[5].split('_')[:2]
-            format = '-'.join(format)
-
-            m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
-            if m is None:
-                height = None
-                tbr = None
-            else:
-                height = int(m.group('height'))
-                tbr = int(m.group('tbr'))
-
-            formats.append({
-                'url': video_url,
-                'ext': extension,
-                'format': format,
-                'format_id': format,
-                'tbr': tbr,
-                'height': height,
-            })
-        self._sort_formats(formats)
-        """
-
         page_params = self._parse_json(self._search_regex(
             r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
             webpage, 'page parameters', group='data', default='{}'),