[pornhub] Decode obfuscated video URL (closes #12470)

[youtube-dl] / youtube_dl / extractor / pornhub.py
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py

index 7a2737032ff27a73825a7787feece5d52a92507d..eb316ad1413985f2bd3e6fd6ac94323f71c554f5 100644 (file)
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -1,7 +1,9 @@
  # coding: utf-8
  from __future__ import unicode_literals
  
+import functools
  import itertools
+import operator
  # import os
  import re
  
@@ -129,9 +131,38 @@ class PornHubIE(InfoExtractor):
  
          tv_webpage = dl_webpage('tv')
  
-        video_url = self._search_regex(
-            r'<video[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//.+?)\1', tv_webpage,
-            'video url', group='url')
+        encoded_url = self._search_regex(r'(var.*mediastring.*)</script>',
+            tv_webpage, 'encoded url')
+        assignments = encoded_url.split(";")
+        js_vars = {}
+
+        def parse_js_value(inp):
+            inp = re.sub(r'/\*[^*]*\*/', "", inp)
+
+            if "+" in inp:
+                inps = inp.split("+")
+                return functools.reduce(operator.concat, map(parse_js_value, inps))
+
+            inp = inp.strip()
+            if inp in js_vars:
+                return js_vars[inp]
+
+            # Hope it's a string!
+            assert inp.startswith('"') and inp.endswith('"')
+            return inp[1:-1]
+
+        for assn in assignments:
+            assn = assn.strip()
+            if len(assn) == 0:
+                continue
+
+            assert assn.startswith("var ")
+            assn = assn[4:]
+            vname, value = assn.split("=", 1)
+
+            js_vars[vname] = parse_js_value(value)
+
+        video_url = js_vars["mediastring"]
  
          title = self._search_regex(
              r'<h1>([^>]+)</h1>', tv_webpage, 'title', default=None)
@@ -169,50 +200,6 @@ class PornHubIE(InfoExtractor):
          comment_count = self._extract_count(
              r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
  
-        """
-        video_variables = {}
-        for video_variablename, quote, video_variable in re.findall(
-                r'(player_quality_[0-9]{3,4}p\w+)\s*=\s*(["\'])(.+?)\2;', webpage):
-            video_variables[video_variablename] = video_variable
-
-        video_urls = []
-        for encoded_video_url in re.findall(
-                r'player_quality_[0-9]{3,4}p\s*=(.+?);', webpage):
-            for varname, varval in video_variables.items():
-                encoded_video_url = encoded_video_url.replace(varname, varval)
-            video_urls.append(re.sub(r'[\s+]', '', encoded_video_url))
-
-        if webpage.find('"encrypted":true') != -1:
-            password = compat_urllib_parse_unquote_plus(
-                self._search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
-            video_urls = list(map(lambda s: aes_decrypt_text(s, password, 32).decode('utf-8'), video_urls))
-
-        formats = []
-        for video_url in video_urls:
-            path = compat_urllib_parse_urlparse(video_url).path
-            extension = os.path.splitext(path)[1][1:]
-            format = path.split('/')[5].split('_')[:2]
-            format = '-'.join(format)
-
-            m = re.match(r'^(?P<height>[0-9]+)[pP]-(?P<tbr>[0-9]+)[kK]$', format)
-            if m is None:
-                height = None
-                tbr = None
-            else:
-                height = int(m.group('height'))
-                tbr = int(m.group('tbr'))
-
-            formats.append({
-                'url': video_url,
-                'ext': extension,
-                'format': format,
-                'format_id': format,
-                'tbr': tbr,
-                'height': height,
-            })
-        self._sort_formats(formats)
-        """
-
          page_params = self._parse_json(self._search_regex(
              r'page_params\.zoneDetails\[([\'"])[^\'"]+\1\]\s*=\s*(?P<data>{[^}]+})',
              webpage, 'page parameters', group='data', default='{}'),