[pornhub] Fix uploader extraction and extract counts

author Sergey M․ <dstftw@gmail.com>

Sat, 22 Mar 2014 14:29:01 +0000 (21:29 +0700)

committer Sergey M․ <dstftw@gmail.com>

Sat, 22 Mar 2014 14:30:22 +0000 (21:30 +0700)
author Sergey M․ <dstftw@gmail.com>
Sat, 22 Mar 2014 14:29:01 +0000 (21:29 +0700)
committer Sergey M․ <dstftw@gmail.com>
Sat, 22 Mar 2014 14:30:22 +0000 (21:30 +0700)
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py

index 834fe7266c4e0bd4bc56bcd9807ec80c2cee7f05..7dd3dca0de94f41bb82c9baeacc45e5ab14ae0a2 100644 (file)
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -8,6 +8,7 @@ from ..utils import (
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urllib_parse,
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urllib_parse,
+    str_to_int,
  )
  from ..aes import (
      aes_decrypt_text
  )
  from ..aes import (
      aes_decrypt_text
@@ -27,6 +28,12 @@ class PornHubIE(InfoExtractor):
          }
      }
  
          }
      }
  
+    def _extract_count(self, pattern, webpage, name):
+        """Non-fatally search webpage for pattern; convert a truthy match with str_to_int."""
+        label = '%s count' % name
+        found = self._html_search_regex(pattern, webpage, label, fatal=False)
+        return str_to_int(found) if found else found
+
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('videoid')
      def _real_extract(self, url):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group('videoid')
@@ -37,11 +44,19 @@ class PornHubIE(InfoExtractor):
          webpage = self._download_webpage(req, video_id)
  
          video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
          webpage = self._download_webpage(req, video_id)
  
          video_title = self._html_search_regex(r'<h1 [^>]+>([^<]+)', webpage, 'title')
-        video_uploader = self._html_search_regex(r'<b>From: </b>(?:\s|<[^>]*>)*(.+?)<', webpage, 'uploader', fatal=False)
+        video_uploader = self._html_search_regex(
+            r'(?s)<div class="video-info-row">\s*From:&nbsp;.+?<(?:a href="/users/|<span class="username)[^>]+>(.+?)<',
+            webpage, 'uploader', fatal=False)
          thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
          if thumbnail:
              thumbnail = compat_urllib_parse.unquote(thumbnail)
  
          thumbnail = self._html_search_regex(r'"image_url":"([^"]+)', webpage, 'thumbnail', fatal=False)
          if thumbnail:
              thumbnail = compat_urllib_parse.unquote(thumbnail)
  
+        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        comment_count = self._extract_count(
+            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+
          video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
          if webpage.find('"encrypted":true') != -1:
              password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
          video_urls = list(map(compat_urllib_parse.unquote , re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
          if webpage.find('"encrypted":true') != -1:
              password = compat_urllib_parse.unquote_plus(self._html_search_regex(r'"video_title":"([^"]+)', webpage, 'password'))
@@ -77,6 +92,10 @@ class PornHubIE(InfoExtractor):
              'uploader': video_uploader,
              'title': video_title,
              'thumbnail': thumbnail,
              'uploader': video_uploader,
              'title': video_title,
              'thumbnail': thumbnail,
+            'view_count': view_count,
+            'like_count': like_count,
+            'dislike_count': dislike_count,
+            'comment_count': comment_count,
              'formats': formats,
              'age_limit': 18,
          }
              'formats': formats,
              'age_limit': 18,
          }
author	Sergey M․ <dstftw@gmail.com>
	Sat, 22 Mar 2014 14:29:01 +0000 (21:29 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Sat, 22 Mar 2014 14:30:22 +0000 (21:30 +0700)