[pornhub:playlist] Improve extraction (closes #11594)

author Sergey M․ <dstftw@gmail.com>

Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)

committer Sergey M․ <dstftw@gmail.com>

Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)
author Sergey M․ <dstftw@gmail.com>
Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)
committer Sergey M․ <dstftw@gmail.com>
Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py

index 40dbe6967fac2126b7bf6e6a1245768b3c039c8e..3eaf56973ec35072d8f0549c5850357ca94ed12b 100644 (file)
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -229,7 +229,14 @@ class PornHubPlaylistBaseIE(InfoExtractor):
  
          webpage = self._download_webpage(url, playlist_id)
  
-        entries = self._extract_entries(webpage)
+        # Only process the container div holding the main playlist content,
+        # skipping the drop-down menu that uses a similar pattern for videos
+        # (see https://github.com/rg3/youtube-dl/issues/11594).
+        container = self._search_regex(
+            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
+            'container', default=webpage)
+
+        entries = self._extract_entries(container)
  
          playlist = self._parse_json(
              self._search_regex(
@@ -243,12 +250,12 @@ class PornHubPlaylistBaseIE(InfoExtractor):
  class PornHubPlaylistIE(PornHubPlaylistBaseIE):
      _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
      _TESTS = [{
-        'url': 'http://www.pornhub.com/playlist/6201671',
+        'url': 'http://www.pornhub.com/playlist/4667351',
          'info_dict': {
-            'id': '6201671',
-            'title': 'P0p4',
+            'id': '4667351',
+            'title': 'Nataly Hot',
          },
-        'playlist_mincount': 35,
+        'playlist_mincount': 2,
      }]
author	Sergey M․ <dstftw@gmail.com>
	Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)
committer	Sergey M․ <dstftw@gmail.com>
	Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)