[pornhub:playlist] Improve extraction (closes #11594)
authorSergey M․ <dstftw@gmail.com>
Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)
committerSergey M․ <dstftw@gmail.com>
Tue, 3 Jan 2017 22:32:18 +0000 (05:32 +0700)
youtube_dl/extractor/pornhub.py

index 40dbe6967fac2126b7bf6e6a1245768b3c039c8e..3eaf56973ec35072d8f0549c5850357ca94ed12b 100644 (file)
@@ -229,7 +229,14 @@ class PornHubPlaylistBaseIE(InfoExtractor):
 
         webpage = self._download_webpage(url, playlist_id)
 
-        entries = self._extract_entries(webpage)
+        # Only process container div with main playlist content skipping
+        # drop-down menu that uses similar pattern for videos (see
+        # https://github.com/rg3/youtube-dl/issues/11594).
+        container = self._search_regex(
+            r'(?s)(<div[^>]+class=["\']container.+)', webpage,
+            'container', default=webpage)
+
+        entries = self._extract_entries(container)
 
         playlist = self._parse_json(
             self._search_regex(
@@ -243,12 +250,12 @@ class PornHubPlaylistBaseIE(InfoExtractor):
 class PornHubPlaylistIE(PornHubPlaylistBaseIE):
     _VALID_URL = r'https?://(?:www\.)?pornhub\.com/playlist/(?P<id>\d+)'
     _TESTS = [{
-        'url': 'http://www.pornhub.com/playlist/6201671',
+        'url': 'http://www.pornhub.com/playlist/4667351',
         'info_dict': {
-            'id': '6201671',
-            'title': 'P0p4',
+            'id': '4667351',
+            'title': 'Nataly Hot',
         },
-        'playlist_mincount': 35,
+        'playlist_mincount': 2,
     }]