[bandcamp] Fix extraction for incomplete albums

author Yen Chi Hsuan <yan12125@gmail.com>

Sun, 5 Feb 2017 14:47:04 +0000 (22:47 +0800)

committer Yen Chi Hsuan <yan12125@gmail.com>

Sun, 5 Feb 2017 14:47:04 +0000 (22:47 +0800)
author Yen Chi Hsuan <yan12125@gmail.com>
Sun, 5 Feb 2017 14:47:04 +0000 (22:47 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Sun, 5 Feb 2017 14:47:04 +0000 (22:47 +0800)
diff --git a/ChangeLog b/ChangeLog

index 77286dbef3c8f190a2e304f71251965f1c69d872..984191925d7a1bf87c6e4d78a019c767e53c80b3 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,7 @@
  version <unreleased>
  
  Extractors
+* [bandcamp] Fix extraction for incomplete albums (#11727)
  * [iwara] Fix extraction (#11781)
  * [googledrive] Fix extraction on Python 3.6
  
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py

index 88c590e98388d5f6058dd71ffb97f4f0254f0c5b..056e06376667e02b34c8efa7b2565be51e4625a4 100644 (file)
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -209,6 +209,15 @@ class BandcampAlbumIE(InfoExtractor):
              'id': 'entropy-ep',
          },
          'playlist_mincount': 3,
+    }, {
+        # not all tracks have songs
+        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
+        'info_dict': {
+            'id': 'we-are-the-plague',
+            'title': 'WE ARE THE PLAGUE',
+            'uploader_id': 'insulters',
+        },
+        'playlist_count': 2,
      }]
  
      def _real_extract(self, url):
@@ -217,12 +226,16 @@ class BandcampAlbumIE(InfoExtractor):
          album_id = mobj.group('album_id')
          playlist_id = album_id or uploader_id
          webpage = self._download_webpage(url, playlist_id)
-        tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
-        if not tracks_paths:
+        track_elements = re.findall(
+            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
+        if not track_elements:
              raise ExtractorError('The page doesn\'t contain any tracks')
+        # Only tracks with duration info have songs
          entries = [
              self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
-            for t_path in tracks_paths]
+            for elem_content, t_path in track_elements
+            if self._html_search_meta('duration', elem_content, default=None)]
+
          title = self._html_search_regex(
              r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
              webpage, 'title', fatal=False)
author	Yen Chi Hsuan <yan12125@gmail.com>
	Sun, 5 Feb 2017 14:47:04 +0000 (22:47 +0800)
committer	Yen Chi Hsuan <yan12125@gmail.com>
	Sun, 5 Feb 2017 14:47:04 +0000 (22:47 +0800)
ChangeLog		patch | blob | history
youtube_dl/extractor/bandcamp.py		patch | blob | history