Merge pull request #12861 from Tithen-Firion/cbsinteractive-fix

[youtube-dl] / youtube_dl / extractor / bandcamp.py
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py

index 88c590e98388d5f6058dd71ffb97f4f0254f0c5b..489d0ba53f672363213c7f788e83b692eb11894d 100644 (file)
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):
          '_skip': 'There is a limit of 200 free downloads / month for the test song'
      }, {
          'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
-        'md5': '73d0b3171568232574e45652f8720b5c',
+        'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
          'info_dict': {
              'id': '2650410135',
-            'ext': 'mp3',
-            'title': 'Lanius (Battle)',
-            'uploader': 'Ben Prunty Music',
+            'ext': 'aiff',
+            'title': 'Ben Prunty - Lanius (Battle)',
+            'uploader': 'Ben Prunty',
          },
      }]
  
@@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          title = mobj.group('title')
          webpage = self._download_webpage(url, title)
+        thumbnail = self._html_search_meta('og:image', webpage, default=None)
          m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
          if not m_download:
              m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
@@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor):
                  return {
                      'id': track_id,
                      'title': data['title'],
+                    'thumbnail': thumbnail,
                      'formats': formats,
                      'duration': float_or_none(data.get('duration')),
                  }
@@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor):
          return {
              'id': video_id,
              'title': title,
-            'thumbnail': info.get('thumb_url'),
+            'thumbnail': info.get('thumb_url') or thumbnail,
              'uploader': info.get('artist'),
              'artist': artist,
              'track': track,
@@ -209,6 +211,15 @@ class BandcampAlbumIE(InfoExtractor):
              'id': 'entropy-ep',
          },
          'playlist_mincount': 3,
+    }, {
+        # not all tracks have songs
+        'url': 'https://insulters.bandcamp.com/album/we-are-the-plague',
+        'info_dict': {
+            'id': 'we-are-the-plague',
+            'title': 'WE ARE THE PLAGUE',
+            'uploader_id': 'insulters',
+        },
+        'playlist_count': 2,
      }]
  
      def _real_extract(self, url):
@@ -217,12 +228,16 @@ class BandcampAlbumIE(InfoExtractor):
          album_id = mobj.group('album_id')
          playlist_id = album_id or uploader_id
          webpage = self._download_webpage(url, playlist_id)
-        tracks_paths = re.findall(r'<a href="(.*?)" itemprop="url">', webpage)
-        if not tracks_paths:
+        track_elements = re.findall(
+            r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
+        if not track_elements:
              raise ExtractorError('The page doesn\'t contain any tracks')
+        # Only tracks with duration info have songs
          entries = [
              self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
-            for t_path in tracks_paths]
+            for elem_content, t_path in track_elements
+            if self._html_search_meta('duration', elem_content, default=None)]
+
          title = self._html_search_regex(
              r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"',
              webpage, 'title', fatal=False)