Match --download-archive during playlist processing (Fixes #1745)

author Philipp Hagemeister <phihag@phihag.de>

Fri, 22 Nov 2013 21:46:46 +0000 (22:46 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Fri, 22 Nov 2013 21:46:46 +0000 (22:46 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Nov 2013 21:46:46 +0000 (22:46 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Nov 2013 21:46:46 +0000 (22:46 +0100)
diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py

index 50ad52695e37bd7cb4a950fc1da5d2e65790dc6e..938517a2de02c1d81f88f5ca64efbc13e53b54fa 100644 (file)
--- a/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@ -84,16 +84,16 @@ class TestYoutubeLists(unittest.TestCase):
          dl = FakeYDL()
          ie = YoutubeChannelIE(dl)
          #test paginated channel
-        result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')[0]
+        result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')
          self.assertTrue(len(result['entries']) > 90)
          #test autogenerated channel
-        result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')[0]
+        result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
          self.assertTrue(len(result['entries']) >= 18)
  
      def test_youtube_user(self):
          dl = FakeYDL()
          ie = YoutubeUserIE(dl)
-        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
+        result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')
          self.assertTrue(len(result['entries']) >= 320)
  
      def test_youtube_safe_search(self):
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py

index 2700051cf32309e5ba93397d0cf1f6ccb70be9db..beb7d0cd1970021509f704a79cab8b9b61781413 100644 (file)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -355,15 +355,17 @@ class YoutubeDL(object):
      def _match_entry(self, info_dict):
          """ Returns None iff the file should be downloaded """
  
-        title = info_dict['title']
-        matchtitle = self.params.get('matchtitle', False)
-        if matchtitle:
-            if not re.search(matchtitle, title, re.IGNORECASE):
-                return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
-        rejecttitle = self.params.get('rejecttitle', False)
-        if rejecttitle:
-            if re.search(rejecttitle, title, re.IGNORECASE):
-                return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
+        if 'title' in info_dict:
+            # This can happen when we're just evaluating the playlist
+            title = info_dict['title']
+            matchtitle = self.params.get('matchtitle', False)
+            if matchtitle:
+                if not re.search(matchtitle, title, re.IGNORECASE):
+                    return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"'
+            rejecttitle = self.params.get('rejecttitle', False)
+            if rejecttitle:
+                if re.search(rejecttitle, title, re.IGNORECASE):
+                    return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"'
          date = info_dict.get('upload_date', None)
          if date is not None:
              dateRange = self.params.get('daterange', DateRange())
@@ -374,8 +376,8 @@ class YoutubeDL(object):
              if age_limit < info_dict.get('age_limit', 0):
                  return u'Skipping "' + title + '" because it is age restricted'
          if self.in_download_archive(info_dict):
-            return (u'%(title)s has already been recorded in archive'
-                    % info_dict)
+            return (u'%s has already been recorded in archive'
+                    % info_dict.get('title', info_dict.get('id', u'video')))
          return None
  
      @staticmethod
@@ -454,7 +456,7 @@ class YoutubeDL(object):
                                       ie_key=ie_result.get('ie_key'),
                                       extra_info=extra_info)
          elif result_type == 'playlist':
-            self.add_extra_info(ie_result, extra_info)
+
              # We process each entry in the playlist
              playlist = ie_result.get('title', None) or ie_result.get('id', None)
              self.to_screen(u'[download] Downloading playlist: %s' % playlist)
@@ -484,6 +486,12 @@ class YoutubeDL(object):
                      'webpage_url': ie_result['webpage_url'],
                      'extractor_key': ie_result['extractor_key'],
                  }
+
+                reason = self._match_entry(entry)
+                if reason is not None:
+                    self.to_screen(u'[download] ' + reason)
+                    continue
+
                  entry_result = self.process_ie_result(entry,
                                                        download=download,
                                                        extra_info=extra)
@@ -810,7 +818,16 @@ class YoutubeDL(object):
          fn = self.params.get('download_archive')
          if fn is None:
              return False
-        vid_id = info_dict['extractor'] + u' ' + info_dict['id']
+        extractor = info_dict.get('extractor_id')
+        if extractor is None:
+            if 'id' in info_dict:
+                extractor = info_dict.get('ie_key')  # key in a playlist
+        if extractor is None:
+            return False  # Incomplete video information
+        # Future-proof against any change in case
+        # and backwards compatibility with prior versions
+        extractor = extractor.lower()
+        vid_id = extractor + u' ' + info_dict['id']
          try:
              with locked_file(fn, 'r', encoding='utf-8') as archive_file:
                  for line in archive_file:
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index eb3435c775c408bb07b3adc176a75b85704ffc77..3cebeaf29883875ca30e45c32234d83148b45caa 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -229,12 +229,14 @@ class InfoExtractor(object):
          self.to_screen(u'Logging in')
  
      #Methods for following #608
-    def url_result(self, url, ie=None):
+    def url_result(self, url, ie=None, video_id=None):
          """Returns a url that points to a page that should be processed"""
          #TODO: ie should be the class used for getting the info
          video_info = {'_type': 'url',
                        'url': url,
                        'ie_key': ie}
+        if video_id is not None:
+            video_info['id'] = video_id
          return video_info
      def playlist_result(self, entries, playlist_id=None, playlist_title=None):
          """Returns a playlist"""
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 9b09793eb307b4ca899942e57cfa3dda88ec5caa..126688652743bc81d7b178b98569b3827e24adfa 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1552,7 +1552,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
              video_id = query_dict['v'][0]
              if self._downloader.params.get('noplaylist'):
                  self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
              else:
                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
  
@@ -1571,7 +1571,8 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
  
          playlist_title = self._og_search_title(page)
  
-        url_results = [self.url_result(vid, 'Youtube') for vid in ids]
+        url_results = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                       for video_id in ids]
          return self.playlist_result(url_results, playlist_id, playlist_title)
  
  
@@ -1626,9 +1627,9 @@ class YoutubeChannelIE(InfoExtractor):
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
-        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
-        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
-        return [self.playlist_result(url_entries, channel_id)]
+        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                       for video_id in video_ids]
+        return self.playlist_result(url_entries, channel_id)
  
  
  class YoutubeUserIE(InfoExtractor):
@@ -1692,9 +1693,11 @@ class YoutubeUserIE(InfoExtractor):
              if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                  break
  
-        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
-        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
-        return [self.playlist_result(url_results, playlist_title = username)]
+        url_results = [
+            self.url_result(video_id, 'Youtube', video_id=video_id)
+            for video_id in video_ids]
+        return self.playlist_result(url_results, playlist_title=username)
+
  
  class YoutubeSearchIE(SearchInfoExtractor):
      IE_DESC = u'YouTube.com searches'
@@ -1735,7 +1738,8 @@ class YoutubeSearchIE(SearchInfoExtractor):
  
          if len(video_ids) > n:
              video_ids = video_ids[:n]
-        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
+        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
+                  for video_id in video_ids]
          return self.playlist_result(videos, query)
  
  class YoutubeSearchDateIE(YoutubeSearchIE):
@@ -1795,7 +1799,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
              feed_html = info['feed_html']
              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
+            feed_entries.extend(
+                self.url_result(video_id, 'Youtube', video_id=video_id)
+                for video_id in ids)
              if info['paging'] is None:
                  break
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
author	Philipp Hagemeister <phihag@phihag.de>
	Fri, 22 Nov 2013 21:46:46 +0000 (22:46 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Fri, 22 Nov 2013 21:46:46 +0000 (22:46 +0100)
test/test_youtube_lists.py		patch \| blob \| history
youtube_dl/YoutubeDL.py		patch \| blob \| history
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/youtube.py		patch \| blob \| history