Merge branch 'sportbox-fix' of https://github.com/maddoger/youtube-dl into maddoger...
authorSergey M․ <dstftw@gmail.com>
Fri, 15 May 2015 16:09:18 +0000 (22:09 +0600)
committerSergey M․ <dstftw@gmail.com>
Fri, 15 May 2015 16:09:18 +0000 (22:09 +0600)
test/test_YoutubeDL.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/ccc.py
youtube_dl/extractor/youtube.py

index 82b827536d746246d3241fd856f749945cf6d561..a13c09ef40c7d8c101c86b471cae142ada9ccd7c 100644 (file)
@@ -12,6 +12,7 @@ import copy
 
 from test.helper import FakeYDL, assertRegexpMatches
 from youtube_dl import YoutubeDL
+from youtube_dl.compat import compat_str
 from youtube_dl.extractor import YoutubeIE
 from youtube_dl.postprocessor.common import PostProcessor
 from youtube_dl.utils import match_filter_func
@@ -507,6 +508,51 @@ class TestYoutubeDL(unittest.TestCase):
         res = get_videos(f)
         self.assertEqual(res, ['1'])
 
+    def test_playlist_items_selection(self):
+        entries = [{
+            'id': compat_str(i),
+            'title': compat_str(i),
+            'url': TEST_URL,
+        } for i in range(1, 5)]
+        playlist = {
+            '_type': 'playlist',
+            'id': 'test',
+            'entries': entries,
+            'extractor': 'test:playlist',
+            'extractor_key': 'test:playlist',
+            'webpage_url': 'http://example.com',
+        }
+
+        def get_ids(params):
+            ydl = YDL(params)
+            # make a copy because the dictionary can be modified
+            ydl.process_ie_result(playlist.copy())
+            return [int(v['id']) for v in ydl.downloaded_info_dicts]
+
+        result = get_ids({})
+        self.assertEqual(result, [1, 2, 3, 4])
+
+        result = get_ids({'playlistend': 10})
+        self.assertEqual(result, [1, 2, 3, 4])
+
+        result = get_ids({'playlistend': 2})
+        self.assertEqual(result, [1, 2])
+
+        result = get_ids({'playliststart': 10})
+        self.assertEqual(result, [])
+
+        result = get_ids({'playliststart': 2})
+        self.assertEqual(result, [2, 3, 4])
+
+        result = get_ids({'playlist_items': '2-4'})
+        self.assertEqual(result, [2, 3, 4])
+
+        result = get_ids({'playlist_items': '2,4'})
+        self.assertEqual(result, [2, 4])
+
+        result = get_ids({'playlist_items': '10'})
+        self.assertEqual(result, [])
+
 
 if __name__ == '__main__':
     unittest.main()
index 691f3e09f807de52c1c19f334befd0ccc0d4f82c..5df889945947eda4ae1b8a152a1e325c9cb56936 100755 (executable)
@@ -759,7 +759,9 @@ class YoutubeDL(object):
             if isinstance(ie_entries, list):
                 n_all_entries = len(ie_entries)
                 if playlistitems:
-                    entries = [ie_entries[i - 1] for i in playlistitems]
+                    entries = [
+                        ie_entries[i - 1] for i in playlistitems
+                        if -n_all_entries <= i - 1 < n_all_entries]
                 else:
                     entries = ie_entries[playliststart:playlistend]
                 n_entries = len(entries)
index 2a5d4be185d64f47ce37ca83f9bb6bd1d7eac829..6924eac704cd5cf02d266bd60125dad9cf13e765 100644 (file)
@@ -16,7 +16,7 @@ class CCCIE(InfoExtractor):
 
     _TEST = {
         'url': 'http://media.ccc.de/browse/congress/2013/30C3_-_5443_-_en_-_saal_g_-_201312281830_-_introduction_to_processor_design_-_byterazor.html#video',
-        'md5': '205a365d0d57c0b1e43a12c9ffe8f9be',
+        'md5': '3a1eda8f3a29515d27f5adb967d7e740',
         'info_dict': {
             'id': '20131228183',
             'ext': 'mp4',
@@ -51,7 +51,7 @@ class CCCIE(InfoExtractor):
 
         matches = re.finditer(r'''(?xs)
             <(?:span|div)\s+class='label\s+filetype'>(?P<format>.*?)</(?:span|div)>\s*
-            <a\s+href='(?P<http_url>[^']+)'>\s*
+            <a\s+download\s+href='(?P<http_url>[^']+)'>\s*
             (?:
                 .*?
                 <a\s+href='(?P<torrent_url>[^']+\.torrent)'
index e58184adcf4b329c85ddb8bde2f352e383404ed8..1f9940cf5c1e4c8698a8a0bed9a874d5a79f18b0 100644 (file)
@@ -49,6 +49,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             # YouTube sets the expire time to about two months
             expire_time=time.time() + 2 * 30 * 24 * 3600)
 
+    def _ids_to_results(self, ids):
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
+
     def _login(self):
         """
         Attempt to log in to YouTube.
@@ -1261,11 +1266,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
     def _real_initialize(self):
         self._login()
 
-    def _ids_to_results(self, ids):
-        return [
-            self.url_result(vid_id, 'Youtube', video_id=vid_id)
-            for vid_id in ids]
-
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a single video
         # the id of the playlist is just 'RD' + video_id
@@ -1601,20 +1601,10 @@ class YoutubeShowIE(InfoExtractor):
 
 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
     """
-    Base class for extractors that fetch info from
-    http://www.youtube.com/feed_ajax
+    Base class for feed extractors
     Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
     """
     _LOGIN_REQUIRED = True
-    # use action_load_personal_feed instead of action_load_system_feed
-    _PERSONAL_FEED = False
-
-    @property
-    def _FEED_TEMPLATE(self):
-        action = 'action_load_system_feed'
-        if self._PERSONAL_FEED:
-            action = 'action_load_personal_feed'
-        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
 
     @property
     def IE_NAME(self):
@@ -1624,67 +1614,23 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
         self._login()
 
     def _real_extract(self, url):
-        feed_entries = []
-        paging = 0
-        for i in itertools.count(1):
-            info = self._download_json(
-                self._FEED_TEMPLATE % paging,
-                '%s feed' % self._FEED_NAME,
-                'Downloading page %s' % i,
-                transform_source=uppercase_escape)
-            feed_html = info.get('feed_html') or info.get('content_html')
-            load_more_widget_html = info.get('load_more_widget_html') or feed_html
-            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
-            ids = orderedSet(m.group(1) for m in m_ids)
-            feed_entries.extend(
-                self.url_result(video_id, 'Youtube', video_id=video_id)
-                for video_id in ids)
-            mobj = re.search(
-                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                load_more_widget_html)
-            if mobj is None:
-                break
-            paging = mobj.group('paging')
-        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
-
-
-class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
-    IE_NAME = 'youtube:recommended'
-    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
-    _FEED_NAME = 'recommended'
-    _PLAYLIST_TITLE = 'Youtube Recommended videos'
-
-
-class YoutubeWatchLaterIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:watchlater'
-    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
-
-    _TESTS = []  # override PlaylistIE tests
-
-    def _real_extract(self, url):
-        return self._extract_playlist('WL')
-
-
-class YoutubeHistoryIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:history'
-    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
-    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube History'
-        page = self._download_webpage('https://www.youtube.com/feed/history', title)
+        page = self._download_webpage(
+            'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
 
         # The extraction process is the same as for playlists, but the regex
         # for the video ids doesn't contain an index
         ids = []
         more_widget_html = content_html = page
-
         for page_num in itertools.count(1):
             matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
+
+            # 'recommended' feed has infinite 'load more' and each new portion spins
+            # the same videos in (sometimes) slightly different order, so we'll check
+            # for unicity and break when portion has no new videos
+            new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+            if not new_ids:
+                break
+
             ids.extend(new_ids)
 
             mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
@@ -1692,17 +1638,25 @@ class YoutubeHistoryIE(YoutubePlaylistIE):
                 break
 
             more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
+                'https://youtube.com/%s' % mobj.group('more'), self._PLAYLIST_TITLE,
                 'Downloading page #%s' % page_num,
                 transform_source=uppercase_escape)
             content_html = more['content_html']
             more_widget_html = more['load_more_widget_html']
 
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+        return self.playlist_result(
+            self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
+
+
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:watchlater'
+    IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+
+    _TESTS = []  # override PlaylistIE tests
+
+    def _real_extract(self, url):
+        return self._extract_playlist('WL')
 
 
 class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
@@ -1717,42 +1671,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
         return self.url_result(playlist_id, 'YoutubePlaylist')
 
 
-class YoutubeSubscriptionsIE(YoutubePlaylistIE):
-    IE_NAME = 'youtube:subscriptions'
-    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
-    _TESTS = []
-
-    def _real_extract(self, url):
-        title = 'Youtube Subscriptions'
-        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
-
-        # The extraction process is the same as for playlists, but the regex
-        # for the video ids doesn't contain an index
-        ids = []
-        more_widget_html = content_html = page
+class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+    _FEED_NAME = 'recommended'
+    _PLAYLIST_TITLE = 'Youtube Recommended videos'
 
-        for page_num in itertools.count(1):
-            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
-            new_ids = orderedSet(matches)
-            ids.extend(new_ids)
 
-            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-            if not mobj:
-                break
+class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
+    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+    _FEED_NAME = 'subscriptions'
+    _PLAYLIST_TITLE = 'Youtube Subscriptions'
 
-            more = self._download_json(
-                'https://youtube.com/%s' % mobj.group('more'), title,
-                'Downloading page #%s' % page_num,
-                transform_source=uppercase_escape)
-            content_html = more['content_html']
-            more_widget_html = more['load_more_widget_html']
 
-        return {
-            '_type': 'playlist',
-            'title': title,
-            'entries': self._ids_to_results(ids),
-        }
+class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
+    _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
+    _FEED_NAME = 'history'
+    _PLAYLIST_TITLE = 'Youtube History'
 
 
 class YoutubeTruncatedURLIE(InfoExtractor):