projects
/
youtube-dl
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
25f14e9
)
[youtube:feed] Check each 'load more' portion for unique video ids
author
Sergey M․
<dstftw@gmail.com>
Fri, 15 May 2015 15:42:34 +0000
(21:42 +0600)
committer
Sergey M․
<dstftw@gmail.com>
Fri, 15 May 2015 15:42:34 +0000
(21:42 +0600)
youtube_dl/extractor/youtube.py
patch
|
blob
|
history
diff --git
a/youtube_dl/extractor/youtube.py
b/youtube_dl/extractor/youtube.py
index 9096a29756ca6e1a66ecd442a92977fa1b999b31..1f9940cf5c1e4c8698a8a0bed9a874d5a79f18b0 100644
(file)
--- a/
youtube_dl/extractor/youtube.py
+++ b/
youtube_dl/extractor/youtube.py
@@
-1621,10
+1621,16
@@
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
# for the video ids doesn't contain an index
ids = []
more_widget_html = content_html = page
# for the video ids doesn't contain an index
ids = []
more_widget_html = content_html = page
-
for page_num in itertools.count(1):
matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
for page_num in itertools.count(1):
matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
+
+ # The 'recommended' feed has an infinite 'load more', and each new portion
+ # repeats the same videos in a (sometimes) slightly different order, so we
+ # check for uniqueness and break when a portion contains no new videos
+ new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+ if not new_ids:
+ break
+
ids.extend(new_ids)
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
ids.extend(new_ids)
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)