about summary refs log tree commit diff
diff options
context:
space:
mode:
author Sergey M․ <dstftw@gmail.com> 2015-05-15 21:42:34 +0600
committer Sergey M․ <dstftw@gmail.com> 2015-05-15 21:42:34 +0600
commit 62c95fd5fcb8dbea2faeb4edac4c5177cbac5912 (patch)
tree 7b4761274ee89593399cd25998e754b6c262f4bb
parent 25f14e9f93295a787e0cb436a5f6179d6174733d (diff)
download youtube-dl-62c95fd5fcb8dbea2faeb4edac4c5177cbac5912.tar.xz
[youtube:feed] Check each 'load more' portion for unique video ids
-rw-r--r-- youtube_dl/extractor/youtube.py | 10
1 file changed, 8 insertions, 2 deletions
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 9096a2975..1f9940cf5 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1621,10 +1621,16 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
# for the video ids doesn't contain an index
ids = []
more_widget_html = content_html = page
-
for page_num in itertools.count(1):
matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
- new_ids = orderedSet(matches)
+
+ # The 'recommended' feed has an infinite 'load more', and each new portion
+ # repeats the same videos in a (sometimes) slightly different order, so we
+ # check for uniqueness and break when a portion has no new videos
+ new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+ if not new_ids:
+ break
+
ids.extend(new_ids)
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)