Check load_more_widget_html for feed paging
[youtube-dl] / youtube_dl / extractor / youtube.py
index 6e77504bf1011bc80eb42ecd4a153fe8eaeb224b..cd35a16207273c3d6d9ccd81722102f998a0ebc1 100644 (file)
@@ -1,15 +1,12 @@
 # coding: utf-8
 
-import collections
 import errno
 import io
 import itertools
 import json
 import os.path
 import re
-import struct
 import traceback
-import zlib
 
 from .common import InfoExtractor, SearchInfoExtractor
 from .subtitles import SubtitlesInfoExtractor
@@ -349,8 +346,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 
     def _extract_signature_function(self, video_id, player_url, slen):
         id_m = re.match(
-            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3)?\.(?P<ext>[a-z]+)$',
+            r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
             player_url)
+        if not id_m:
+            raise ExtractorError('Cannot identify player %r' % player_url)
         player_type = id_m.group('ext')
         player_id = id_m.group('id')
 
@@ -609,14 +608,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
             age_gate = True
             # We simulate the access to the video from www.youtube.com/v/{video_id}
             # this can be viewed without login into Youtube
-            data = compat_urllib_parse.urlencode({'video_id': video_id,
-                                                  'el': 'player_embedded',
-                                                  'gl': 'US',
-                                                  'hl': 'en',
-                                                  'eurl': 'https://youtube.googleapis.com/v/' + video_id,
-                                                  'asv': 3,
-                                                  'sts':'1588',
-                                                  })
+            data = compat_urllib_parse.urlencode({
+                'video_id': video_id,
+                'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+                'sts': self._search_regex(
+                    r'"sts"\s*:\s*(\d+)', video_webpage, 'sts'),
+            })
             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
             video_info_webpage = self._download_webpage(video_info_url, video_id,
                                     note=False,
@@ -839,7 +836,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                                     player_desc = 'flash player %s' % player_version
                                 else:
                                     player_version = self._search_regex(
-                                        r'html5player-(.+?)\.js', video_webpage,
+                                        r'html5player-([^/]+?)(?:/html5player)?\.js',
+                                        player_url,
                                         'html5 player', fatal=False)
                                     player_desc = u'html5 player %s' % player_version
 
@@ -1327,6 +1325,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
                                           u'%s feed' % self._FEED_NAME,
                                           u'Downloading page %s' % i)
             feed_html = info.get('feed_html') or info.get('content_html')
+            load_more_widget_html = info.get('load_more_widget_html') or feed_html
             m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
             ids = orderedSet(m.group(1) for m in m_ids)
             feed_entries.extend(
@@ -1334,7 +1333,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
                 for video_id in ids)
             mobj = re.search(
                 r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                feed_html)
+                load_more_widget_html)
             if mobj is None:
                 break
             paging = mobj.group('paging')