Merge remote-tracking branch 'drags/yt-feed-loadmore'
[youtube-dl] / youtube_dl / extractor / youtube.py
index 78f3b7e7bddb1b085c96ae41696837178c2f42aa..e28db2b5a57c7208ac61ab90a026b0d26e050ba5 100644 (file)
@@ -1,7 +1,5 @@
 # coding: utf-8
 
-import errno
-import io
 import itertools
 import json
 import os.path
@@ -21,7 +19,6 @@ from ..utils import (
     compat_str,
 
     clean_html,
-    get_cachedir,
     get_element_by_id,
     get_element_by_attribute,
     ExtractorError,
@@ -30,7 +27,6 @@ from ..utils import (
     unescapeHTML,
     unified_strdate,
     orderedSet,
-    write_json_file,
     uppercase_escape,
 )
 
@@ -203,7 +199,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
     IE_DESC = u'YouTube.com'
     _VALID_URL = r"""(?x)^
                      (
-                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
+                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                             (?:www\.)?deturl\.com/www\.youtube\.com/|
                             (?:www\.)?pwnyoutube\.com/|
@@ -221,7 +217,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                              )
                          ))
                          |youtu\.be/                                          # just youtu.be/xxxx
-                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
+                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                          )
                      )?                                                       # all until now is optional -> you can pass the naked ID
                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
@@ -435,26 +431,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         func_id = '%s_%s_%s' % (
             player_type, player_id, self._signature_cache_id(example_sig))
         assert os.path.basename(func_id) == func_id
-        cache_dir = get_cachedir(self._downloader.params)
 
-        cache_enabled = cache_dir is not None
-        if cache_enabled:
-            cache_fn = os.path.join(os.path.expanduser(cache_dir),
-                                    u'youtube-sigfuncs',
-                                    func_id + '.json')
-            try:
-                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
-                    cache_spec = json.load(cachef)
-                return lambda s: u''.join(s[i] for i in cache_spec)
-            except IOError:
-                pass  # No cache available
-            except ValueError:
-                try:
-                    file_size = os.path.getsize(cache_fn)
-                except (OSError, IOError) as oe:
-                    file_size = str(oe)
-                self._downloader.report_warning(
-                    u'Cache %s failed (%s)' % (cache_fn, file_size))
+        cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
+        if cache_spec is not None:
+            return lambda s: u''.join(s[i] for i in cache_spec)
 
         if player_type == 'js':
             code = self._download_webpage(
@@ -472,22 +452,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
         else:
             assert False, 'Invalid player type %r' % player_type
 
-        if cache_enabled:
-            try:
-                test_string = u''.join(map(compat_chr, range(len(example_sig))))
-                cache_res = res(test_string)
-                cache_spec = [ord(c) for c in cache_res]
-                try:
-                    os.makedirs(os.path.dirname(cache_fn))
-                except OSError as ose:
-                    if ose.errno != errno.EEXIST:
-                        raise
-                write_json_file(cache_spec, cache_fn)
-            except Exception:
-                tb = traceback.format_exc()
-                self._downloader.report_warning(
-                    u'Writing cache to %r failed: %s' % (cache_fn, tb))
+        if cache_spec is None:
+            test_string = u''.join(map(compat_chr, range(len(example_sig))))
+            cache_res = res(test_string)
+            cache_spec = [ord(c) for c in cache_res]
 
+        self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
         return res
 
     def _print_sig_code(self, func, example_sig):
@@ -1055,21 +1025,26 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         self._login()
 
     def _ids_to_results(self, ids):
-        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
-                       for vid_id in ids]
+        return [
+            self.url_result(vid_id, 'Youtube', video_id=vid_id)
+            for vid_id in ids]
 
     def _extract_mix(self, playlist_id):
         # The mixes are generated from a single video
         # the id of the playlist is just 'RD' + video_id
         url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
-        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
+        webpage = self._download_webpage(
+            url, playlist_id, u'Downloading Youtube mix')
         search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
-        title_span = (search_title('playlist-title') or
-            search_title('title long-title') or search_title('title'))
+        title_span = (
+            search_title('playlist-title') or
+            search_title('title long-title') or
+            search_title('title'))
         title = clean_html(title_span)
-        video_re = r'''(?x)data-video-username=".*?".*?
-                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id)
-        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
+        ids = orderedSet(re.findall(
+            r'''(?xs)data-video-username=".*?".*?
+                       href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id),
+            webpage))
         url_results = self._ids_to_results(ids)
 
         return self.playlist_result(url_results, playlist_id, title)
@@ -1162,6 +1137,7 @@ class YoutubeTopListIE(YoutubePlaylistIE):
             msg = u'Downloading Youtube mix'
             if i > 0:
                 msg += ', retry #%d' % i
+
             webpage = self._download_webpage(url, title, msg)
             ids = orderedSet(re.findall(video_re, webpage))
             if ids:
@@ -1421,6 +1397,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
                                           u'%s feed' % self._FEED_NAME,
                                           u'Downloading page %s' % i)
             feed_html = info.get('feed_html') or info.get('content_html')
+            load_more_widget_html = info.get('load_more_widget_html') or feed_html
             m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
             ids = orderedSet(m.group(1) for m in m_ids)
             feed_entries.extend(
@@ -1428,7 +1405,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
                 for video_id in ids)
             mobj = re.search(
                 r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                feed_html)
+                load_more_widget_html)
             if mobj is None:
                 break
             paging = mobj.group('paging')