X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=e28db2b5a57c7208ac61ab90a026b0d26e050ba5;hb=9b583dca4cf3b623323de8fadf6dc851b7111fd2;hp=5bfe5e7e586a84b89c3365866f80d1e91bbadf73;hpb=83317f693870424c2c769e4d964453401063fdf1;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5bfe5e7e5..e28db2b5a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,7 +1,5 @@ # coding: utf-8 -import errno -import io import itertools import json import os.path @@ -21,7 +19,6 @@ from ..utils import ( compat_str, clean_html, - get_cachedir, get_element_by_id, get_element_by_attribute, ExtractorError, @@ -30,7 +27,6 @@ from ..utils import ( unescapeHTML, unified_strdate, orderedSet, - write_json_file, uppercase_escape, ) @@ -203,7 +199,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""(?x)^ ( - (?:https?://|//)? # http(s):// or protocol-independent URL (optional) + (?:https?://|//) # http(s):// or protocol-independent URL (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| @@ -221,7 +217,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): ) )) |youtu\.be/ # just youtu.be/xxxx - |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= + |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID @@ -297,7 +293,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # RTMP (unnamed) @@ -316,6 +312,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"upload_date": u"20121002", u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .", u"categories": [u'Science & Technology'], + 'like_count': int, + 'dislike_count': int, } }, { @@ -433,19 +431,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): func_id = '%s_%s_%s' % ( player_type, player_id, self._signature_cache_id(example_sig)) assert os.path.basename(func_id) == func_id - cache_dir = get_cachedir(self._downloader.params) - cache_enabled = cache_dir is not None - if cache_enabled: - cache_fn = os.path.join(os.path.expanduser(cache_dir), - u'youtube-sigfuncs', - func_id + '.json') - try: - with io.open(cache_fn, 'r', encoding='utf-8') as cachef: - cache_spec = json.load(cachef) - return lambda s: u''.join(s[i] for i in cache_spec) - except IOError: - pass # No cache available + cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id) + if cache_spec is not None: + return lambda s: u''.join(s[i] for i in cache_spec) if player_type == 'js': code = self._download_webpage( @@ -463,22 +452,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: assert False, 'Invalid player type %r' % player_type - if cache_enabled: - try: - test_string = u''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - try: - os.makedirs(os.path.dirname(cache_fn)) - except OSError as ose: - if ose.errno != errno.EEXIST: - raise - write_json_file(cache_spec, cache_fn) - except Exception: - tb = traceback.format_exc() - self._downloader.report_warning( - u'Writing cache to %r failed: %s' % (cache_fn, tb)) + if cache_spec is None: + test_string = u''.join(map(compat_chr, range(len(example_sig)))) + cache_res = res(test_string) + cache_spec = [ord(c) for c in cache_res] + self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec) return res def _print_sig_code(self, func, example_sig): @@ -573,6 +552,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): sub_lang_list = {} for l in lang_list: lang = l[1] + if lang in sub_lang_list: + continue params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, @@ -775,7 +756,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) - m_cat_container = get_element_by_id("eow-category", video_webpage) + m_cat_container = self._search_regex( + r'(?s)]*>\s*Category\s*\s*]*>(.*?)', + video_webpage, 'categories', fatal=False) if m_cat_container: category = self._html_search_regex( r'(?s)(.*?)', m_cat_container, 'category', @@ -804,15 +787,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: video_description = u'' - def _extract_count(klass): + def _extract_count(count_name): count = self._search_regex( - r'class="%s">([\d,]+)' % re.escape(klass), - video_webpage, klass, default=None) + r'id="watch-%s"[^>]*>.*?([\d,]+)\s*' % re.escape(count_name), + video_webpage, count_name, default=None) if count is not None: return int(count.replace(',', '')) return None - like_count = _extract_count(u'likes-count') - dislike_count = _extract_count(u'dislikes-count') + like_count = _extract_count(u'like') + dislike_count = _extract_count(u'dislike') # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) @@ -1042,21 +1025,26 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): self._login() def _ids_to_results(self, ids): - return [self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] + return [ + self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] def _extract_mix(self, playlist_id): # The mixes are generated from a a single video # the id of the playlist is just 'RD' + video_id url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id) - webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') + webpage = self._download_webpage( + url, playlist_id, u'Downloading Youtube mix') search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) - title_span = (search_title('playlist-title') or - search_title('title long-title') or search_title('title')) + title_span = ( + search_title('playlist-title') or + search_title('title long-title') or + search_title('title')) title = clean_html(title_span) - video_re = r'''(?x)data-video-username=".*?".*? - href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) - ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) + ids = orderedSet(re.findall( + r'''(?xs)data-video-username=".*?".*? + href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), + webpage)) url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1149,6 +1137,7 @@ class YoutubeTopListIE(YoutubePlaylistIE): msg = u'Downloading Youtube mix' if i > 0: msg += ', retry #%d' % i + webpage = self._download_webpage(url, title, msg) ids = orderedSet(re.findall(video_re, webpage)) if ids: @@ -1408,6 +1397,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) feed_html = info.get('feed_html') or info.get('content_html') + load_more_widget_html = info.get('load_more_widget_html') or feed_html m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) feed_entries.extend( @@ -1415,18 +1405,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for video_id in ids) mobj = re.search( r'data-uix-load-more-href="/?[^"]+paging=(?P\d+)', - feed_html) + load_more_widget_html) if mobj is None: break paging = mobj.group('paging') return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' - _FEED_NAME = 'subscriptions' - _PLAYLIST_TITLE = u'Youtube Subscriptions' - class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' @@ -1459,6 +1443,43 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): return self.url_result(playlist_id, 'YoutubePlaylist') +class YoutubeSubscriptionsIE(YoutubePlaylistIE): + IE_NAME = u'youtube:subscriptions' + IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' + + def _real_extract(self, url): + title = u'Youtube Subscriptions' + page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title) + + # The extraction process is the same as for playlists, but the regex + # for the video ids doesn't contain an index + ids = [] + more_widget_html = content_html = page + + for page_num in itertools.count(1): + matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html) + new_ids = orderedSet(matches) + ids.extend(new_ids) + + mobj = re.search(r'data-uix-load-more-href="/?(?P[^"]+)"', more_widget_html) + if not mobj: + break + + more = self._download_json( + 'https://youtube.com/%s' % mobj.group('more'), title, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) + content_html = more['content_html'] + more_widget_html = more['load_more_widget_html'] + + return { + '_type': 'playlist', + 'title': title, + 'entries': self._ids_to_results(ids), + } + + class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list