X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=765b4a9bf769926350350e83d680d35c1e4d2775;hb=f8f60d27931421f969c7ec0a2a45caa743549994;hp=4c43d57394e430b59f67fcc1852152a020acdcc5;hpb=6e47b51eef26dbaa3634b73914e4ee7213ad38f7;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4c43d5739..765b4a9bf 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -11,7 +11,6 @@ import socket import string import struct import traceback -import xml.etree.ElementTree import zlib from .common import InfoExtractor, SearchInfoExtractor @@ -29,6 +28,7 @@ from ..utils import ( clean_html, get_cachedir, get_element_by_id, + get_element_by_attribute, ExtractorError, unescapeHTML, unified_strdate, @@ -1144,8 +1144,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'asrs': 1, }) list_url = caption_url + '&' + list_params - list_page = self._download_webpage(list_url, video_id) - caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) + caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' : self._downloader.report_warning(u'Video doesn\'t have automatic captions') @@ -1539,6 +1538,24 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): def _real_initialize(self): self._login() + def _ids_to_results(self, ids): + return [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + + def _extract_mix(self, playlist_id): + # The mixes are generated from a a single video + # the id of the playlist is just 'RD' + video_id + url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[2:], playlist_id) + webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix') + title_span = (get_element_by_attribute('class', 'title long-title', webpage) or + get_element_by_attribute('class', 'title ', webpage)) + title = clean_html(title_span) + video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id) + ids = orderedSet(re.findall(video_re, webpage)) + url_results = self._ids_to_results(ids) + + return self.playlist_result(url_results, playlist_id, title) + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1556,6 +1573,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + if len(playlist_id) == 13: # 'RD' + 11 characters for the video id + # Mixes require a custom extraction process + return self._extract_mix(playlist_id) + # Extract the video ids from the playlist pages ids = [] @@ -1573,8 +1594,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): playlist_title = self._og_search_title(page) - url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) - for vid_id in ids] + url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) @@ -1771,7 +1791,6 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ _LOGIN_REQUIRED = True - _PAGING_STEP = 30 # use action_load_personal_feed instead of action_load_system_feed _PERSONAL_FEED = False @@ -1791,9 +1810,8 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): def _real_extract(self, url): feed_entries = [] - # The step argument is available only in 2.7 or higher - for i in itertools.count(0): - paging = i*self._PAGING_STEP + paging = 0 + for i in itertools.count(1): info = self._download_webpage(self._FEED_TEMPLATE % paging, u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) @@ -1806,6 +1824,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): for video_id in ids) if info['paging'] is None: break + paging = info['paging'] return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): @@ -1825,7 +1844,6 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' _FEED_NAME = 'watch_later' _PLAYLIST_TITLE = u'Youtube Watch Later' - _PAGING_STEP = 100 _PERSONAL_FEED = True class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): @@ -1835,13 +1853,6 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): _PERSONAL_FEED = True _PLAYLIST_TITLE = u'Youtube Watch History' - def _real_extract(self, url): - webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') - data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') - # The step is actually a ridiculously big number (like 1374343569725646) - self._PAGING_STEP = int(data_paging) - return super(YoutubeHistoryIE, self)._real_extract(url) - class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'