X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=f86929e06f2591f569c77b166fd7e1769831a772;hb=aedd6bb97de7ed82da746892e2a0b32ee9c6d6b8;hp=9d3f61be62bacc03ee9a415051943091a903ab6b;hpb=32a09b43820fbc27dc29c0c98f517462fabd47d6;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9d3f61be6..f86929e06 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -23,8 +23,114 @@ from ..utils import ( orderedSet, ) +class YoutubeBaseInfoExtractor(InfoExtractor): + """Provide base functions for Youtube extractors""" + _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' + _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' + _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' + _NETRC_MACHINE = 'youtube' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = False + + def report_lang(self): + """Report attempt to set language.""" + self.to_screen(u'Setting language') + + def _set_language(self): + request = compat_urllib_request.Request(self._LANG_URL) + try: + self.report_lang() + compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) + return False + return True + + def _login(self): + (username, password) = self._get_login_info() + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + return False + + request = compat_urllib_request.Request(self._LOGIN_URL) + try: + login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) + return False -class YoutubeIE(InfoExtractor): + galx = None + dsh = None + match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return False + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + return False + return True + + def _confirm_age(self): + age_form = { + 'next_url': '/', + 'action_confirm': 'Confirm', + } + request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) + try: + self.report_age_confirmation() + compat_urllib_request.urlopen(request).read().decode('utf-8') + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) + return True + + def _real_initialize(self): + if self._downloader is None: + return + if not self._set_language(): + return + if not self._login(): + return + self._confirm_age() + +class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( @@ -45,11 +151,7 @@ class YoutubeIE(InfoExtractor): ([0-9A-Za-z_-]+) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" - _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' # Listed in order of quality _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] @@ -139,10 +241,6 @@ class YoutubeIE(InfoExtractor): if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - def report_lang(self): - """Report attempt to set language.""" - self.to_screen(u'Setting language') - def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -198,7 +296,7 @@ class YoutubeIE(InfoExtractor): elif len(s) == 82: return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] elif len(s) == 81: - return s[6] + s[3:6] + s[33] + s[7:24] + s[0] + s[25:33] + s[2] + s[34:53] + s[24] + s[54:81] + return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) @@ -306,91 +404,6 @@ class YoutubeIE(InfoExtractor): for x in formats: print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - def _real_initialize(self): - if self._downloader is None: - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) - return - - (username, password) = self._get_login_info() - - # No authentication to be performed - if username is None: - return - - request = compat_urllib_request.Request(self._LOGIN_URL) - try: - login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) - return - - galx = None - dsh = None - match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) if mobj is None: @@ -670,10 +683,10 @@ class YoutubePlaylistIE(InfoExtractor): \? (?:.*?&)*? (?:p|a|list)= | p/ ) - ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,}) .* | - ((?:PL|EC|UU)[0-9A-Za-z-_]{10,}) + ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' _MAX_RESULTS = 50 @@ -692,11 +705,14 @@ class YoutubePlaylistIE(InfoExtractor): # Download playlist videos from API playlist_id = mobj.group(1) or mobj.group(2) - page_num = 1 videos = [] - while True: - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1) + for page_num in itertools.count(1): + start_index = self._MAX_RESULTS * (page_num - 1) + 1 + if start_index >= 1000: + self._downloader.report_warning(u'Max number of results reached') + break + url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) try: @@ -716,10 +732,6 @@ class YoutubePlaylistIE(InfoExtractor): if 'media$group' in entry and 'media$player' in entry['media$group']: videos.append((index, entry['media$group']['media$player']['url'])) - if len(response['feed']['entry']) < self._MAX_RESULTS: - break - page_num += 1 - videos = [v[1] for v in sorted(videos)] url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] @@ -762,9 +774,7 @@ class YoutubeChannelIE(InfoExtractor): # Download any subsequent channel pages using the json-based channel_ajax query if self._MORE_PAGES_INDICATOR in page: - while True: - pagenum = pagenum + 1 - + for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) page = self._download_webpage(url, channel_id, u'Downloading page #%s' % pagenum) @@ -807,9 +817,8 @@ class YoutubeUserIE(InfoExtractor): # all of them. video_ids = [] - pagenum = 0 - while True: + for pagenum in itertools.count(0): start_index = pagenum * self._GDATA_PAGE_SIZE + 1 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index) @@ -834,8 +843,6 @@ class YoutubeUserIE(InfoExtractor): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - pagenum += 1 - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] return [self.playlist_result(url_results, playlist_title = username)] @@ -898,33 +905,30 @@ class YoutubeShowIE(InfoExtractor): return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons] -class YoutubeFeedsInfoExtractor(YoutubeIE): +class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): """ Base class for extractors that fetch info from http://www.youtube.com/feed_ajax Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. """ + _LOGIN_REQUIRED = True _PAGING_STEP = 30 - - # Overwrite YoutubeIE properties we don't want - _TESTS = [] - @classmethod - def suitable(cls, url): - return re.match(cls._VALID_URL, url) is not None + # use action_load_personal_feed instead of action_load_system_feed + _PERSONAL_FEED = False @property def _FEED_TEMPLATE(self): - return 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=%s&paging=%%s' % self._FEED_NAME + action = 'action_load_system_feed' + if self._PERSONAL_FEED: + action = 'action_load_personal_feed' + return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME) @property def IE_NAME(self): return u'youtube:%s' % self._FEED_NAME def _real_initialize(self): - (username, password) = self._get_login_info() - if username is None: - raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True) - super(YoutubeFeedsInfoExtractor, self)._real_initialize() + self._login() def _real_extract(self, url): feed_entries = [] @@ -936,7 +940,7 @@ class YoutubeFeedsInfoExtractor(YoutubeIE): u'Downloading page %s' % i) info = json.loads(info) feed_html = info['feed_html'] - m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html) + m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) if info['paging'] is None: @@ -954,3 +958,22 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor): _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?' _FEED_NAME = 'recommended' _PLAYLIST_TITLE = u'Youtube Recommended videos' + +class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater' + _FEED_NAME = 'watch_later' + _PLAYLIST_TITLE = u'Youtube Watch Later' + _PAGING_STEP = 100 + _PERSONAL_FEED = True + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): + IE_NAME = u'youtube:favorites' + IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' + _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' + _LOGIN_REQUIRED = True + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') + playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id') + return self.url_result(playlist_id, 'YoutubePlaylist')