X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=1bf9cb7d4a73260325f5b8e175b6e57fe87c0c35;hb=ea36cbac5e089d5e37a2f92ea58375a5883d0af2;hp=6ddd6ef06791d52de1245dc1d1744f32be4d96f5;hpb=9103bbc5cd11957de2e906e4401dcf4df9511d28;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 6ddd6ef06..1bf9cb7d4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -139,10 +139,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' - _VALID_URL = r"""^ + _VALID_URL = r"""(?x)^ ( - (?:https?://)? # http(s):// (optional) - (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| + (?:https?://|//)? # http(s):// or protocol-independent URL (optional) + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls @@ -248,21 +248,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '248': 'webm', } _video_dimensions = { - '5': '240x400', + '5': '400x240', '6': '???', '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '36': '240x320', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', + '17': '176x144', + '18': '640x360', + '22': '1280x720', + '34': '640x360', + '35': '854x480', + '36': '320x240', + '37': '1920x1080', + '38': '4096x3072', + '43': '640x360', + '44': '854x480', + '45': '1280x720', + '46': '1920x1080', '82': '360p', '83': '480p', '84': '720p', @@ -363,6 +363,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader_id": u"justintimberlakeVEVO" } }, + { + u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ", + u"file": u"yZIXLfi8CZQ.mp4", + u"note": u"Embed-only video (#1746)", + u"info_dict": { + u"upload_date": u"20120608", + u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012", + u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7", + u"uploader": u"SET India", + u"uploader_id": u"setindia" + } + }, ] @@ -370,7 +382,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" if YoutubePlaylistIE.suitable(url): return False - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + return re.match(cls._VALID_URL, url) is not None def __init__(self, *args, **kwargs): super(YoutubeIE, self).__init__(*args, **kwargs) @@ -1019,6 +1031,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): """Turn the encrypted s field into a working signature""" if player_url is not None: + if player_url.startswith(u'//'): + player_url = u'https:' + player_url try: player_id = (player_url, len(s)) if player_id not in self._player_cache: @@ -1082,7 +1096,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s))) - def _get_available_subtitles(self, video_id): + def _get_available_subtitles(self, video_id, webpage): try: sub_list = self._download_webpage( 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, @@ -1098,7 +1112,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, - 'fmt': self._downloader.params.get('subtitlesformat'), + 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), 'name': l[0].encode('utf-8'), }) url = u'http://www.youtube.com/api/timedtext?' + params @@ -1111,7 +1125,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _get_available_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_format = self._downloader.params.get('subtitlesformat') + sub_format = self._downloader.params.get('subtitlesformat', 'srt') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) err_msg = u'Couldn\'t find automatic captions for %s' % video_id @@ -1270,7 +1284,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube data = compat_urllib_parse.urlencode({'video_id': video_id, - 'el': 'embedded', + 'el': 'player_embedded', 'gl': 'US', 'hl': 'en', 'eurl': 'https://youtube.googleapis.com/v/' + video_id, @@ -1299,6 +1313,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): else: raise ExtractorError(u'"token" parameter not in video info for unknown reason') + if 'view_count' in video_info: + view_count = int(video_info['view_count'][0]) + else: + view_count = None + # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: raise ExtractorError(u'"rental" videos not supported') @@ -1487,10 +1506,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id, + 'view_count': view_count, }) return results -class YoutubePlaylistIE(InfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com playlists' _VALID_URL = r"""(?: (?:https?://)? @@ -1506,8 +1526,9 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' - _MAX_RESULTS = 50 + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' + _MORE_PAGES_INDICATOR = r'data-link-type="next"' + _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&' IE_NAME = u'youtube:playlist' @classmethod @@ -1515,6 +1536,9 @@ class YoutubePlaylistIE(InfoExtractor): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def _real_initialize(self): + self._login() + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -1528,51 +1552,33 @@ class YoutubePlaylistIE(InfoExtractor): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) - return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') + return self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - # Download playlist videos from API - videos = [] + # Extract the video ids from the playlist pages + ids = [] for page_num in itertools.count(1): - start_index = self._MAX_RESULTS * (page_num - 1) + 1 - if start_index >= 1000: - self._downloader.report_warning(u'Max number of results reached') - break - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) + url = self._TEMPLATE_URL % (playlist_id, page_num) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) + # The ids are duplicated + new_ids = orderedSet(re.findall(self._VIDEO_RE, page)) + ids.extend(new_ids) - try: - response = json.loads(page) - except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) - - if 'feed' not in response: - raise ExtractorError(u'Got a malformed response from YouTube API') - playlist_title = response['feed']['title']['$t'] - if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS + if re.search(self._MORE_PAGES_INDICATOR, page) is None: break - for entry in response['feed']['entry']: - index = entry['yt$position']['$t'] - if 'media$group' in entry and 'yt$videoid' in entry['media$group']: - videos.append(( - index, - 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] - )) - - videos = [v[1] for v in sorted(videos)] + playlist_title = self._og_search_title(page) - url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] - return [self.playlist_result(url_results, playlist_id, playlist_title)] + url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] + return self.playlist_result(url_results, playlist_id, playlist_title) class YoutubeChannelIE(InfoExtractor): IE_DESC = u'YouTube.com channels' _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)" - _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en' _MORE_PAGES_INDICATOR = 'yt-uix-load-more' _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s' IE_NAME = u'youtube:channel' @@ -1593,36 +1599,37 @@ class YoutubeChannelIE(InfoExtractor): # Download channel page channel_id = mobj.group(1) video_ids = [] - pagenum = 1 - - url = self._TEMPLATE_URL % (channel_id, pagenum) - page = self._download_webpage(url, channel_id, - u'Downloading page #%s' % pagenum) - - # Extract video identifiers - ids_in_page = self.extract_videos_from_page(page) - video_ids.extend(ids_in_page) + url = 'https://www.youtube.com/channel/%s/videos' % channel_id + channel_page = self._download_webpage(url, channel_id) + if re.search(r'channel-header-autogenerated-label', channel_page) is not None: + autogenerated = True + else: + autogenerated = False - # Download any subsequent channel pages using the json-based channel_ajax query - if self._MORE_PAGES_INDICATOR in page: + if autogenerated: + # The videos are contained in a single page + # the ajax pages can't be used, they are empty + video_ids = self.extract_videos_from_page(channel_page) + else: + # Download all channel pages using the json-based channel_ajax query for pagenum in itertools.count(1): url = self._MORE_PAGES_URL % (pagenum, channel_id) page = self._download_webpage(url, channel_id, u'Downloading page #%s' % pagenum) - + page = json.loads(page) - + ids_in_page = self.extract_videos_from_page(page['content_html']) video_ids.extend(ids_in_page) - - if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: + + if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']: break self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] - url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls] - return [self.playlist_result(url_entries, channel_id)] + url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_entries, channel_id) class YoutubeUserIE(InfoExtractor): @@ -1686,9 +1693,11 @@ class YoutubeUserIE(InfoExtractor): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] - url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] - return [self.playlist_result(url_results, playlist_title = username)] + url_results = [ + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_results, playlist_title=username) + class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' @@ -1729,9 +1738,14 @@ class YoutubeSearchIE(SearchInfoExtractor): if len(video_ids) > n: video_ids = video_ids[:n] - videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] + videos = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] return self.playlist_result(videos, query) +class YoutubeSearchDateIE(YoutubeSearchIE): + _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published' + _SEARCH_KEY = 'ytsearchdate' + IE_DESC = u'YouTube.com searches, newest videos first' class YoutubeShowIE(InfoExtractor): IE_DESC = u'YouTube.com (multi-season) shows' @@ -1785,7 +1799,9 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_html = info['feed_html'] m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) + feed_entries.extend( + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in ids) if info['paging'] is None: break return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) @@ -1810,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _PAGING_STEP = 100 _PERSONAL_FEED = True +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' + _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PERSONAL_FEED = True + _PLAYLIST_TITLE = u'Youtube Watch History' + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') + data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') + # The step is actually a ridiculously big number (like 1374343569725646) + self._PAGING_STEP = int(data_paging) + return super(YoutubeHistoryIE, self)._real_extract(url) + class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'