X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=youtube-dl;a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=5e93b53294706ad3f5e766deff724faf37993aeb;hp=117a57911eb1a0b2cb8c277820597f9e6934c092;hb=2f483bc1c389709623117079439708783122b5ec;hpb=9a6628aaf918afdcdf4c661f474e318207155780 diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 117a57911..5e93b5329 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -41,12 +41,14 @@ from ..utils import ( remove_quotes, remove_start, smuggle_url, + str_or_none, str_to_int, try_get, unescapeHTML, unified_strdate, unsmuggle_url, uppercase_escape, + url_or_none, urlencode_postdata, ) @@ -64,7 +66,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' def _set_language(self): self._set_cookie( @@ -259,7 +261,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True def _download_webpage_handle(self, *args, **kwargs): - kwargs.setdefault('query', {})['disable_polymer'] = 'true' + query = kwargs.get('query', {}).copy() + query['disable_polymer'] = 'true' + kwargs['query'] = query return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle( *args, **compat_kwargs(kwargs)) @@ -347,6 +351,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + (?:www\.)?invidio\.us/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -490,12 +495,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/phihag', + 'channel_id': 'UCLqxVugv74EIW3VWh2NOa3Q', + 'channel_url': r're:https?://(?:www\.)?youtube\.com/channel/UCLqxVugv74EIW3VWh2NOa3Q', 'upload_date': '20121002', 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -578,6 +586,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], 'duration': 10, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -1064,6 +1073,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', 'only_matching': True, }, + { + 'url': 'https://invidio.us/watch?v=BaW_jenozKc', + 'only_matching': True, + }, + { + # DRM protected + 'url': 'https://www.youtube.com/watch?v=s7_qI6_mIXc', + 'only_matching': True, + } ] def __init__(self, *args, **kwargs): @@ -1092,7 +1110,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1178,7 +1196,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\.sig\|\|(?P[a-zA-Z0-9$]+)\('), + r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) @@ -1371,8 +1392,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} - def _mark_watched(self, video_id, video_info): - playback_url = video_info.get('videostats_playback_base_url', [None])[0] + def _mark_watched(self, video_id, video_info, player_response): + playback_url = url_or_none(try_get( + player_response, + lambda x: x['playbackTracking']['videostatsPlaybackUrl']['baseUrl']) or try_get( + video_info, lambda x: x['videostats_playback_base_url'][0])) if not playback_url: return parsed_playback_url = compat_urlparse.urlparse(playback_url) @@ -1521,12 +1545,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + def add_dash_mpd_pr(pl_response): + dash_mpd = url_or_none(try_get( + pl_response, lambda x: x['streamingData']['dashManifestUrl'], + compat_str)) + if dash_mpd and dash_mpd not in dash_mpds: + dash_mpds.append(dash_mpd) + is_live = None view_count = None def extract_view_count(v_info): return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + player_response = {} + # Get video info embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: @@ -1569,7 +1602,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if args.get('livestream') == '1' or args.get('live_playback') == 1: is_live = True sts = ytplayer_config.get('sts') + if not player_response: + pl_response = str_or_none(args.get('player_response')) + if pl_response: + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + player_response = pl_response if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): + add_dash_mpd_pr(player_response) # We also try looking in get_video_info since it may contain different dashmpd # URL that points to a DASH manifest with possibly different itag set (some itags # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH @@ -1597,6 +1637,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not video_info_webpage: continue get_video_info = compat_parse_qs(video_info_webpage) + if not player_response: + pl_response = get_video_info.get('player_response', [None])[0] + if isinstance(pl_response, dict): + player_response = pl_response + add_dash_mpd_pr(player_response) add_dash_mpd(get_video_info) if view_count is None: view_count = extract_view_count(get_video_info) @@ -1642,9 +1687,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + if video_info.get('license_info'): + raise ExtractorError('This video is DRM protected.', expected=True) + + video_details = try_get( + player_response, lambda x: x['videoDetails'], dict) or {} + # title if 'title' in video_info: video_title = video_info['title'][0] + elif 'title' in player_response: + video_title = video_details['title'] else: self._downloader.report_warning('Unable to extract video title') video_title = '_' @@ -1680,33 +1733,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: video_description = '' - if 'multifeed_metadata_list' in video_info and not smuggled_data.get('force_singlefeed', False): + if not smuggled_data.get('force_singlefeed', False): if not self._downloader.params.get('noplaylist'): - entries = [] - feed_ids = [] - multifeed_metadata_list = video_info['multifeed_metadata_list'][0] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/rg3/youtube-dl/issues/8536) - feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': '%s (%s)' % (video_title, feed_data['title'][0]), - }) - feed_ids.append(feed_data['id'][0]) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result(entries, video_id, video_title, video_description) - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + multifeed_metadata_list = try_get( + player_response, + lambda x: x['multicamera']['playerLegacyMulticameraRenderer']['metadataList'], + compat_str) or try_get( + video_info, lambda x: x['multifeed_metadata_list'][0], compat_str) + if multifeed_metadata_list: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/rg3/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%s://www.youtube.com/watch?v=%s' % (proto, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': '%s (%s)' % (video_title, feed_data['title'][0]), + }) + feed_ids.append(feed_data['id'][0]) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result(entries, video_id, video_title, video_description) + else: + self.to_screen('Downloading just video %s because of --no-playlist' % video_id) if view_count is None: view_count = extract_view_count(video_info) + if view_count is None and video_details: + view_count = int_or_none(video_details.get('viewCount')) # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: @@ -1742,11 +1803,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'height': int_or_none(width_height[1]), } q = qualities(['small', 'medium', 'hd720']) + streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) + if streaming_formats: + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: + continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + 'quality': q(quality), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } formats = [] for url_data_str in encoded_url_map.split(','): url_data = compat_parse_qs(url_data_str) if 'itag' not in url_data or 'url' not in url_data: continue + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) + # Unsupported FORMAT_STREAM_TYPE_OTF + if stream_type == 3: + continue format_id = url_data['itag'][0] url = url_data['url'][0] @@ -1790,7 +1874,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1824,7 +1908,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) - quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0] + quality = url_data.get('quality', [None])[0] more_fields = { 'filesize': filesize, @@ -1832,7 +1916,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': width, 'height': height, 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': quality, + 'format_note': url_data.get('quality_label', [None])[0] or quality, 'quality': q(quality), } for key, value in more_fields.items(): @@ -1860,34 +1944,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'http_chunk_size': 10485760, } formats.append(dct) - elif video_info.get('hlsvp'): - manifest_url = video_info['hlsvp'][0] - formats = [] - m3u8_formats = self._extract_m3u8_formats( - manifest_url, video_id, 'mp4', fatal=False) - for a_format in m3u8_formats: - itag = self._search_regex( - r'/itag/(\d+)/', a_format['url'], 'itag', default=None) - if itag: - a_format['format_id'] = itag - if itag in self._formats: - dct = self._formats[itag].copy() - dct.update(a_format) - a_format = dct - a_format['player_url'] = player_url - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' - formats.append(a_format) else: - error_message = clean_html(video_info.get('reason', [None])[0]) - if not error_message: - error_message = extract_unavailable_message() - if error_message: - raise ExtractorError(error_message, expected=True) - raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info') + manifest_url = ( + url_or_none(try_get( + player_response, + lambda x: x['streamingData']['hlsManifestUrl'], + compat_str)) or + url_or_none(try_get( + video_info, lambda x: x['hlsvp'][0], compat_str))) + if manifest_url: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming + a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) + else: + error_message = clean_html(video_info.get('reason', [None])[0]) + if not error_message: + error_message = extract_unavailable_message() + if error_message: + raise ExtractorError(error_message, expected=True) + raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') # uploader - video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str) + video_uploader = try_get( + video_info, lambda x: x['author'][0], + compat_str) or str_or_none(video_details.get('author')) if video_uploader: video_uploader = compat_urllib_parse_unquote_plus(video_uploader) else: @@ -1905,6 +1998,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') + channel_id = self._html_search_meta( + 'channelId', video_webpage, 'channel id') + channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None + # thumbnail image # We try first to get a high quality image: m_thumb = re.search(r'', @@ -1966,7 +2063,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', video_webpage) if m_episode: - series = m_episode.group('series') + series = unescapeHTML(m_episode.group('series')) season_number = int(m_episode.group('season')) episode_number = int(m_episode.group('episode')) else: @@ -1996,12 +2093,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): like_count = _extract_count('like') dislike_count = _extract_count('dislike') + if view_count is None: + view_count = str_to_int(self._search_regex( + r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, + 'view count', default=None)) + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) video_duration = try_get( video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = int_or_none(video_details.get('lengthSeconds')) if not video_duration: video_duration = parse_duration(self._html_search_meta( 'duration', video_webpage, 'video duration')) @@ -2069,13 +2173,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._sort_formats(formats) - self.mark_watched(video_id, video_info) + self.mark_watched(video_id, video_info, player_response) return { 'id': video_id, 'uploader': video_uploader, 'uploader_id': video_uploader_id, 'uploader_url': video_uploader_url, + 'channel_id': channel_id, + 'channel_url': channel_url, 'upload_date': upload_date, 'license': video_license, 'creator': video_creator or artist, @@ -2114,7 +2220,11 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:https?://)? (?:\w+\.)? (?: - youtube\.com/ + (?: + youtube\.com| + invidio\.us + ) + / (?: (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= @@ -2123,7 +2233,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) @@ -2227,6 +2337,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', 'categories': ['People & Blogs'], 'tags': list, + 'view_count': int, 'like_count': int, 'dislike_count': int, }, @@ -2261,6 +2372,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', 'only_matching': True, + }, { + # music album playlist + 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', + 'only_matching': True, + }, { + 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', + 'only_matching': True, }] def _real_initialize(self): @@ -2403,7 +2521,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2424,6 +2542,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', }, + }, { + 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', + 'only_matching': True, }] @classmethod