X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=05eea0e4e5196ddc60e79fa703045c8320641941;hb=666d808e7096e782aa6e4ff456120cb91d868120;hp=4002dcfdd79eda96562e186a43027877054c9bde;hpb=f8c55c6664e0d279ed01702b2af2ba5ee290ee4b;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4002dcfdd..05eea0e4e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -27,9 +27,11 @@ from ..compat import ( compat_str, ) from ..utils import ( + bool_or_none, clean_html, dict_get, error_to_compat_str, + extract_attributes, ExtractorError, float_or_none, get_element_by_attribute, @@ -39,7 +41,6 @@ from ..utils import ( orderedSet, parse_codecs, parse_duration, - qualities, remove_quotes, remove_start, smuggle_url, @@ -116,6 +117,8 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'f.req': json.dumps(f_req), 'flowName': 'GlifWebSignIn', 'flowEntry': 'ServiceLogin', + # TODO: reverse actual botguard identifier generation algo + 'bgRequest': '["identifier",""]', }) return self._download_json( url, None, note=note, errnote=errnote, @@ -321,17 +324,18 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): for video_id, video_title in self.extract_videos_from_page(content): yield self.url_result(video_id, 'Youtube', video_id, video_title) - def extract_videos_from_page(self, page): - ids_in_page = [] - titles_in_page = [] - for mobj in re.finditer(self._VIDEO_RE, page): + def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page): + for mobj in re.finditer(video_re, page): # The link with index 0 is not the first video of the playlist (not sure if still actual) if 'index' in mobj.groupdict() and mobj.group('id') == '0': continue video_id = mobj.group('id') - video_title = unescapeHTML(mobj.group('title')) + video_title = unescapeHTML( + mobj.group('title')) if 'title' in mobj.groupdict() else None if video_title: video_title = video_title.strip() + if video_title == '► Play all': + video_title = None try: idx = ids_in_page.index(video_id) if video_title and not titles_in_page[idx]: @@ -339,6 +343,12 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): except ValueError: ids_in_page.append(video_id) titles_in_page.append(video_title) + + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + self.extract_videos_from_page_impl( + self._VIDEO_RE, page, ids_in_page, titles_in_page) return zip(ids_in_page, titles_in_page) @@ -368,11 +378,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| + # Invidious instances taken from https://github.com/omarroth/invidious/wiki/Invidious-Instances (?:(?:www|dev)\.)?invidio\.us/| - (?:www\.)?invidiou\.sh/| - (?:www\.)?invidious\.snopyta\.org/| + (?:(?:www|no)\.)?invidiou\.sh/| + (?:(?:www|fi|de)\.)?invidious\.snopyta\.org/| (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?invidious\.enkirton\.net/| + (?:www\.)?invidious\.13ad\.de/| + (?:www\.)?invidious\.mastodon\.host/| + (?:www\.)?invidious\.nixnet\.xyz/| + (?:www\.)?invidious\.drycat\.fr/| + (?:www\.)?tube\.poal\.co/| (?:www\.)?vid\.wxzm\.sx/| + (?:www\.)?yt\.elukerio\.org/| + (?:www\.)?kgg2m7yk5aybusll\.onion/| + (?:www\.)?qklhadlycap4cnod\.onion/| + (?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion/| + (?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion/| + (?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion/| + (?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -500,6 +524,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, + + # av01 video only formats sometimes served with "unknown" codecs + '394': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '395': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, + '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'}, } _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') @@ -1306,11 +1336,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', + # Obsolete patterns + r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*a\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) @@ -1574,10 +1611,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_annotations(self, video_id): - url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id - return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') - @staticmethod def _extract_chapters(description, duration): if not description: @@ -1672,6 +1705,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def extract_token(v_info): return dict_get(v_info, ('account_playback_token', 'accountPlaybackToken', 'token')) + def extract_player_response(player_response, video_id): + pl_response = str_or_none(player_response) + if not pl_response: + return + pl_response = self._parse_json(pl_response, video_id, fatal=False) + if isinstance(pl_response, dict): + add_dash_mpd_pr(pl_response) + return pl_response + player_response = {} # Get video info @@ -1694,7 +1736,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): note='Refetching age-gated info webpage', errnote='unable to download video info webpage') video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False video_info = None @@ -1717,11 +1762,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_live = True sts = ytplayer_config.get('sts') if not player_response: - pl_response = str_or_none(args.get('player_response')) - if pl_response: - pl_response = self._parse_json(pl_response, video_id, fatal=False) - if isinstance(pl_response, dict): - player_response = pl_response + player_response = extract_player_response(args.get('player_response'), video_id) if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True): add_dash_mpd_pr(player_response) # We also try looking in get_video_info since it may contain different dashmpd @@ -1753,9 +1794,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): get_video_info = compat_parse_qs(video_info_webpage) if not player_response: pl_response = get_video_info.get('player_response', [None])[0] - if isinstance(pl_response, dict): - player_response = pl_response - add_dash_mpd_pr(player_response) + player_response = extract_player_response(pl_response, video_id) add_dash_mpd(get_video_info) if view_count is None: view_count = extract_view_count(get_video_info) @@ -1778,9 +1817,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break def extract_unavailable_message(): - return self._html_search_regex( - r'(?s)]+id="unavailable-message"[^>]*>(.+?)', - video_webpage, 'unavailable message', default=None) + messages = [] + for tag, kind in (('h1', 'message'), ('div', 'submessage')): + msg = self._html_search_regex( + r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)'.format(tag=tag, kind=kind), + video_webpage, 'unavailable %s' % kind, default=None) + if msg: + messages.append(msg) + if messages: + return '\n'.join(messages) if not video_info: unavailable_message = extract_unavailable_message() @@ -1789,22 +1834,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) - if video_info.get('license_info'): - raise ExtractorError('This video is DRM protected.', expected=True) - video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} - # title - if 'title' in video_info: - video_title = video_info['title'][0] - elif 'title' in player_response: - video_title = video_details['title'] - else: + video_title = video_info.get('title', [None])[0] or video_details.get('title') + if not video_title: self._downloader.report_warning('Unable to extract video title') video_title = '_' - # description description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1829,11 +1866,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ''', replace_url, video_description) video_description = clean_html(video_description) else: - fd_mobj = re.search(r'= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): + elif not is_live and (streaming_formats or len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) + formats = [] formats_spec = {} fmt_list = video_info.get('fmt_list', [''])[0] if fmt_list: @@ -1904,90 +1944,104 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'width': int_or_none(width_height[0]), 'height': int_or_none(width_height[1]), } - q = qualities(['small', 'medium', 'hd720']) - streaming_formats = try_get(player_response, lambda x: x['streamingData']['formats'], list) - if streaming_formats: - for fmt in streaming_formats: - itag = str_or_none(fmt.get('itag')) - if not itag: - continue - quality = fmt.get('quality') - quality_label = fmt.get('qualityLabel') or quality - formats_spec[itag] = { - 'asr': int_or_none(fmt.get('audioSampleRate')), - 'filesize': int_or_none(fmt.get('contentLength')), - 'format_note': quality_label, - 'fps': int_or_none(fmt.get('fps')), - 'height': int_or_none(fmt.get('height')), - 'quality': q(quality), - # bitrate for itag 43 is always 2147483647 - 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, - 'width': int_or_none(fmt.get('width')), - } - formats = [] - for url_data_str in encoded_url_map.split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' not in url_data or 'url' not in url_data: + for fmt in streaming_formats: + itag = str_or_none(fmt.get('itag')) + if not itag: continue + quality = fmt.get('quality') + quality_label = fmt.get('qualityLabel') or quality + formats_spec[itag] = { + 'asr': int_or_none(fmt.get('audioSampleRate')), + 'filesize': int_or_none(fmt.get('contentLength')), + 'format_note': quality_label, + 'fps': int_or_none(fmt.get('fps')), + 'height': int_or_none(fmt.get('height')), + # bitrate for itag 43 is always 2147483647 + 'tbr': float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) if itag != '43' else None, + 'width': int_or_none(fmt.get('width')), + } + + for fmt in streaming_formats: + if fmt.get('drm_families'): + continue + url = url_or_none(fmt.get('url')) + + if not url: + cipher = fmt.get('cipher') + if not cipher: + continue + url_data = compat_parse_qs(cipher) + url = url_or_none(try_get(url_data, lambda x: x['url'][0], compat_str)) + if not url: + continue + else: + cipher = None + url_data = compat_parse_qs(compat_urllib_parse_urlparse(url).query) + stream_type = int_or_none(try_get(url_data, lambda x: x['stream_type'][0])) # Unsupported FORMAT_STREAM_TYPE_OTF if stream_type == 3: continue - format_id = url_data['itag'][0] - url = url_data['url'][0] - - if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( - ASSETS_RE, - embed_webpage if age_gate else video_webpage, - 'JS player URL (1)', default=None) - if not jsplayer_url_json and not age_gate: - # We need the embed website after all - if embed_webpage is None: - embed_url = proto + '://www.youtube.com/embed/%s' % video_id - embed_webpage = self._download_webpage( - embed_url, video_id, 'Downloading embed webpage') - jsplayer_url_json = self._search_regex( - ASSETS_RE, embed_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) - if player_url is None: - player_url_json = self._search_regex( - r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', - video_webpage, 'age gate player URL') - player_url = json.loads(player_url_json) - - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] + format_id = fmt.get('itag') or url_data['itag'][0] + if not format_id: + continue + format_id = compat_str(format_id) - if self._downloader.params.get('verbose'): + if cipher: + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' + jsplayer_url_json = self._search_regex( + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') + jsplayer_url_json = self._search_regex( + ASSETS_RE, embed_webpage, 'JS player URL') + + player_url = json.loads(jsplayer_url_json) if player_url is None: - player_version = 'unknown' - player_desc = 'unknown' - else: - if player_url.endswith('swf'): - player_version = self._search_regex( - r'-(.+?)(?:/watch_as3)?\.swf$', player_url, - 'flash player', fatal=False) - player_desc = 'flash player %s' % player_version + player_url_json = self._search_regex( + r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', + video_webpage, 'age gate player URL') + player_url = json.loads(player_url_json) + + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + + if self._downloader.params.get('verbose'): + if player_url is None: + player_version = 'unknown' + player_desc = 'unknown' else: - player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], - player_url, - 'html5 player', fatal=False) - player_desc = 'html5 player %s' % player_version - - parts_sizes = self._signature_cache_id(encrypted_sig) - self.to_screen('{%s} signature length %s, %s' % - (format_id, parts_sizes, player_desc)) - - signature = self._decrypt_signature( - encrypted_sig, video_id, player_url, age_gate) - url += '&signature=' + signature + if player_url.endswith('swf'): + player_version = self._search_regex( + r'-(.+?)(?:/watch_as3)?\.swf$', player_url, + 'flash player', fatal=False) + player_desc = 'flash player %s' % player_version + else: + player_version = self._search_regex( + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + player_url, + 'html5 player', fatal=False) + player_desc = 'html5 player %s' % player_version + + parts_sizes = self._signature_cache_id(encrypted_sig) + self.to_screen('{%s} signature length %s, %s' % + (format_id, parts_sizes, player_desc)) + + signature = self._decrypt_signature( + encrypted_sig, video_id, player_url, age_gate) + sp = try_get(url_data, lambda x: x['sp'][0], compat_str) or 'signature' + url += '&%s=%s' % (sp, signature) if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -2007,24 +2061,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) + if width is None: + width = int_or_none(fmt.get('width')) + if height is None: + height = int_or_none(fmt.get('height')) + filesize = int_or_none(url_data.get( 'clen', [None])[0]) or _extract_filesize(url) - quality = url_data.get('quality', [None])[0] + quality = url_data.get('quality', [None])[0] or fmt.get('quality') + quality_label = url_data.get('quality_label', [None])[0] or fmt.get('qualityLabel') + + tbr = (float_or_none(url_data.get('bitrate', [None])[0], 1000) + or float_or_none(fmt.get('bitrate'), 1000)) if format_id != '43' else None + fps = int_or_none(url_data.get('fps', [None])[0]) or int_or_none(fmt.get('fps')) more_fields = { 'filesize': filesize, - 'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000), + 'tbr': tbr, 'width': width, 'height': height, - 'fps': int_or_none(url_data.get('fps', [None])[0]), - 'format_note': url_data.get('quality_label', [None])[0] or quality, - 'quality': q(quality), + 'fps': fps, + 'format_note': quality_label or quality, } for key, value in more_fields.items(): if value: dct[key] = value - type_ = url_data.get('type', [None])[0] + type_ = url_data.get('type', [None])[0] or fmt.get('mimeType') if type_: type_split = type_.split(';') kind_ext = type_split[0].split('/') @@ -2051,8 +2114,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): url_or_none(try_get( player_response, lambda x: x['streamingData']['hlsManifestUrl'], - compat_str)) or - url_or_none(try_get( + compat_str)) + or url_or_none(try_get( video_info, lambda x: x['hlsvp'][0], compat_str))) if manifest_url: formats = [] @@ -2072,9 +2135,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' formats.append(a_format) else: - error_message = clean_html(video_info.get('reason', [None])[0]) + error_message = extract_unavailable_message() if not error_message: - error_message = extract_unavailable_message() + error_message = clean_html(try_get( + player_response, lambda x: x['playabilityStatus']['reason'], + compat_str)) + if not error_message: + error_message = clean_html( + try_get(video_info, lambda x: x['reason'][0], compat_str)) if error_message: raise ExtractorError(error_message, expected=True) raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info') @@ -2100,8 +2168,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: self._downloader.report_warning('unable to extract uploader nickname') - channel_id = self._html_search_meta( - 'channelId', video_webpage, 'channel id') + channel_id = ( + str_or_none(video_details.get('channelId')) + or self._html_search_meta( + 'channelId', video_webpage, 'channel id', default=None) + or self._search_regex( + r'data-channel-external-id=(["\'])(?P(?:(?!\1).)+)\1', + video_webpage, 'channel id', default=None, group='id')) channel_url = 'http://www.youtube.com/channel/%s' % channel_id if channel_id else None # thumbnail image @@ -2221,6 +2294,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'<[^>]+class=["\']watch-view-count[^>]+>\s*([\d,\s]+)', video_webpage, 'view count', default=None)) + average_rating = ( + float_or_none(video_details.get('averageRating')) + or try_get(video_info, lambda x: float_or_none(x['avg_rating'][0]))) + # subtitles video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) @@ -2236,7 +2313,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # annotations video_annotations = None if self._downloader.params.get('writeannotations', False): - video_annotations = self._extract_annotations(video_id) + xsrf_token = self._search_regex( + r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P[A-Za-z0-9+/=]+)\2', + video_webpage, 'xsrf token', group='xsrf_token', fatal=False) + invideo_url = try_get( + player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str) + if xsrf_token and invideo_url: + xsrf_field_name = self._search_regex( + r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P\w+)\2', + video_webpage, 'xsrf field name', + group='xsrf_field_name', default='session_token') + video_annotations = self._download_webpage( + self._proto_relative_url(invideo_url), + video_id, note='Downloading annotations', + errnote='Unable to download video annotations', fatal=False, + data=urlencode_postdata({xsrf_field_name: xsrf_token})) chapters = self._extract_chapters(description_original, video_duration) @@ -2317,6 +2408,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '"token" parameter not in video info for unknown reason', video_id=video_id) + if not formats and (video_info.get('license_info') or try_get(player_response, lambda x: x['streamingData']['licenseInfos'])): + raise ExtractorError('This video is DRM protected.', expected=True) + self._sort_formats(formats) self.mark_watched(video_id, video_info, player_response) @@ -2347,7 +2441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, - 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]), + 'average_rating': average_rating, 'formats': formats, 'is_live': is_live, 'start_time': start_time, @@ -2391,7 +2485,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' - _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' + _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' + _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', @@ -2414,6 +2509,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': '29C3: Not my department', 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'ChRiStIaAn008', }, 'playlist_count': 95, }, { @@ -2422,6 +2519,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', 'id': 'PLBB231211A4F62143', + 'uploader': 'Wickydoo', + 'uploader_id': 'Wickydoo', }, 'playlist_mincount': 26, }, { @@ -2430,6 +2529,8 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': 'Uploads from Cauchemar', 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'Cauchemar89', }, 'playlist_mincount': 799, }, { @@ -2447,13 +2548,17 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', + 'uploader': 'milan', + 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', } }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', 'playlist_mincount': 485, 'info_dict': { - 'title': '2017 華語最新單曲 (2/24更新)', + 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'uploader': 'LBK', + 'uploader_id': 'sdragonfang', } }, { 'note': 'Embedded SWF player', @@ -2462,13 +2567,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'title': 'JODA7', 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', - } + }, + 'skip': 'This playlist does not exist', }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', 'info_dict': { 'title': 'Uploads from Interstellar Movie', 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'InterstellarMovie1', }, 'playlist_mincount': 21, }, { @@ -2493,6 +2601,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is not available.', 'add_ie': [YoutubeIE.ie_key()], }, { 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', @@ -2504,7 +2613,6 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader_id': 'backuspagemuseum', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', 'upload_date': '20161008', - 'license': 'Standard YouTube License', 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', 'categories': ['Nonprofits & Activism'], 'tags': list, @@ -2515,6 +2623,16 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'noplaylist': True, 'skip_download': True, }, + }, { + # https://github.com/ytdl-org/youtube-dl/issues/21844 + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'Computerphile', + 'uploader': 'Computerphile', + }, + 'playlist_mincount': 11, }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, @@ -2533,6 +2651,34 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): def _real_initialize(self): self._login() + def extract_videos_from_page(self, page): + ids_in_page = [] + titles_in_page = [] + + for item in re.findall( + r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): + attrs = extract_attributes(item) + video_id = attrs['data-video-id'] + video_title = unescapeHTML(attrs.get('data-title')) + if video_title: + video_title = video_title.strip() + ids_in_page.append(video_id) + titles_in_page.append(video_title) + + # Fallback with old _VIDEO_RE + self.extract_videos_from_page_impl( + self._VIDEO_RE, page, ids_in_page, titles_in_page) + + # Relaxed fallbacks + self.extract_videos_from_page_impl( + r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, + ids_in_page, titles_in_page) + self.extract_videos_from_page_impl( + r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, + ids_in_page, titles_in_page) + + return zip(ids_in_page, titles_in_page) + def _extract_mix(self, playlist_id): # The mixes are generated from a single video # the id of the playlist is just 'RD' + video_id @@ -2558,9 +2704,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) title_span = ( - search_title('playlist-title') or - search_title('title long-title') or - search_title('title')) + search_title('playlist-title') + or search_title('title long-title') + or search_title('title')) title = clean_html(title_span) return self.playlist_result(url_results, playlist_id, title) @@ -2595,7 +2741,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): page, 'title', default=None) _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' - uploader = self._search_regex( + uploader = self._html_search_regex( r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, page, 'uploader', default=None) mobj = re.search( @@ -2681,6 +2827,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', 'title': 'Uploads from lex will', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', } }, { 'note': 'Age restricted channel', @@ -2690,6 +2838,8 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): 'info_dict': { 'id': 'UUs0ifCMCm1icqRbqhUINa0w', 'title': 'Uploads from Deus Ex', + 'uploader': 'Deus Ex', + 'uploader_id': 'DeusExOfficial', }, }, { 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', @@ -2774,6 +2924,8 @@ class YoutubeUserIE(YoutubeChannelIE): 'info_dict': { 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', 'title': 'Uploads from The Linux Foundation', + 'uploader': 'The Linux Foundation', + 'uploader_id': 'TheLinuxFoundation', } }, { # Only available via https://www.youtube.com/c/12minuteathlete/videos @@ -2783,6 +2935,8 @@ class YoutubeUserIE(YoutubeChannelIE): 'info_dict': { 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', 'title': 'Uploads from 12 Minute Athlete', + 'uploader': '12 Minute Athlete', + 'uploader_id': 'the12minuteathlete', } }, { 'url': 'ytuser:phihag', @@ -2876,7 +3030,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'playlist_mincount': 4, 'info_dict': { 'id': 'ThirstForScience', - 'title': 'Thirst for Science', + 'title': 'ThirstForScience', }, }, { # with "Load more" button @@ -2893,6 +3047,7 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): 'id': 'UCiU1dHvZObB2iP6xkJ__Icw', 'title': 'Chem Player', }, + 'skip': 'Blocked', }]