X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=438eb5aa7d371f0d0fd9c70b9c8b7d15a8d44737;hb=5caabd3c701a484271d197f7006ecf831e38136b;hp=730935657981ca23ffe981b4e4afce2bbca22078;hpb=4fe54c128a11d394874505af75aaa5a2276aa3ba;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 730935657..438eb5aa7 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -351,7 +351,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| - (?:www\.)?invidio\.us/| + (?:(?:www|dev)\.)?invidio\.us/| + (?:www\.)?invidiou\.sh/| + (?:www\.)?invidious\.snopyta\.org/| + (?:www\.)?invidious\.kabi\.tk/| + (?:www\.)?vid\.wxzm\.sx/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: @@ -427,7 +431,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/ytdl-org/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, @@ -480,7 +484,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, } - _SUBTITLE_FORMATS = ('ttml', 'vtt') + _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt') _GEO_BYPASS = False @@ -692,7 +696,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 18, }, }, - # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) + # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) # YouTube Red ad is not captured for creator { 'url': '__2ABJjxzNo', @@ -713,7 +717,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'DASH manifest missing', ] }, - # Olympics (https://github.com/rg3/youtube-dl/issues/4431) + # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431) { 'url': 'lqQg6PlCWgI', 'info_dict': { @@ -764,7 +768,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, 'skip': 'This live event has ended.', }, - # Extraction from multiple DASH manifests (https://github.com/rg3/youtube-dl/pull/6097) + # Extraction from multiple DASH manifests (https://github.com/ytdl-org/youtube-dl/pull/6097) { 'url': 'https://www.youtube.com/watch?v=FIl7x6_3R5Y', 'info_dict': { @@ -867,7 +871,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip': 'This video is not available.', }, { - # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) + # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', 'info_dict': { 'id': 'gVfLd0zydlo', @@ -885,10 +889,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, { - # Title with JS-like syntax "};" (see https://github.com/rg3/youtube-dl/issues/7468) + # Title with JS-like syntax "};" (see https://github.com/ytdl-org/youtube-dl/issues/7468) # Also tests cut-off URL expansion in video description (see - # https://github.com/rg3/youtube-dl/issues/1892, - # https://github.com/rg3/youtube-dl/issues/8164) + # https://github.com/ytdl-org/youtube-dl/issues/1892, + # https://github.com/ytdl-org/youtube-dl/issues/8164) 'url': 'https://www.youtube.com/watch?v=lsguqyKfVQg', 'info_dict': { 'id': 'lsguqyKfVQg', @@ -910,7 +914,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, { - # Tags with '};' (see https://github.com/rg3/youtube-dl/issues/7468) + # Tags with '};' (see https://github.com/ytdl-org/youtube-dl/issues/7468) 'url': 'https://www.youtube.com/watch?v=Ms7iBXnlUO8', 'only_matching': True, }, @@ -974,7 +978,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'only_matching': True, }, { - # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059) + # YouTube Red paid video (https://github.com/ytdl-org/youtube-dl/issues/10059) 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', 'only_matching': True, }, @@ -1082,7 +1086,95 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, 'youtube_include_dash_manifest': False, }, - } + }, + { + # artist and track fields should return non-null, per issue #20599 + 'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', + 'info_dict': { + 'id': 'MgNrAu2pzNs', + 'ext': 'mp4', + 'title': 'Voyeur Girl', + 'description': 'md5:7ae382a65843d6df2685993e90a8628f', + 'upload_date': '20190312', + 'uploader': 'Various Artists - Topic', + 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', + 'artist': 'Stephen', + 'track': 'Voyeur Girl', + 'album': 'it\'s too much love to know my dear', + 'release_date': '20190313', + 'release_year': 2019, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Retrieve 'artist' field from 'Artist:' in video description + # when it is present on youtube music video + # Some videos have release_date and no release_year - + # (release_year should be extracted from release_date) + # https://github.com/ytdl-org/youtube-dl/pull/20742#issuecomment-485740932 + 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', + 'info_dict': { + 'id': 'k0jLE7tTwjY', + 'ext': 'mp4', + 'title': 'Latch Feat. Sam Smith', + 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', + 'upload_date': '20150110', + 'uploader': 'Various Artists - Topic', + 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', + 'artist': 'Disclosure', + 'track': 'Latch Feat. Sam Smith', + 'album': 'Latch Featuring Sam Smith', + 'release_date': '20121008', + 'release_year': 2012, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # handle multiple artists on youtube music video + 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', + 'info_dict': { + 'id': '74qn0eJSjpA', + 'ext': 'mp4', + 'title': 'Eastside', + 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', + 'upload_date': '20180710', + 'uploader': 'Benny Blanco - Topic', + 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', + 'artist': 'benny blanco, Halsey, Khalid', + 'track': 'Eastside', + 'album': 'Eastside', + 'release_date': '20180713', + 'release_year': 2018, + }, + 'params': { + 'skip_download': True, + }, + }, + { + # handle youtube music video with release_year and no release_date + 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', + 'info_dict': { + 'id': '-hcAI0g-f5M', + 'ext': 'mp4', + 'title': 'Put It On Me', + 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', + 'upload_date': '20180426', + 'uploader': 'Matt Maeson - Topic', + 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', + 'artist': 'Matt Maeson', + 'track': 'Put It On Me', + 'album': 'The Hearse', + 'release_date': None, + 'release_year': 2018, + }, + 'params': { + 'skip_download': True, + }, + }, ] def __init__(self, *args, **kwargs): @@ -1198,8 +1290,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', r'\.sig\|\|(?P[a-zA-Z0-9$]+)\(', - r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', - r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'yt\.akamaized\.net/\)\s*\|\|\s*.*?\s*c\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?(?P[a-zA-Z0-9$]+)\(', + r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*(?:encodeURIComponent\s*\()?\s*(?P[a-zA-Z0-9$]+)\(', r'\bc\s*&&\s*d\.set\([^,]+\s*,\s*\([^)]*\)\s*\(\s*(?P[a-zA-Z0-9$]+)\('), jscode, 'Initial JS player signature function name', group='sig') @@ -1280,8 +1372,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # regex won't capture the whole JSON. Yet working around by trying more # concrete regex first keeping in mind proper quoted string handling # to be implemented in future that will replace this workaround (see - # https://github.com/rg3/youtube-dl/issues/7468, - # https://github.com/rg3/youtube-dl/pull/7599) + # https://github.com/ytdl-org/youtube-dl/issues/7468, + # https://github.com/ytdl-org/youtube-dl/pull/7599) r';ytplayer\.config\s*=\s*({.+?});ytplayer', r';ytplayer\.config\s*=\s*({.+?});', ) @@ -1596,7 +1688,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): add_dash_mpd(video_info) # Rental video is not rented but preview is available (e.g. # https://www.youtube.com/watch?v=yYr8q0y5Jfg, - # https://github.com/rg3/youtube-dl/issues/10532) + # https://github.com/ytdl-org/youtube-dl/issues/10532) if not video_info and args.get('ypc_vid'): return self.url_result( args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid']) @@ -1616,7 +1708,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # are missing from DASH manifest pointed by webpage's dashmpd, some - from DASH # manifest pointed by get_video_info's dashmpd). # The general idea is to take a union of itags of both DASH manifests (for example - # video with such 'manifest behavior' see https://github.com/rg3/youtube-dl/issues/6093) + # video with such 'manifest behavior' see https://github.com/ytdl-org/youtube-dl/issues/6093) self.report_video_info_webpage_download(video_id) for el in ('info', 'embedded', 'detailpage', 'vevo', ''): query = { @@ -1648,17 +1740,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info - if 'token' in get_video_info: + get_token = get_video_info.get('token') or get_video_info.get('account_playback_token') + if get_token: # Different get_video_info requests may report different results, e.g. # some may report video unavailability, but some may serve it without - # any complaint (see https://github.com/rg3/youtube-dl/issues/7362, + # any complaint (see https://github.com/ytdl-org/youtube-dl/issues/7362, # the original webpage as well as el=info and el=embedded get_video_info # requests report video unavailability due to geo restriction while # el=detailpage succeeds and returns valid data). This is probably # due to YouTube measures against IP ranges of hosting providers. # Working around by preferring the first succeeded video_info containing # the token if no such video_info yet was found. - if 'token' not in video_info: + token = video_info.get('token') or video_info.get('account_playback_token') + if not token: video_info = get_video_info break @@ -1667,7 +1761,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'(?s)]+id="unavailable-message"[^>]*>(.+?)', video_webpage, 'unavailable message', default=None) - if 'token' not in video_info: + if not video_info: + unavailable_message = extract_unavailable_message() + if not unavailable_message: + unavailable_message = 'Unable to extract video data' + raise ExtractorError( + 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) + + token = video_info.get('token') or video_info.get('account_playback_token') + if not token: if 'reason' in video_info: if 'The uploader has not made this video available in your country.' in video_info['reason']: regions_allowed = self._html_search_meta( @@ -1747,7 +1849,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): for feed in multifeed_metadata_list.split(','): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see - # https://github.com/rg3/youtube-dl/issues/8536) + # https://github.com/ytdl-org/youtube-dl/issues/8536) feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) entries.append({ '_type': 'url_transparent', @@ -1772,7 +1874,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported. See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) + raise ExtractorError('"rental" videos not supported. See https://github.com/ytdl-org/youtube-dl/issues/359 for more information.', expected=True) def _extract_filesize(media_url): return int_or_none(self._search_regex( @@ -1789,7 +1891,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif not is_live and (len(video_info.get('url_encoded_fmt_stream_map', [''])[0]) >= 1 or len(video_info.get('adaptive_fmts', [''])[0]) >= 1): encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0] if 'rtmpe%3Dyes' in encoded_url_map: - raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True) + raise ExtractorError('rtmpe downloads are not supported, see https://github.com/ytdl-org/youtube-dl/issues/343 for more information.', expected=True) formats_spec = {} fmt_list = video_info.get('fmt_list', [''])[0] if fmt_list: @@ -1901,7 +2003,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): dct.update(formats_spec[format_id]) # Some itags are not included in DASH manifest thus corresponding formats will - # lack metadata (see https://github.com/rg3/youtube-dl/pull/5993). + # lack metadata (see https://github.com/ytdl-org/youtube-dl/pull/5993). # Trying to extract metadata from url_encoded_fmt_stream_map entry. mobj = re.search(r'^(?P\d+)[xX](?P\d+)$', url_data.get('size', [''])[0]) width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None) @@ -2059,6 +2161,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor): track = extract_meta('Song') artist = extract_meta('Artist') + album = None + release_date = None + release_year = None + + description_info = video_description.split('\n\n') + # If the description of the video has the youtube music auto-generated format, extract additional info + if len(description_info) >= 5 and description_info[-1] == 'Auto-generated by YouTube.': + track_artist = description_info[1].split(' · ') + if len(track_artist) >= 2: + if track is None: + track = track_artist[0] + if artist is None: + artist = re.search(r'Artist: ([^\n]+)', description_info[-2]) + if artist: + artist = artist.group(1) + if artist is None: + artist = track_artist[1] + # handle multiple artists + if len(track_artist) > 2: + for i in range(2, len(track_artist)): + artist += ', %s' % track_artist[i] + release_year = re.search(r'℗ ([0-9]+)', video_description) + if release_year: + release_year = int_or_none(release_year.group(1)) + album = description_info[2] + if description_info[4].startswith('Released on: '): + release_date = description_info[4].split(': ')[1].replace('-', '') + # extract release_year from release_date if necessary + if release_year is None: + release_year = int_or_none(release_date[0:4]) m_episode = re.search( r']+id="watch7-headline"[^>]*>\s*]*>.*?>(?P[^<]+)\s*S(?P\d+)\s*•\s*E(?P\d+)', @@ -2152,7 +2284,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Remove the formats we found through non-DASH, they # contain less info and it can be wrong, because we use # fixed values (for example the resolution). See - # https://github.com/rg3/youtube-dl/issues/5774 for an + # https://github.com/ytdl-org/youtube-dl/issues/5774 for an # example. formats = [f for f in formats if f['format_id'] not in dash_formats.keys()] formats.extend(dash_formats.values()) @@ -2212,6 +2344,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'episode_number': episode_number, 'track': track, 'artist': artist, + 'album': album, + 'release_date': release_date, + 'release_year': release_year, } @@ -2421,7 +2556,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604) + # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) for match in re.findall(r'
]*>([^<]+)
', page): match = match.strip() # Check if the playlist exists or is private @@ -2514,7 +2649,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): return playlist # Some playlist URLs don't actually serve a playlist (see - # https://github.com/rg3/youtube-dl/issues/10537). + # https://github.com/ytdl-org/youtube-dl/issues/10537). # Fallback to plain video extraction if there is a video id # along with playlist id. return self.url_result(video_id, 'Youtube', video_id=video_id)