X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=906988e6f0974443cb2552746befe4aa238aff3b;hb=d332ec725db5b1102473a75b9fc3212913bda618;hp=5e397324bb26dcdd4fbe171d5d7488be1ba3b587;hpb=76e510b92c4a1c4b0001f892504ba2cbb4b8d486;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 5e397324b..906988e6f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -69,7 +69,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False - _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}' + _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' def _set_language(self): self._set_cookie( @@ -372,7 +372,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL - (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| + (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| (?:www\.)?hooktube\.com/| @@ -570,7 +570,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', + 'description': 'md5:19a2f98d9032b9311e686ed039564f63', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], @@ -685,12 +685,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'nfWlot6h_JM', 'ext': 'm4a', 'title': 'Taylor Swift - Shake It Off', - 'description': 'md5:bec2185232c05479482cb5a9b82719bf', + 'description': 'md5:307195cd21ff7fa352270fe884570ef0', 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', - 'creator': 'Taylor Swift', }, 'params': { 'youtube_include_dash_manifest': True, @@ -755,11 +754,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', - 'creator': 'deadmau5', + 'creator': 'Dada Life, deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', 'title': 'Deadmau5 - Some Chords (HD)', - 'alt_title': 'Some Chords', + 'alt_title': 'This Machine Kills Some Chords', }, 'expected_warnings': [ 'DASH manifest missing', @@ -1135,6 +1134,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, 'youtube_include_dash_manifest': False, }, + 'skip': 'not actual anymore', }, { # Youtube Music Auto-generated description @@ -1145,8 +1145,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Voyeur Girl', 'description': 'md5:7ae382a65843d6df2685993e90a8628f', 'upload_date': '20190312', - 'uploader': 'Various Artists - Topic', - 'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw', + 'uploader': 'Stephen - Topic', + 'uploader_id': 'UC-pWHpBjdGG69N9mM2auIAA', 'artist': 'Stephen', 'track': 'Voyeur Girl', 'album': 'it\'s too much love to know my dear', @@ -1210,7 +1210,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '-hcAI0g-f5M', 'ext': 'mp4', 'title': 'Put It On Me', - 'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e', + 'description': 'md5:f6422397c07c4c907c6638e1fee380a5', 'upload_date': '20180426', 'uploader': 'Matt Maeson - Topic', 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', @@ -1224,6 +1224,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -1252,7 +1256,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', + r'.*?[-.](?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2,3}_[A-Z]{2})?/base)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1339,6 +1343,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): funcname = self._search_regex( (r'\b[cs]\s*&&\s*[adf]\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', r'\b[a-zA-Z0-9]+\s*&&\s*[a-zA-Z0-9]+\.set\([^,]+\s*,\s*encodeURIComponent\s*\(\s*(?P[a-zA-Z0-9$]+)\(', + r'\b(?P[a-zA-Z0-9$]{2})\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', r'(?P[a-zA-Z0-9$]+)\s*=\s*function\(\s*a\s*\)\s*{\s*a\s*=\s*a\.split\(\s*""\s*\)', # Obsolete patterns r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', @@ -1721,6 +1726,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True + video_info = None # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube url = proto + '://www.youtube.com/embed/%s' % video_id @@ -1732,15 +1738,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data - video_info_webpage = self._download_webpage( - video_info_url, video_id, - note='Refetching age-gated info webpage', - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - pl_response = video_info.get('player_response', [None])[0] - player_response = extract_player_response(pl_response, video_id) - add_dash_mpd(video_info) - view_count = extract_view_count(video_info) + try: + video_info_webpage = self._download_webpage( + video_info_url, video_id, + note='Refetching age-gated info webpage', + errnote='unable to download video info webpage') + except ExtractorError: + video_info_webpage = None + if video_info_webpage: + video_info = compat_parse_qs(video_info_webpage) + pl_response = video_info.get('player_response', [None])[0] + player_response = extract_player_response(pl_response, video_id) + add_dash_mpd(video_info) + view_count = extract_view_count(video_info) else: age_gate = False video_info = None @@ -1785,11 +1795,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): query['el'] = el if sts: query['sts'] = sts - video_info_webpage = self._download_webpage( - '%s://www.youtube.com/get_video_info' % proto, - video_id, note=False, - errnote='unable to download video info webpage', - fatal=False, query=query) + try: + video_info_webpage = self._download_webpage( + '%s://www.youtube.com/get_video_info' % proto, + video_id, note=False, + errnote='unable to download video info webpage', + query=query) + except ExtractorError as e: + # Skip further retries if we get 429 since solving + # captcha only unblocks access to website but + # not get_video_info end point + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 429: + break + continue if not video_info_webpage: continue get_video_info = compat_parse_qs(video_info_webpage) @@ -1828,13 +1846,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if messages: return '\n'.join(messages) - if not video_info: + if not video_info and not player_response: unavailable_message = extract_unavailable_message() if not unavailable_message: unavailable_message = 'Unable to extract video data' raise ExtractorError( 'YouTube said: %s' % unavailable_message, expected=True, video_id=video_id) + if not isinstance(video_info, dict): + video_info = {} + video_details = try_get( player_response, lambda x: x['videoDetails'], dict) or {} @@ -2030,7 +2051,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: player_version = self._search_regex( [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', - r'(?:www|player(?:_ias)?)-([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], + r'(?:www|player(?:_ias)?)[-.]([^/]+)(?:/[a-z]{2,3}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -2465,7 +2486,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?:\w+\.)? (?: (?: - youtube\.com| + youtube(?:kids)?\.com| invidio\.us ) / @@ -2477,7 +2498,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) @@ -2490,20 +2511,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): _VIDEO_RE = _VIDEO_RE_TPL % r'(?P[0-9A-Za-z_-]{11})' IE_NAME = 'youtube:playlist' _TESTS = [{ - 'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', 'info_dict': { - 'title': 'ytdl test PL', - 'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', }, - 'playlist_count': 3, + 'playlist_count': 1, }, { - 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', 'info_dict': { - 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', - 'title': 'YDL_Empty_List', + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', }, 'playlist_count': 0, - 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -2513,7 +2537,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'uploader': 'Christiaan008', 'uploader_id': 'ChRiStIaAn008', }, - 'playlist_count': 95, + 'playlist_count': 96, }, { 'note': 'issue #673', 'url': 'PLBB231211A4F62143', @@ -2647,6 +2671,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, }] def _real_initialize(self): @@ -2817,7 +2844,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com channels' - _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P[0-9A-Za-z_-]+)' + _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P[0-9A-Za-z_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' _VIDEO_RE = r'(?:title="(?P[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' IE_NAME = 'youtube:channel' @@ -2845,6 +2872,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', + 'only_matching': True, }] @classmethod