X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=76710931ae5e6a292af767f3f57685ad0be98cac;hb=605fd6392fedd2599115e1f1e12df2a6212df1ae;hp=63597dd166723a6bda47b0ea681d6050566c052c;hpb=12afdc2ad617dedfd7d60654b8c57b99604332ed;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 63597dd16..76710931a 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -40,6 +40,7 @@ from ..utils import ( sanitized_Request, smuggle_url, str_to_int, + try_get, unescapeHTML, unified_strdate, unsmuggle_url, @@ -328,6 +329,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, @@ -383,6 +386,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, 'like_count': int, 'dislike_count': int, 'start_time': 1, @@ -402,6 +406,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], + 'duration': 180, 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop', @@ -419,6 +424,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', 'alt_title': 'Tunnel Vision', 'description': 'md5:64249768eec3bc4276236606ea996373', + 'duration': 419, 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', @@ -458,6 +464,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'test chars: "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], + 'duration': 10, 'like_count': int, 'dislike_count': int, }, @@ -493,6 +500,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'm4a', 'title': 'Afrojack, Spree Wilson - The Spark ft. Spree Wilson', 'description': 'md5:12e7067fa6735a77bdcbb58cb1187d2d', + 'duration': 244, 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', @@ -512,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'Taylor Swift - Shake It Off', 'alt_title': 'Shake It Off', 'description': 'md5:95f66187cd7c8b2c13eb78e1223b63c3', + 'duration': 242, 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', @@ -529,6 +538,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'T4XJQO3qol8', 'ext': 'mp4', + 'duration': 219, 'upload_date': '20100909', 'uploader': 'The Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', @@ -546,6 +556,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', 'description': r're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', + 'duration': 142, 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/WitcherGame', @@ -562,6 +573,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'duration': 247, 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', @@ -576,6 +588,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': '__2ABJjxzNo', 'ext': 'mp4', + 'duration': 266, 'upload_date': '20100430', 'uploader_id': 'deadmau5', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', @@ -596,6 +609,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'info_dict': { 'id': 'lqQg6PlCWgI', 'ext': 'mp4', + 'duration': 6085, 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', @@ -615,6 +629,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': '_b-2C3KPAM0', 'ext': 'mp4', 'stretched_ratio': 16 / 9., + 'duration': 85, 'upload_date': '20110310', 'uploader_id': 'AllenMeow', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/AllenMeow', @@ -649,6 +664,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'md5:7b81415841e02ecd4313668cde88737a', 'description': 'md5:116377fd2963b81ec4ce64b542173306', + 'duration': 220, 'upload_date': '20150625', 'uploader_id': 'dorappi2000', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/dorappi2000', @@ -691,6 +707,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (Main Camera)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7335, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -703,6 +720,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (kreestuh)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -715,6 +733,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (grizzle)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7337, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -727,6 +746,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'teamPGP: Rocket League Noob Stream (zim)', 'description': 'md5:dc7872fb300e143831327f1bae3af010', + 'duration': 7334, 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', @@ -768,6 +788,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': '{dark walk}; Loki/AC/Dishonored; collab w/Elflover21', 'alt_title': 'Dark Walk', 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', + 'duration': 133, 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', @@ -809,10 +830,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'md5:e41008789470fc2533a3252216f1c1d1', 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'duration': 721, 'upload_date': '20150127', 'uploader_id': 'BerkmanCenter', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', - 'uploader': 'BerkmanCenter', + 'uploader': 'The Berkman Klein Center for Internet & Society', 'license': 'Creative Commons Attribution license (reuse allowed)', }, 'params': { @@ -827,6 +849,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'duration': 4060, 'upload_date': '20151119', 'uploader': 'Bernie 2016', 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', @@ -871,7 +894,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'id': 'iqKdEhx-dD4', 'ext': 'mp4', 'title': 'Isolation - Mind Field (Ep 1)', - 'description': 'md5:3a72f23c086a1496c9e2c54a25fa0822', + 'description': 'md5:8013b7ddea787342608f63a13ddc9492', + 'duration': 2085, 'upload_date': '20170118', 'uploader': 'Vsauce', 'uploader_id': 'Vsauce', @@ -1006,8 +1030,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, - 'Initial JS player signature function name') + (r'(["\'])signature\1\s*,\s*(?P[a-zA-Z0-9$]+)\(', + r'\.sig\|\|(?P[a-zA-Z0-9$]+)\('), + jscode, 'Initial JS player signature function name', group='sig') jsi = JSInterpreter(jscode) initial_function = jsi.extract_function(funcname) @@ -1028,6 +1053,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if player_url.startswith('//'): player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) try: player_id = (player_url, self._signature_cache_id(s)) if player_id not in self._player_cache: @@ -1516,11 +1544,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_subtitles = self.extract_subtitles(video_id, video_webpage) automatic_captions = self.extract_automatic_captions(video_id, video_webpage) - if 'length_seconds' not in video_info: - self._downloader.report_warning('unable to extract video duration') - video_duration = None - else: - video_duration = int(compat_urllib_parse_unquote_plus(video_info['length_seconds'][0])) + video_duration = try_get( + video_info, lambda x: int_or_none(x['length_seconds'][0])) + if not video_duration: + video_duration = parse_duration(self._html_search_meta( + 'duration', video_webpage, 'video duration')) # annotations video_annotations = None @@ -1831,13 +1859,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= ) ( - (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD|UL|TL)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD|UL|TL)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' _VIDEO_RE = r'href="\s*/watch\?v=(?P[0-9A-Za-z_-]{11})&[^"]*?index=(?P\d+)(?:[^>]+>(?P[^<]+))?' @@ -1856,6 +1884,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'YDL_Empty_List', }, 'playlist_count': 0, + 'skip': 'This playlist is private', }, { 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', @@ -1887,6 +1916,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, + 'skip': 'This playlist is private', }, { 'note': 'embedded', 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', @@ -1957,6 +1987,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): }, { 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', 'only_matching': True, + }, { + 'url': 'TLGGrESM50VT6acwMjAyMjAxNw', + 'only_matching': True, }] def _real_initialize(self): @@ -1998,14 +2031,18 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) - for match in re.findall(r'<div class="yt-alert-message">([^<]+)</div>', page): + # the yt-alert-message now has tabindex attribute (see https://github.com/rg3/youtube-dl/issues/11604) + for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): match = match.strip() # Check if the playlist exists or is private - if re.match(r'[^<]*(The|This) playlist (does not exist|is private)[^<]*', match): - raise ExtractorError( - 'The playlist doesn\'t exist or is private, use --username or ' - '--netrc to access it.', - expected=True) + mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) + if mobj: + reason = mobj.group('reason') + message = 'This playlist %s' % reason + if 'private' in reason: + message += ', use --username or --netrc to access it' + message += '.' + raise ExtractorError(message, expected=True) elif re.match(r'[^<]*Invalid parameters[^<]*', match): raise ExtractorError( 'Invalid parameters. Maybe URL is incorrect.', @@ -2313,18 +2350,18 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos = [] limit = n + url_query = { + 'search_query': query.encode('utf-8'), + } + url_query.update(self._EXTRA_QUERY_ARGS) + result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) + for pagenum in itertools.count(1): - url_query = { - 'search_query': query.encode('utf-8'), - 'page': pagenum, - 'spf': 'navigate', - } - url_query.update(self._EXTRA_QUERY_ARGS) - result_url = 'https://www.youtube.com/results?' + compat_urllib_parse_urlencode(url_query) data = self._download_json( result_url, video_id='query "%s"' % query, note='Downloading page %s' % pagenum, - errnote='Unable to download API page') + errnote='Unable to download API page', + query={'spf': 'navigate'}) html_content = data[1]['body']['content'] if 'class="search-message' in html_content: @@ -2336,6 +2373,12 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): videos += new_videos if not new_videos or len(videos) > limit: break + next_link = self._html_search_regex( + r'href="(/results\?[^"]*\bsp=[^"]+)"[^>]*>\s*<span[^>]+class="[^"]*\byt-uix-button-content\b[^"]*"[^>]*>Next', + html_content, 'next link', default=None) + if next_link is None: + break + result_url = compat_urlparse.urljoin('https://www.youtube.com/', next_link) if len(videos) > n: videos = videos[:n]