X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=e4b26b84fe5cf65dfdcedc5d9fd9bf2b67e17f35;hb=acf757f42eea16266a539fa0eb871dae422a1d22;hp=bc18276d6c7754a812b04c4ae42bc6c021d22627;hpb=ff21a8e0ee43d4ce0b75cd938f9bdfab664dd579;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index bc18276d6..e4b26b84f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -264,9 +264,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, @@ -394,6 +394,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'format': '141', }, }, + # JS player signature function name containing $ + { + 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', + 'info_dict': { + 'id': 'nfWlot6h_JM', + 'ext': 'm4a', + 'title': 'Taylor Swift - Shake It Off', + 
'description': 'md5:2acfda1b285bdd478ccec22f9918199d', + 'uploader': 'TaylorSwiftVEVO', + 'uploader_id': 'TaylorSwiftVEVO', + 'upload_date': '20140818', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141', + }, + }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -465,6 +482,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'skip_download': 'requires avconv', } }, + # Non-square pixels + { + 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', + 'info_dict': { + 'id': '_b-2C3KPAM0', + 'ext': 'mp4', + 'stretched_ratio': 16 / 9., + 'upload_date': '20110310', + 'uploader_id': 'AllenMeow', + 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', + 'uploader': '孫艾倫', + 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + }, + } ] def __init__(self, *args, **kwargs): @@ -574,7 +605,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode, + r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, 'Initial JS player signature function name') jsi = JSInterpreter(jscode) @@ -778,6 +809,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_url = None # Get video info + embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -985,10 +1017,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url += '&signature=' + url_data['sig'][0] elif 's' in url_data: encrypted_sig = url_data['s'][0] + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - embed_webpage if age_gate else video_webpage, 'JS player URL') + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and 
not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') + jsplayer_url_json = self._search_regex( + ASSETS_RE, embed_webpage, 'JS player URL') + player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( @@ -1051,6 +1094,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): f['preference'] = f.get('preference', 0) - 10000 formats.extend(dash_formats) + # Check for malformed aspect ratio + stretched_m = re.search( + r'<meta\s+property="og:video:tag".*?content="yt:stretch=(?P<w>[0-9]+):(?P<h>[0-9]+)">', + video_webpage) + if stretched_m: + ratio = float(stretched_m.group('w')) / float(stretched_m.group('h')) + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + self._sort_formats(formats) return { @@ -1107,6 +1160,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'info_dict': { + 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx', 'title': 'YDL_Empty_List', }, 'playlist_count': 0, @@ -1115,6 +1169,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', 'info_dict': { 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', }, 'playlist_count': 95, }, { @@ -1122,6 +1177,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'url': 'PLBB231211A4F62143', 'info_dict': { 'title': '[OLD]Team Fortress 2 (Class-based LP)', + 'id': 'PLBB231211A4F62143', }, 'playlist_mincount': 26, }, { @@ -1129,12 +1185,14 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', 'info_dict': { 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', }, 'playlist_mincount': 799, }, { 'url': 
'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', 'info_dict': { 'title': 'YDL_safe_search', + 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', }, 'playlist_count': 2, }, { @@ -1143,6 +1201,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'playlist_count': 4, 'info_dict': { 'title': 'JODA15', + 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', } }, { 'note': 'Embedded SWF player', @@ -1150,12 +1209,14 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'playlist_count': 4, 'info_dict': { 'title': 'JODA7', + 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', } }, { 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', 'info_dict': { - 'title': 'Uploads from Interstellar Movie', + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', }, 'playlist_mincout': 21, }] @@ -1261,6 +1322,9 @@ class YoutubeChannelIE(InfoExtractor): 'note': 'paginated channel', 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', 'playlist_mincount': 91, + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + } }] def extract_videos_from_page(self, page): @@ -1641,11 +1705,18 @@ class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) - (?:https?://)?[^/]+/watch\?(?: + (?:https?://)? + (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ + (?:watch\?(?: feature=[a-z_]+| - annotation_id=annotation_[^&]+ - )?$| - (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ + annotation_id=annotation_[^&]+| + x-yt-cl=[0-9]+| + hl=[^&]*| + )? 
+ | + attribution_link\?a=[^&]+ + ) + $ ''' _TESTS = [{ @@ -1654,6 +1725,15 @@ class YoutubeTruncatedURLIE(InfoExtractor): }, { 'url': 'http://www.youtube.com/watch?', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?feature=foo', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?hl=en-GB', + 'only_matching': True, }] def _real_extract(self, url): @@ -1669,7 +1749,7 @@ class YoutubeTruncatedURLIE(InfoExtractor): class YoutubeTruncatedIDIE(InfoExtractor): IE_NAME = 'youtube:truncated_id' IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' _TESTS = [{ 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',