X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=3b3678c6e638d29000dd6728358c76b319af4b9f;hb=70fca8d694e6f611384fdcabad748fb7a65235e4;hp=823d6aaf3a4f38ad5696e67a23b14e6fe690fabd;hpb=d5524947b560c1d0e1dfa2ef7f1969efe07866fa;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 823d6aaf3..3b3678c6e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -256,7 +256,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, - '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'}, @@ -264,9 +264,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'vcodec': 'none', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, # Dash webm '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, @@ -287,7 +287,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'VP9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'}, # Dash webm audio '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, @@ -392,6 +394,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'format': '141', }, }, + # JS player signature function name containing $ + { + 'url': 'https://www.youtube.com/watch?v=nfWlot6h_JM', + 'info_dict': { + 'id': 'nfWlot6h_JM', + 'ext': 'm4a', + 'title': 'Taylor Swift - Shake It Off', + 'description': 'md5:2acfda1b285bdd478ccec22f9918199d', + 'uploader': 'TaylorSwiftVEVO', + 'uploader_id': 'TaylorSwiftVEVO', + 'upload_date': '20140818', + }, + 'params': { + 'youtube_include_dash_manifest': True, + 'format': '141', + }, + }, # Controversy video { 'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -412,12 +431,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'id': 'HtVdAasjOgU', 'ext': 'mp4', 'title': 'The Witcher 3: Wild Hunt - The Sword Of Destiny Trailer', - 'description': 'md5:eca57043abae25130f58f655ad9a7771', + 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', 'upload_date': '20140605', }, }, + # Age-gate video with encrypted signature + { + 'url': 'http://www.youtube.com/watch?v=6kLq3WMV1nU', + 'info_dict': { + 'id': '6kLq3WMV1nU', + 'ext': 'mp4', + 'title': 'Dedication To My Ex (Miss That) (Lyric Video)', + 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', + 'uploader': 'LloydVEVO', + 'uploader_id': 'LloydVEVO', + 'upload_date': '20110629', + }, + }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) { 'url': '__2ABJjxzNo', @@ -450,6 +482,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'skip_download': 'requires avconv', } }, + # Non-square pixels + { + 'url': 'https://www.youtube.com/watch?v=_b-2C3KPAM0', + 'info_dict': { + 'id': '_b-2C3KPAM0', + 'ext': 'mp4', + 'stretched_ratio': 16 / 9., + 'upload_date': '20110310', + 'uploader_id': 'AllenMeow', + 'description': 'made by Wacom from Korea | 字幕&加油添醋 by TY\'s Allen | 感謝heylisa00cavey1001同學熱情提供梗及翻譯', + 'uploader': '孫艾倫', + 'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人', + }, + } ] def __init__(self, *args, **kwargs): @@ -478,7 +524,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', + r'.*?-(?P[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -527,8 +573,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): return 's[%s%s%s]' % (starts, ends, steps) step = None - start = '(Never used)' # Quelch pyflakes warnings - start will be - # set as soon as step is set + # Quelch pyflakes warnings - start will be set when step is set + start = '(Never used)' for i, prev in zip(idxs[1:], idxs[:-1]): if step is not None: if i - prev == step: @@ -559,7 +605,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode, + r'\.sig\|\|([a-zA-Z0-9$]+)\(', jscode, 'Initial JS player signature function name') jsi = JSInterpreter(jscode) @@ -599,24 +645,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _get_available_subtitles(self, video_id, webpage): try: - sub_list = self._download_webpage( + subs_doc = self._download_xml( 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, video_id, note=False) except ExtractorError as err: self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err)) return {} - lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) sub_lang_list = {} - for l in lang_list: - lang = l[1] + for track in subs_doc.findall('track'): + lang = track.attrib['lang_code'] if lang in sub_lang_list: continue params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, 'fmt': self._downloader.params.get('subtitlesformat', 'srt'), - 'name': unescapeHTML(l[0]).encode('utf-8'), + 'name': track.attrib['name'].encode('utf-8'), }) url = 'https://www.youtube.com/api/timedtext?' + params sub_lang_list[lang] = url @@ -649,10 +694,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): list_url = caption_url + '&' + list_params caption_list = self._download_xml(list_url, video_id) original_lang_node = caption_list.find('track') - if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr': + if original_lang_node is None: self._downloader.report_warning('Video doesn\'t have automatic captions') return {} original_lang = original_lang_node.attrib['lang_code'] + caption_kind = original_lang_node.attrib.get('kind', '') sub_lang_list = {} for lang_node in caption_list.findall('target'): @@ -662,7 +708,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'tlang': sub_lang, 'fmt': sub_format, 'ts': timestamp, - 'kind': 'asr', + 'kind': caption_kind, }) sub_lang_list[sub_lang] = caption_url + '&' + params return sub_lang_list @@ -723,6 +769,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'format_id': format_id, 'url': video_url, 'width': int_or_none(r.attrib.get('width')), + 'height': int_or_none(r.attrib.get('height')), 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000), 'asr': int_or_none(r.attrib.get('audioSamplingRate')), 'filesize': filesize, @@ -733,7 +780,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): fo for fo in formats if fo['format_id'] == format_id) except StopIteration: - f.update(self._formats.get(format_id, {})) + f.update(self._formats.get(format_id, {}).items()) formats.append(f) else: existing_format.update(f) @@ -762,15 +809,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): player_url = None # Get video info + embed_webpage = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} # this can be viewed without login into Youtube + url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage(url, video_id, 'Downloading embed webpage') data = compat_urllib_parse.urlencode({ 'video_id': video_id, 'eurl': 'https://youtube.googleapis.com/v/' + video_id, 'sts': self._search_regex( - r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''), + r'"sts"\s*:\s*(\d+)', embed_webpage, 'sts', default=''), }) video_info_url = proto + '://www.youtube.com/get_video_info?' + data video_info_webpage = self._download_webpage( @@ -967,12 +1017,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): url += '&signature=' + url_data['sig'][0] elif 's' in url_data: encrypted_sig = url_data['s'][0] - - if not age_gate: + ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' + + jsplayer_url_json = self._search_regex( + ASSETS_RE, + embed_webpage if age_gate else video_webpage, + 'JS player URL (1)', default=None) + if not jsplayer_url_json and not age_gate: + # We need the embed website after all + if embed_webpage is None: + embed_url = proto + '://www.youtube.com/embed/%s' % video_id + embed_webpage = self._download_webpage( + embed_url, video_id, 'Downloading embed webpage') jsplayer_url_json = self._search_regex( - r'"assets":.+?"js":\s*("[^"]+")', - video_webpage, 'JS player URL') - player_url = json.loads(jsplayer_url_json) + ASSETS_RE, embed_webpage, 'JS player URL') + + player_url = json.loads(jsplayer_url_json) if player_url is None: player_url_json = self._search_regex( r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")', @@ -1026,8 +1086,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): self.report_warning( 'Skipping DASH manifest: %r' % e, video_id) else: + # Hide the formats we found through non-DASH + dash_keys = set(df['format_id'] for df in dash_formats) + for f in formats: + if f['format_id'] in dash_keys: + f['format_id'] = 'nondash-%s' % f['format_id'] + f['preference'] = f.get('preference', 0) - 10000 formats.extend(dash_formats) + # Check for malformed aspect ratio + stretched_m = re.search( + r'[0-9]+):(?P[0-9]+)">', + video_webpage) + if stretched_m: + ratio = float(stretched_m.group('w')) / float(stretched_m.group('h')) + for f in formats: + if f.get('vcodec') != 'none': + f['stretched_ratio'] = ratio + self._sort_formats(formats) return { @@ -1128,6 +1204,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'info_dict': { 'title': 'JODA7', } + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + }, + 'playlist_mincout': 21, }] def _real_initialize(self): @@ -1178,9 +1261,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): if playlist_id.startswith('RD'): # Mixes require a custom extraction process return self._extract_mix(playlist_id) - if playlist_id.startswith('TL'): - raise ExtractorError('For downloading YouTube.com top lists, use ' - 'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True) url = self._TEMPLATE_URL % playlist_id page = self._download_webpage(url, playlist_id) @@ -1212,6 +1292,10 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): 'Downloading page #%s' % page_num, transform_source=uppercase_escape) content_html = more['content_html'] + if not content_html.strip(): + # Some webpages show a "Load more" button but they don't + # have more videos + break more_widget_html = more['load_more_widget_html'] playlist_title = self._html_search_regex( @@ -1222,49 +1306,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): return self.playlist_result(url_results, playlist_id, playlist_title) -class YoutubeTopListIE(YoutubePlaylistIE): - IE_NAME = 'youtube:toplist' - IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"' - ' (Example: "yttoplist:music:Top Tracks")') - _VALID_URL = r'yttoplist:(?P.*?):(?P.*?)$' - _TESTS = [{ - 'url': 'yttoplist:music:Trending', - 'playlist_mincount': 5, - 'skip': 'Only works for logged-in users', - }] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - channel = mobj.group('chann') - title = mobj.group('title') - query = compat_urllib_parse.urlencode({'title': title}) - channel_page = self._download_webpage( - 'https://www.youtube.com/%s' % channel, title) - link = self._html_search_regex( - r'''(?x) - <a\s+href="([^"]+)".*?>\s* - <span\s+class="branded-page-module-title-text">\s* - <span[^>]*>.*?%s.*?</span>''' % re.escape(query), - channel_page, 'list') - url = compat_urlparse.urljoin('https://www.youtube.com/', link) - - video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"' - ids = [] - # sometimes the webpage doesn't contain the videos - # retry until we get them - for i in itertools.count(0): - msg = 'Downloading Youtube mix' - if i > 0: - msg += ', retry #%d' % i - - webpage = self._download_webpage(url, title, msg) - ids = orderedSet(re.findall(video_re, webpage)) - if ids: - break - url_results = self._ids_to_results(ids) - return self.playlist_result(url_results, playlist_title=title) - - class YoutubeChannelIE(InfoExtractor): IE_DESC = 'YouTube.com channels' _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)' @@ -1555,9 +1596,11 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_entries = [] paging = 0 for i in itertools.count(1): - info = self._download_json(self._FEED_TEMPLATE % paging, - '%s feed' % self._FEED_NAME, - 'Downloading page %s' % i) + info = self._download_json( + self._FEED_TEMPLATE % paging, + '%s feed' % self._FEED_NAME, + 'Downloading page %s' % i, + transform_source=uppercase_escape) feed_html = info.get('feed_html') or info.get('content_html') load_more_widget_html = info.get('load_more_widget_html') or feed_html m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) @@ -1651,11 +1694,18 @@ class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = False # Do not list _VALID_URL = r'''(?x) - (?:https?://)?[^/]+/watch\?(?: + (?:https?://)? + (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/ + (?:watch\?(?: feature=[a-z_]+| - annotation_id=annotation_[^&]+ - )?$| - (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$ + annotation_id=annotation_[^&]+| + x-yt-cl=[0-9]+| + hl=[^&]*| + )? + | + attribution_link\?a=[^&]+ + ) + $ ''' _TESTS = [{ @@ -1664,6 +1714,15 @@ class YoutubeTruncatedURLIE(InfoExtractor): }, { 'url': 'http://www.youtube.com/watch?', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?feature=foo', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?hl=en-GB', + 'only_matching': True, }] def _real_extract(self, url): @@ -1674,3 +1733,20 @@ class YoutubeTruncatedURLIE(InfoExtractor): '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" ' ' or simply youtube-dl BaW_jenozKc .', expected=True) + + +class YoutubeTruncatedIDIE(InfoExtractor): + IE_NAME = 'youtube:truncated_id' + IE_DESC = False # Do not list + _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$' + + _TESTS = [{ + 'url': 'https://www.youtube.com/watch?v=N_708QY7Ob', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + raise ExtractorError( + 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), + expected=True)