X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=27e67feb43efb9374c9638a2f5616770c83ed241;hb=49dea4913bea3b8e5c7d65dd932aa68ada526088;hp=d6fef39e9e4b4295382681f21b8c01a1a96cb7bf;hpb=2d2fa82d172a10a49fb5449fa35bc409de778f05;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d6fef39e9..27e67feb4 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -6,6 +6,7 @@ from __future__ import unicode_literals import itertools import json import os.path +import random import re import time import traceback @@ -181,7 +182,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return -class YoutubeEntryListBaseInfoExtractor(InfoExtractor): +class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): # Extract entries from page with "Load more" button def _entries(self, page, playlist_id): more_widget_html = content_html = page @@ -233,7 +234,7 @@ class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): def _process_page(self, content): - for playlist_id in re.findall(r'href="/?playlist\?list=(.+?)"', content): + for playlist_id in orderedSet(re.findall(r'href="/?playlist\?list=([0-9A-Za-z-_]{10,})"', content)): yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') @@ -286,7 +287,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '22': {'ext': 'mp4', 'width': 1280, 'height': 720, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '34': {'ext': 'flv', 'width': 640, 'height': 360, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, '35': {'ext': 'flv', 'width': 854, 'height': 480, 'acodec': 'aac', 'abr': 128, 'vcodec': 'h264'}, - '36': {'ext': '3gp', 'width': 320, 'height': 240, 'acodec': 'aac', 'abr': 32, 'vcodec': 'mp4v'}, + # itag 36 videos are either 320x180 (BaW_jenozKc) or 320x240 (__2ABJjxzNo), abr varies as well + '36': {'ext': '3gp', 'width': 320, 'acodec': 'aac', 'vcodec': 'mp4v'}, '37': {'ext': 'mp4', 'width': 1920, 'height': 1080, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '38': {'ext': 'mp4', 'width': 4096, 'height': 3072, 'acodec': 'aac', 'abr': 192, 'vcodec': 'h264'}, '43': {'ext': 'webm', 'width': 640, 'height': 360, 'acodec': 'vorbis', 'abr': 128, 'vcodec': 'vp8'}, @@ -369,18 +371,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, } + _SUBTITLE_FORMATS = ('ttml', 'vtt') IE_NAME = 'youtube' _TESTS = [ { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&t=1s&end=9', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&t=1s&end=9', 'info_dict': { 'id': 'BaW_jenozKc', 'ext': 'mp4', 'title': 'youtube-dl test video "\'/\\äâð', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', + 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\äâð\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], @@ -399,12 +404,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20120506', 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]', 'alt_title': 'I Love It (feat. Charli XCX)', - 'description': 'md5:782e8651347686cba06e58f71ab51773', + 'description': 'md5:f3ceb5ef83a08d95b9d146f973157cc8', 'tags': ['Icona Pop i love it', 'sweden', 'pop music', 'big beat records', 'big beat', 'charli', 'xcx', 'charli xcx', 'girls', 'hbo', 'i love it', "i don't care", 'icona', 'pop', 'iconic ep', 'iconic', 'love', 'it'], 'uploader': 'Icona Pop', 'uploader_id': 'IconaPop', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IconaPop', + 'license': 'Standard YouTube License', 'creator': 'Icona Pop', } }, @@ -420,6 +427,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:64249768eec3bc4276236606ea996373', 'uploader': 'justintimberlakeVEVO', 'uploader_id': 'justintimberlakeVEVO', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO', + 'license': 'Standard YouTube License', 'creator': 'Justin Timberlake', 'age_limit': 18, } @@ -435,11 +444,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7', 'uploader': 'SET India', 'uploader_id': 'setindia', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/setindia', + 'license': 'Standard YouTube License', 'age_limit': 18, } }, { - 'url': 'http://www.youtube.com/watch?v=BaW_jenozKcj&v=UxxajLWwzqY', + 'url': 'http://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', 'note': 'Use the first video ID in the URL', 'info_dict': { 'id': 'BaW_jenozKc', @@ -447,7 +458,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'title': 'youtube-dl test video "\'/\\äâð', 'uploader': 'Philipp Hagemeister', 'uploader_id': 'phihag', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/phihag', 'upload_date': '20121002', + 'license': 'Standard YouTube License', 'description': 'test chars: "\'/\\äâð\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .', 'categories': ['Science & Technology'], 'tags': ['youtube-dl'], @@ -466,8 +479,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'm4a', 'upload_date': '20121002', 'uploader_id': '8KVIDEO', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/8KVIDEO', 'description': '', 'uploader': '8KVIDEO', + 'license': 'Standard YouTube License', 'title': 'UHDTV TEST 8K VIDEO.mp4' }, 'params': { @@ -486,6 +501,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'AfrojackVEVO', 'uploader_id': 'AfrojackVEVO', 'upload_date': '20131011', + 'license': 'Standard YouTube License', }, 'params': { 'youtube_include_dash_manifest': True, @@ -504,6 +520,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'TaylorSwiftVEVO', 'uploader_id': 'TaylorSwiftVEVO', 'upload_date': '20140818', + 'license': 'Standard YouTube License', 'creator': 'Taylor Swift', }, 'params': { @@ -520,6 +537,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20100909', 'uploader': 'The Amazing Atheist', 'uploader_id': 'TheAmazingAtheist', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/TheAmazingAtheist', + 'license': 'Standard YouTube License', 'title': 'Burning Everyone\'s Koran', 'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html', } @@ -534,7 +553,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 're:(?s).{100,}About the Game\n.*?The Witcher 3: Wild Hunt.{100,}', 'uploader': 'The Witcher', 'uploader_id': 'WitcherGame', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/WitcherGame', 'upload_date': '20140605', + 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -548,7 +569,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:33765bb339e1b47e7e72b5490139bb41', 'uploader': 'LloydVEVO', 'uploader_id': 'LloydVEVO', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/LloydVEVO', 'upload_date': '20110629', + 'license': 'Standard YouTube License', 'age_limit': 18, }, }, @@ -560,9 +583,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20100430', 'uploader_id': 'deadmau5', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/deadmau5', 'creator': 'deadmau5', 'description': 'md5:12c56784b8032162bb936a5f76d55360', 'uploader': 'deadmau5', + 'license': 'Standard YouTube License', 'title': 'Deadmau5 - Some Chords (HD)', 'alt_title': 'Some Chords', }, @@ -578,6 +603,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'ext': 'mp4', 'upload_date': '20150827', 'uploader_id': 'olympic', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/olympic', + 'license': 'Standard YouTube License', 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', @@ -595,8 +622,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'stretched_ratio': 16 / 9., 'upload_date': '20110310', 'uploader_id': 'AllenMeow', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/AllenMeow', 'description': 'made by Wacom from Korea | åå¹&å 油添é by TY\'s Allen | æè¬heylisa00cavey1001åå¸ç±æ æä¾æ¢åç¿»è¯', 'uploader': 'å«è¾å«', + 'license': 'Standard YouTube License', 'title': '[A-made] è®æ å¦åå¹ç å¤ªå¦ æå°±æ¯é樣ç人', }, }, @@ -627,7 +656,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:116377fd2963b81ec4ce64b542173306', 'upload_date': '20150625', 'uploader_id': 'dorappi2000', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/dorappi2000', 'uploader': 'dorappi2000', + 'license': 'Standard YouTube License', 'formats': 'mincount:33', }, }, @@ -642,6 +673,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader': 'Airtek', 'description': 'Retransmisión en directo de la XVIII media maratón de Zaragoza.', 'uploader_id': 'UCzTzUmjXxxacNnL8I3m4LnQ', + 'license': 'Standard YouTube License', 'title': 'Retransmisión XVIII Media maratón Zaragoza 2015', }, 'params': { @@ -666,6 +698,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -676,6 +710,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -686,6 +722,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }, { 'info_dict': { @@ -696,12 +734,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150721', 'uploader': 'Beer Games Beer', 'uploader_id': 'beergamesbeer', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/beergamesbeer', + 'license': 'Standard YouTube License', }, }], 'params': { 'skip_download': True, }, }, + { + # Multifeed video with comma in title (see https://github.com/rg3/youtube-dl/issues/8536) + 'url': 'https://www.youtube.com/watch?v=gVfLd0zydlo', + 'info_dict': { + 'id': 'gVfLd0zydlo', + 'title': 'DevConf.cz 2016 Day 2 Workshops 1 14:00 - 15:30', + }, + 'playlist_count': 2, + }, { 'url': 'http://vid.plus/FlRa-iH7PGw', 'only_matching': True, @@ -720,7 +769,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'description': 'md5:8085699c11dc3f597ce0410b0dcbb34a', 'upload_date': '20151119', 'uploader_id': 'IronSoulElf', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/IronSoulElf', 'uploader': 'IronSoulElf', + 'license': 'Standard YouTube License', 'creator': 'Todd Haberman, Daniel Law Heath & Aaron Kaplan', }, 'params': { @@ -748,6 +799,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip_download': True, }, }, + { + # Video licensed under Creative Commons + 'url': 'https://www.youtube.com/watch?v=M4gD1WSo5mA', + 'info_dict': { + 'id': 'M4gD1WSo5mA', + 'ext': 'mp4', + 'title': 'md5:e41008789470fc2533a3252216f1c1d1', + 'description': 'md5:a677553cf0840649b731a3024aeff4cc', + 'upload_date': '20150127', + 'uploader_id': 'BerkmanCenter', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/user/BerkmanCenter', + 'uploader': 'BerkmanCenter', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Channel-like uploader_url + 'url': 'https://www.youtube.com/watch?v=eQcmzGIKrzg', + 'info_dict': { + 'id': 'eQcmzGIKrzg', + 'ext': 'mp4', + 'title': 'Democratic Socialism and Foreign Policy | Bernie Sanders', + 'description': 'md5:dda0d780d5a6e120758d1711d062a867', + 'upload_date': '20151119', + 'uploader': 'Bernie 2016', + 'uploader_id': 'UCH1dpzjCEiGAt8CXkryhkZg', + 'uploader_url': 're:https?://(?:www\.)?youtube\.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', + 'license': 'Creative Commons Attribution license (reuse allowed)', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 'https://www.youtube.com/watch?feature=player_embedded&v=V36LpHqtcDY', 'only_matching': True, @@ -918,7 +1005,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if lang in sub_lang_list: continue sub_formats = [] - for ext in ['sbv', 'vtt', 'srt']: + for ext in self._SUBTITLE_FORMATS: params = compat_urllib_parse.urlencode({ 'lang': lang, 'v': video_id, @@ -964,40 +1051,67 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return {} try: args = player_config['args'] - caption_url = args['ttsurl'] - if not caption_url: - self._downloader.report_warning(err_msg) - return {} - timestamp = args['timestamp'] - # We get the available subtitles - list_params = compat_urllib_parse.urlencode({ - 'type': 'list', - 'tlangs': 1, - 'asrs': 1, - }) - list_url = caption_url + '&' + list_params - caption_list = self._download_xml(list_url, video_id) - original_lang_node = caption_list.find('track') - if original_lang_node is None: - self._downloader.report_warning('Video doesn\'t have automatic captions') - return {} - original_lang = original_lang_node.attrib['lang_code'] - caption_kind = original_lang_node.attrib.get('kind', '') + caption_url = args.get('ttsurl') + if caption_url: + timestamp = args['timestamp'] + # We get the available subtitles + list_params = compat_urllib_parse.urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, + }) + list_url = caption_url + '&' + list_params + caption_list = self._download_xml(list_url, video_id) + original_lang_node = caption_list.find('track') + if original_lang_node is None: + self._downloader.report_warning('Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + caption_kind = original_lang_node.attrib.get('kind', '') + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': ext, + 'ts': timestamp, + 'kind': caption_kind, + }) + sub_formats.append({ + 'url': caption_url + '&' + params, + 'ext': ext, + }) + sub_lang_list[sub_lang] = sub_formats + return sub_lang_list + + # Some videos don't provide ttsurl but rather caption_tracks and + # caption_translation_languages (e.g. 20LmZk1hakA) + caption_tracks = args['caption_tracks'] + caption_translation_languages = args['caption_translation_languages'] + caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] + parsed_caption_url = compat_urlparse.urlparse(caption_url) + caption_qs = compat_parse_qs(parsed_caption_url.query) sub_lang_list = {} - for lang_node in caption_list.findall('target'): - sub_lang = lang_node.attrib['lang_code'] + for lang in caption_translation_languages.split(','): + lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) + sub_lang = lang_qs.get('lc', [None])[0] + if not sub_lang: + continue sub_formats = [] - for ext in ['sbv', 'vtt', 'srt']: - params = compat_urllib_parse.urlencode({ - 'lang': original_lang, - 'tlang': sub_lang, - 'fmt': ext, - 'ts': timestamp, - 'kind': caption_kind, + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], }) + sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( + query=compat_urllib_parse.urlencode(caption_qs, True))) sub_formats.append({ - 'url': caption_url + '&' + params, + 'url': sub_url, 'ext': ext, }) sub_lang_list[sub_lang] = sub_formats @@ -1008,6 +1122,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._downloader.report_warning(err_msg) return {} + def _mark_watched(self, video_id, video_info): + playback_url = video_info.get('videostats_playback_base_url', [None])[0] + if not playback_url: + return + parsed_playback_url = compat_urlparse.urlparse(playback_url) + qs = compat_urlparse.parse_qs(parsed_playback_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. + CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + }) + playback_url = compat_urlparse.urlunparse( + parsed_playback_url._replace(query=compat_urllib_parse.urlencode(qs, True))) + + self._download_webpage( + playback_url, video_id, 'Marking watched', + 'Unable to mark watched', fatal=False) + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1194,9 +1331,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not self._downloader.params.get('noplaylist'): entries = [] feed_ids = [] - multifeed_metadata_list = compat_urllib_parse_unquote_plus(video_info['multifeed_metadata_list'][0]) + multifeed_metadata_list = video_info['multifeed_metadata_list'][0] for feed in multifeed_metadata_list.split(','): - feed_data = compat_parse_qs(feed) + # Unquote should take place before split on comma (,) since textual + # fields may contain comma as well (see + # https://github.com/rg3/youtube-dl/issues/8536) + feed_data = compat_parse_qs(compat_urllib_parse_unquote_plus(feed)) entries.append({ '_type': 'url_transparent', 'ie_key': 'Youtube', @@ -1231,9 +1371,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # uploader_id video_uploader_id = None - mobj = re.search(r'', video_webpage) + video_uploader_url = None + mobj = re.search( + r'', + video_webpage) if mobj is not None: - video_uploader_id = mobj.group(1) + video_uploader_id = mobj.group('uploader_id') + video_uploader_url = mobj.group('uploader_url') else: self._downloader.report_warning('unable to extract uploader nickname') @@ -1261,6 +1405,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) + video_license = self._html_search_regex( + r'