X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=456d3cb0fc2eb9df9e3b56333d5df84a0381702a;hb=e0df6211cc9364f62406b2907fa830847324db53;hp=4c9bab4597b0f85e5fc437dfaadd3134e5974d35;hpb=8d212e604a86da3c924ab15fe8045ab748a8183d;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4c9bab459..456d3cb0f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1,12 +1,19 @@ # coding: utf-8 +import collections +import itertools +import io import json import netrc import re import socket -import itertools +import string +import struct +import traceback +import zlib from .common import InfoExtractor, SearchInfoExtractor +from .subtitles import SubtitlesInfoExtractor from ..utils import ( compat_http_client, compat_parse_qs, @@ -130,13 +137,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._confirm_age() -class YoutubeIE(YoutubeBaseInfoExtractor): + +class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): IE_DESC = u'YouTube.com' _VALID_URL = r"""^ ( (?:https?://)? # http(s):// (optional) - (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains + (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/| + tube\.majestyc\.net/| + youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains (?:.*?\#/)? # handle anchor (#/) redirect urls (?: # the various things that can precede the ID: (?:(?:v|embed|e)/) # v/ or embed/ or e/ @@ -146,26 +155,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) v= ) - )? # optional -> youtube.com/xxxx is OK + )) + |youtu\.be/ # just youtu.be/xxxx + ) )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]+) # here is it! the YouTube video ID + ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? # if we found the ID, everything can follow $""" _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13', + # Apple HTTP Live Streaming + '96', '95', '94', '93', '92', '132', '151', + # 3D '85', '84', '102', '83', '101', '82', '100', + # Dash video + '138', '137', '248', '136', '247', '135', '246', + '245', '244', '134', '243', '133', '242', '160', + # Dash audio + '141', '172', '140', '171', '139', ] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13', - '95', '94', '93', '92', '132', '151', + _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13', + # Apple HTTP Live Streaming + '96', '95', '94', '93', '92', '132', '151', + # 3D '85', '102', '84', '101', '83', '100', '82', + # Dash video + '138', '248', '137', '247', '136', '246', '245', + '244', '135', '243', '134', '242', '133', '160', + # Dash audio + '172', '141', '171', '140', '139', ] + _video_formats_map = { + 'flv': ['35', '34', '6', '5'], + '3gp': ['36', '17', '13'], + 'mp4': ['38', '37', '22', '18'], + 'webm': ['46', '45', '44', '43'], + } _video_extensions = { '13': '3gp', - '17': 'mp4', + '17': '3gp', '18': 'mp4', '22': 'mp4', + '36': '3gp', '37': 'mp4', '38': 'mp4', '43': 'webm', @@ -181,8 +213,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '100': 'webm', '101': 'webm', '102': 'webm', - - # videos that use m3u8 + + # Apple HTTP Live Streaming '92': 'mp4', '93': 'mp4', '94': 'mp4', @@ -190,6 +222,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '96': 'mp4', '132': 'mp4', '151': 'mp4', + + # Dash mp4 + '133': 'mp4', + '134': 'mp4', + '135': 'mp4', + '136': 'mp4', + '137': 'mp4', + '138': 'mp4', + '139': 'mp4', + '140': 'mp4', + '141': 'mp4', + '160': 'mp4', + + # Dash webm + '171': 'webm', + '172': 'webm', + '242': 'webm', + '243': 'webm', + '244': 'webm', + '245': 'webm', + '246': 'webm', + '247': 'webm', + '248': 'webm', } _video_dimensions = { '5': '240x400', @@ -200,6 +255,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '22': '720x1280', '34': '360x640', '35': '480x854', + '36': '240x320', '37': '1080x1920', '38': '3072x4096', '43': '360x640', @@ -217,11 +273,58 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '96': '1080p', '100': '360p', '101': '480p', - '102': '720p', + '102': '720p', '132': '240p', '151': '72p', + '133': '240p', + '134': '360p', + '135': '480p', + '136': '720p', + '137': '1080p', + '138': '>1080p', + '139': '48k', + '140': '128k', + '141': '256k', + '160': '192p', + '171': '128k', + '172': '256k', + '242': '240p', + '243': '360p', + '244': '480p', + '245': '480p', + '246': '480p', + '247': '720p', + '248': '1080p', + } + _special_itags = { + '82': '3D', + '83': '3D', + '84': '3D', + '85': '3D', + '100': '3D', + '101': '3D', + '102': '3D', + '133': 'DASH Video', + '134': 'DASH Video', + '135': 'DASH Video', + '136': 'DASH Video', + '137': 'DASH Video', + '138': 'DASH Video', + '139': 'DASH Audio', + '140': 'DASH Audio', + '141': 'DASH Audio', + '160': 'DASH Video', + '171': 'DASH Audio', + '172': 'DASH Audio', + '242': 'DASH Video', + '243': 'DASH Video', + '244': 'DASH Video', + '245': 'DASH Video', + '246': 'DASH Video', + '247': 'DASH Video', + '248': 'DASH Video', } - _3d_itags = ['85', '84', '102', '83', '101', '82', '100'] + IE_NAME = u'youtube' _TESTS = [ { @@ -254,7 +357,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:b085c9804f5ab69f4adea963a2dceb3c", + u"description": u"md5:3e2666e0a55044490499ea45fe9037b7", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } @@ -292,9 +395,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False + if YoutubePlaylistIE.suitable(url): return False return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def __init__(self, *args, **kwargs): + super(YoutubeIE, self).__init__(*args, **kwargs) + self._jsplayer_cache = {} + def report_video_webpage_download(self, video_id): """Report attempt to download video webpage.""" self.to_screen(u'%s: Downloading video webpage' % video_id) @@ -303,19 +410,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """Report attempt to download video info webpage.""" self.to_screen(u'%s: Downloading video info webpage' % video_id) - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Checking available subtitles' % video_id) - - def report_video_subtitles_request(self, video_id, sub_lang, format): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - def report_information_extraction(self, video_id): """Report attempt to extract video information.""" self.to_screen(u'%s: Extracting video information' % video_id) @@ -328,9 +422,563 @@ class YoutubeIE(YoutubeBaseInfoExtractor): """Indicate the download will use the RTMP protocol.""" self.to_screen(u'RTMP download detected') - def _decrypt_signature(self, s): + def _extract_signature_function(self, video_id, player_url): + id_m = re.match(r'.*-(?P[^.]+)\.(?P[^.]+)$', player_url) + player_type = id_m.group('ext') + player_id = id_m.group('id') + + if player_type == 'js': + code = self._download_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + return self._parse_sig_js(code) + elif player_tpye == 'swf': + urlh = self._request_webpage( + player_url, video_id, + note=u'Downloading %s player %s' % (player_type, jsplayer_id), + errnote=u'Download of %s failed' % player_url) + code = urlh.read() + return self._parse_sig_swf(code) + else: + assert False, 'Invalid player type %r' % player_type + + def _parse_sig_js(self, jscode): + funcname = self._search_regex( + r'signature=([a-zA-Z]+)', jscode, + u'Initial JS player signature function name') + + functions = {} + + def argidx(varname): + return string.lowercase.index(varname) + + def interpret_statement(stmt, local_vars, allow_recursion=20): + if allow_recursion < 0: + raise ExctractorError(u'Recursion limit reached') + + if stmt.startswith(u'var '): + stmt = stmt[len(u'var '):] + ass_m = re.match(r'^(?P[a-z]+)(?:\[(?P[^\]]+)\])?' + + r'=(?P.*)$', stmt) + if ass_m: + if ass_m.groupdict().get('index'): + def assign(val): + lvar = local_vars[ass_m.group('out')] + idx = interpret_expression(ass_m.group('index'), + local_vars, allow_recursion) + assert isinstance(idx, int) + lvar[idx] = val + return val + expr = ass_m.group('expr') + else: + def assign(val): + local_vars[ass_m.group('out')] = val + return val + expr = ass_m.group('expr') + elif stmt.startswith(u'return '): + assign = lambda v: v + expr = stmt[len(u'return '):] + else: + raise ExtractorError( + u'Cannot determine left side of statement in %r' % stmt) + + v = interpret_expression(expr, local_vars, allow_recursion) + return assign(v) + + def interpret_expression(expr, local_vars, allow_recursion): + if expr.isdigit(): + return int(expr) + + if expr.isalpha(): + return local_vars[expr] + + m = re.match(r'^(?P[a-z]+)\.(?P.*)$', expr) + if m: + member = m.group('member') + val = local_vars[m.group('in')] + if member == 'split("")': + return list(val) + if member == 'join("")': + return u''.join(val) + if member == 'length': + return len(val) + if member == 'reverse()': + return val[::-1] + slice_m = re.match(r'slice\((?P.*)\)', member) + if slice_m: + idx = interpret_expression( + slice_m.group('idx'), local_vars, allow_recursion-1) + return val[idx:] + + m = re.match( + r'^(?P[a-z]+)\[(?P.+)\]$', expr) + if m: + val = local_vars[m.group('in')] + idx = interpret_expression(m.group('idx'), local_vars, + allow_recursion-1) + return val[idx] + + m = re.match(r'^(?P.+?)(?P[%])(?P.+?)$', expr) + if m: + a = interpret_expression(m.group('a'), + local_vars, allow_recursion) + b = interpret_expression(m.group('b'), + local_vars, allow_recursion) + return a % b + + m = re.match( + r'^(?P[a-zA-Z]+)\((?P[a-z0-9,]+)\)$', expr) + if m: + fname = m.group('func') + if fname not in functions: + functions[fname] = extract_function(fname) + argvals = [int(v) if v.isdigit() else local_vars[v] + for v in m.group('args').split(',')] + return functions[fname](argvals) + raise ExtractorError(u'Unsupported JS expression %r' % expr) + + def extract_function(funcname): + func_m = re.search( + r'function ' + re.escape(funcname) + + r'\((?P[a-z,]+)\){(?P[^}]+)}', + jscode) + argnames = func_m.group('args').split(',') + + def resf(args): + local_vars = dict(zip(argnames, args)) + for stmt in func_m.group('code').split(';'): + res = interpret_statement(stmt, local_vars) + return res + return resf + + initial_function = extract_function(funcname) + return lambda s: initial_function([s]) + + def _parse_sig_swf(self, file_contents): + if file_contents[1:3] != b'WS': + raise ExtractorError( + u'Not an SWF file; header is %r' % file_contents[:3]) + if file_contents[:1] == b'C': + content = zlib.decompress(file_contents[8:]) + else: + raise NotImplementedError(u'Unsupported compression format %r' % + file_contents[:1]) + + def extract_tags(content): + pos = 0 + while pos < len(content): + header16 = struct.unpack('> 6 + tag_len = header16 & 0x3f + if tag_len == 0x3f: + tag_len = struct.unpack('> 4 + methods = {} + if kind in [0x00, 0x06]: # Slot or Const + _, pos = u30(pos=pos) # Slot id + type_name_idx, pos = u30(pos=pos) + vindex, pos = u30(pos=pos) + if vindex != 0: + _, pos = read_byte(pos=pos) # vkind + elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter + _, pos = u30(pos=pos) # disp_id + method_idx, pos = u30(pos=pos) + methods[multinames[trait_name_idx]] = method_idx + elif kind == 0x04: # Class + _, pos = u30(pos=pos) # slot_id + _, pos = u30(pos=pos) # classi + elif kind == 0x05: # Function + _, pos = u30(pos=pos) # slot_id + function_idx, pos = u30(pos=pos) + methods[function_idx] = multinames[trait_name_idx] + else: + raise ExtractorError(u'Unsupported trait kind %d' % kind) + + if attrs & 0x4 != 0: # Metadata present + metadata_count, pos = u30(pos=pos) + for _c3 in range(metadata_count): + _, pos = u30(pos=pos) + + return (methods, pos) + + # Classes + TARGET_CLASSNAME = u'SignatureDecipher' + searched_idx = multinames.index(TARGET_CLASSNAME) + searched_class_id = None + class_count, p = u30() + for class_id in range(class_count): + name_idx, p = u30() + if name_idx == searched_idx: + # We found the class we're looking for! + searched_class_id = class_id + _, p = u30() # super_name idx + flags, p = read_byte() + if flags & 0x08 != 0: # Protected namespace is present + protected_ns_idx, p = u30() + intrf_count, p = u30() + for _c2 in range(intrf_count): + _, p = u30() + _, p = u30() # iinit + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + if searched_class_id is None: + raise ExtractorError(u'Target class %r not found' % + TARGET_CLASSNAME) + + method_names = {} + method_idxs = {} + for class_id in range(class_count): + _, p = u30() # cinit + trait_count, p = u30() + for _c2 in range(trait_count): + trait_methods, p = parse_traits_info() + if class_id == searched_class_id: + method_names.update(trait_methods.items()) + method_idxs.update(dict( + (idx, name) + for name, idx in trait_methods.items())) + + # Scripts + script_count, p = u30() + for _c in range(script_count): + _, p = u30() # init + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + # Method bodies + method_body_count, p = u30() + Method = collections.namedtuple('Method', ['code', 'local_count']) + methods = {} + for _c in range(method_body_count): + method_idx, p = u30() + max_stack, p = u30() + local_count, p = u30() + init_scope_depth, p = u30() + max_scope_depth, p = u30() + code_length, p = u30() + if method_idx in method_idxs: + m = Method(code_tag[p:p+code_length], local_count) + methods[method_idxs[method_idx]] = m + p += code_length + exception_count, p = u30() + for _c2 in range(exception_count): + _, p = u30() # from + _, p = u30() # to + _, p = u30() # target + _, p = u30() # exc_type + _, p = u30() # var_name + trait_count, p = u30() + for _c2 in range(trait_count): + _, p = parse_traits_info() + + assert p == len(code_tag) + assert len(methods) == len(method_idxs) + + method_pyfunctions = {} + + def extract_function(func_name): + if func_name in method_pyfunctions: + return method_pyfunctions[func_name] + if func_name not in methods: + raise ExtractorError(u'Cannot find function %r' % func_name) + m = methods[func_name] + + def resfunc(args): + print('Entering function %s(%r)' % (func_name, args)) + registers = ['(this)'] + list(args) + [None] * m.local_count + stack = [] + coder = io.BytesIO(m.code) + while True: + opcode = struct.unpack('!B', coder.read(1))[0] + if opcode == 208: # getlocal_0 + stack.append(registers[0]) + elif opcode == 209: # getlocal_1 + stack.append(registers[1]) + elif opcode == 210: # getlocal_2 + stack.append(registers[2]) + elif opcode == 36: # pushbyte + v = struct.unpack('!B', coder.read(1))[0] + stack.append(v) + elif opcode == 44: # pushstring + idx = u30(coder) + stack.append(constant_strings[idx]) + elif opcode == 48: # pushscope + # We don't implement the scope register, so we'll just + # ignore the popped value + stack.pop() + elif opcode == 70: # callproperty + index = u30(coder) + mname = multinames[index] + arg_count = u30(coder) + args = list(reversed( + [stack.pop() for _ in range(arg_count)])) + obj = stack.pop() + if mname == u'split': + assert len(args) == 1 + assert isinstance(args[0], compat_str) + assert isinstance(obj, compat_str) + if args[0] == u'': + res = list(obj) + else: + res = obj.split(args[0]) + stack.append(res) + elif mname in method_pyfunctions: + stack.append(method_pyfunctions[mname](args)) + else: + raise NotImplementedError( + u'Unsupported property %r on %r' + % (mname, obj)) + elif opcode == 93: # findpropstrict + index = u30(coder) + mname = multinames[index] + res = extract_function(mname) + stack.append(res) + elif opcode == 97: # setproperty + index = u30(coder) + value = stack.pop() + idx = stack.pop() + obj = stack.pop() + assert isinstance(obj, list) + assert isinstance(idx, int) + obj[idx] = value + elif opcode == 98: # getlocal + index = u30(coder) + stack.append(registers[index]) + elif opcode == 99: # setlocal + index = u30(coder) + value = stack.pop() + registers[index] = value + elif opcode == 102: # getproperty + index = u30(coder) + pname = multinames[index] + if pname == u'length': + obj = stack.pop() + assert isinstance(obj, list) + stack.append(len(obj)) + else: # Assume attribute access + idx = stack.pop() + assert isinstance(idx, int) + obj = stack.pop() + assert isinstance(obj, list) + stack.append(obj[idx]) + elif opcode == 128: # coerce + _ = u30(coder) + elif opcode == 133: # coerce_s + assert isinstance(stack[-1], (type(None), compat_str)) + elif opcode == 164: # modulo + value2 = stack.pop() + value1 = stack.pop() + res = value1 % value2 + stack.append(res) + elif opcode == 214: # setlocal_2 + registers[2] = stack.pop() + elif opcode == 215: # setlocal_3 + registers[3] = stack.pop() + else: + raise NotImplementedError( + u'Unsupported opcode %d' % opcode) + + method_pyfunctions[func_name] = resfunc + return resfunc + + initial_function = extract_function(u'decipher') + return lambda s: initial_function([s]) + + def _decrypt_signature(self, s, video_id, jsplayer_url, age_gate=False): """Turn the encrypted s field into a working signature""" + if jsplayer_url is not None: + try: + if jsplayer_url not in self._jsplayer_cache: + self._jsplayer_cache[jsplayer_url] = self._extract_signature_function( + video_id, jsplayer_url + ) + return self._jsplayer_cache[jsplayer_url]([s]) + except Exception as e: + tb = traceback.format_exc() + self._downloader.report_warning(u'Automatic signature extraction failed: ' + tb) + + self._downloader.report_warning(u'Warning: Falling back to static signature algorithm') + + if age_gate: + # The videos with age protection use another player, so the + # algorithms can be different. + if len(s) == 86: + return s[2:63] + s[82] + s[64:82] + s[63] + if len(s) == 92: return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83] elif len(s) == 90: @@ -338,21 +986,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): elif len(s) == 89: return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1] elif len(s) == 88: - return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12] + return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28] elif len(s) == 87: return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:] elif len(s) == 86: - return s[5:20] + s[2] + s[21:] + return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53] elif len(s) == 85: - return s[83:34:-1] + s[0] + s[33:27:-1] + s[3] + s[26:19:-1] + s[34] + s[18:3:-1] + s[27] + return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84] elif len(s) == 84: - return s[83:27:-1] + s[0] + s[26:5:-1] + s[2:0:-1] + s[27] + return s[81:36:-1] + s[0] + s[35:2:-1] elif len(s) == 83: return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0] elif len(s) == 82: - return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34] + return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54] elif len(s) == 81: return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] + elif len(s) == 80: + return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80] elif len(s) == 79: return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9] @@ -368,111 +1018,85 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Fallback to the other algortihms return self._decrypt_signature(s) - def _get_available_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None) - sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + sub_list = self._download_webpage( + 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id, + video_id, note=False) + except ExtractorError as err: + self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err)) + return {} + lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + + sub_lang_list = {} + for l in lang_list: + lang = l[1] + params = compat_urllib_parse.urlencode({ + 'lang': lang, + 'v': video_id, + 'fmt': self._downloader.params.get('subtitlesformat'), + }) + url = u'http://www.youtube.com/api/timedtext?' + params + sub_lang_list[lang] = url if not sub_lang_list: - return (u'video doesn\'t have subtitles', None) + self._downloader.report_warning(u'video doesn\'t have subtitles') + return {} return sub_lang_list - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ - Return tuple: - (error_message, sub_lang, sub) - """ - self.report_video_subtitles_request(video_id, sub_lang, format) - params = compat_urllib_parse.urlencode({ - 'lang': sub_lang, - 'name': sub_name, - 'v': video_id, - 'fmt': format, - }) - url = 'http://www.youtube.com/api/timedtext?' + params - try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None, None) - if not sub: - return (u'Did not fetch video subtitles', None, None) - return (None, sub_lang, sub) - - def _request_automatic_caption(self, video_id, webpage): + def _get_available_automatic_caption(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" - sub_lang = self._downloader.params.get('subtitleslang') or 'en' sub_format = self._downloader.params.get('subtitlesformat') self.to_screen(u'%s: Looking for automatic captions' % video_id) mobj = re.search(r';ytplayer.config = ({.*?});', webpage) - err_msg = u'Couldn\'t find automatic captions for "%s"' % sub_lang + err_msg = u'Couldn\'t find automatic captions for %s' % video_id if mobj is None: - return [(err_msg, None, None)] + self._downloader.report_warning(err_msg) + return {} player_config = json.loads(mobj.group(1)) try: args = player_config[u'args'] caption_url = args[u'ttsurl'] timestamp = args[u'timestamp'] - params = compat_urllib_parse.urlencode({ - 'lang': 'en', - 'tlang': sub_lang, - 'fmt': sub_format, - 'ts': timestamp, - 'kind': 'asr', + # We get the available subtitles + list_params = compat_urllib_parse.urlencode({ + 'type': 'list', + 'tlangs': 1, + 'asrs': 1, }) - subtitles_url = caption_url + '&' + params - sub = self._download_webpage(subtitles_url, video_id, u'Downloading automatic captions') - return [(None, sub_lang, sub)] - except KeyError: - return [(err_msg, None, None)] - - def _extract_subtitle(self, video_id): - """ - Return a list with a tuple: - [(error_message, sub_lang, sub)] - """ - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' - else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] - - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - return [subtitle] - - def _extract_all_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - subtitles = [] - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - subtitles.append(subtitle) - return subtitles + list_url = caption_url + '&' + list_params + list_page = self._download_webpage(list_url, video_id) + caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8')) + original_lang_node = caption_list.find('track') + if original_lang_node.attrib.get('kind') != 'asr' : + self._downloader.report_warning(u'Video doesn\'t have automatic captions') + return {} + original_lang = original_lang_node.attrib['lang_code'] + + sub_lang_list = {} + for lang_node in caption_list.findall('target'): + sub_lang = lang_node.attrib['lang_code'] + params = compat_urllib_parse.urlencode({ + 'lang': original_lang, + 'tlang': sub_lang, + 'fmt': sub_format, + 'ts': timestamp, + 'kind': 'asr', + }) + sub_lang_list[sub_lang] = caption_url + '&' + params + return sub_lang_list + # An extractor error can be raise by the download process if there are + # no automatic captions but there are subtitles + except (KeyError, ExtractorError): + self._downloader.report_warning(err_msg) + return {} def _print_formats(self, formats): print('Available formats:') for x in formats: print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'), - ' (3D)' if x in self._3d_itags else '')) + ' ('+self._special_itags[x]+')' if x in self._special_itags else '')) def _extract_id(self, url): mobj = re.match(self._VALID_URL, url, re.VERBOSE) @@ -507,13 +1131,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats else: # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality + # available in the specified format. For example, + # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. + # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'. + # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'. req_formats = req_format.split('/') video_url_list = None for rf in req_formats: if rf in url_map: video_url_list = [(rf, url_map[rf])] break + if rf in self._video_formats_map: + for srf in self._video_formats_map[rf]: + if srf in url_map: + video_url_list = [(srf, url_map[srf])] + break + else: + continue + break if video_url_list is None: raise ExtractorError(u'requested format not available') return video_url_list @@ -554,7 +1190,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) + mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) if mobj is not None: player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) else: @@ -653,30 +1289,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_description = u'' # subtitles - video_subtitles = None - - if self._downloader.params.get('writesubtitles', False): - video_subtitles = self._extract_subtitle(video_id) - if video_subtitles: - (sub_error, sub_lang, sub) = video_subtitles[0] - if sub_error: - self._downloader.report_warning(sub_error) - - if self._downloader.params.get('writeautomaticsub', False): - video_subtitles = self._request_automatic_caption(video_id, video_webpage) - (sub_error, sub_lang, sub) = video_subtitles[0] - if sub_error: - self._downloader.report_warning(sub_error) - - if self._downloader.params.get('allsubtitles', False): - video_subtitles = self._extract_all_subtitles(video_id) - for video_subtitle in video_subtitles: - (sub_error, sub_lang, sub) = video_subtitle - if sub_error: - self._downloader.report_warning(sub_error) + video_subtitles = self.extract_subtitles(video_id, video_webpage) if self._downloader.params.get('listsubtitles', False): - self._list_available_subtitles(video_id) + self._list_available_subtitles(video_id, video_webpage) return if 'length_seconds' not in video_info: @@ -699,6 +1315,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if m_s is not None: self.to_screen(u'%s: Encrypted signatures detected.' % video_id) video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']] + m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u'')) + if m_s is not None: + if 'url_encoded_fmt_stream_map' in video_info: + video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts'] + else: + video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']] + elif 'adaptive_fmts' in video_info: + if 'url_encoded_fmt_stream_map' in video_info: + video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0] + else: + video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts'] except ValueError: pass @@ -716,24 +1343,31 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'sig' in url_data: url += '&signature=' + url_data['sig'][0] elif 's' in url_data: + encrypted_sig = url_data['s'][0] if self._downloader.params.get('verbose'): - s = url_data['s'][0] if age_gate: - player_version = self._search_regex(r'ad3-(.+?)\.swf', - video_info['ad3_module'][0] if 'ad3_module' in video_info else 'NOT FOUND', + player_version = self._search_regex(r'-(.+)\.swf$', + player_url if player_url else 'NOT FOUND', 'flash player', fatal=False) - player = 'flash player %s' % player_version + player_desc = 'flash player %s' % player_version else: - player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage, + player_version = self._search_regex(r'html5player-(.+?)\.js', video_webpage, 'html5 player', fatal=False) - parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.')) + player_desc = u'html5 player %s' % player_version + + parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.')) self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' % - (len(s), parts_sizes, url_data['itag'][0], player)) - encrypted_sig = url_data['s'][0] + (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc)) + if age_gate: - signature = self._decrypt_signature_age_gate(encrypted_sig) + jsplayer_url = None else: - signature = self._decrypt_signature(encrypted_sig) + jsplayer_url_json = self._search_regex( + r'"assets":.+?"js":\s*("[^"]+")', + video_webpage, u'JS player URL') + jsplayer_url = json.loads(jsplayer_url_json) + + signature = self._decrypt_signature(encrypted_sig, video_id, jsplayer_url, age_gate) url += '&signature=' + signature if 'ratebypass' not in url: url += '&ratebypass=yes' @@ -758,7 +1392,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension, self._video_dimensions.get(format_param, '???'), - ' (3D)' if format_param in self._3d_itags else '') + ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '') results.append({ 'id': video_id, @@ -834,8 +1468,11 @@ class YoutubePlaylistIE(InfoExtractor): for entry in response['feed']['entry']: index = entry['yt$position']['$t'] - if 'media$group' in entry and 'media$player' in entry['media$group']: - videos.append((index, entry['media$group']['media$player']['url'])) + if 'media$group' in entry and 'yt$videoid' in entry['media$group']: + videos.append(( + index, + 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] + )) videos = [v[1] for v in sorted(videos)] @@ -901,13 +1538,20 @@ class YoutubeChannelIE(InfoExtractor): class YoutubeUserIE(InfoExtractor): IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)' _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s' _GDATA_PAGE_SIZE = 50 - _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' - _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]' + _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json' IE_NAME = u'youtube:user' + @classmethod + def suitable(cls, url): + # Don't return True if the url can be extracted with other youtube + # extractor, the regex would is too permissive and it would match. + other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls) + if any(ie.suitable(url) for ie in other_ies): return False + else: return super(YoutubeUserIE, cls).suitable(url) + def _real_extract(self, url): # Extract username mobj = re.match(self._VALID_URL, url) @@ -930,13 +1574,18 @@ class YoutubeUserIE(InfoExtractor): page = self._download_webpage(gdata_url, username, u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE)) + try: + response = json.loads(page) + except ValueError as err: + raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) + if 'entry' not in response['feed']: + # Number of videos is a multiple of self._MAX_RESULTS + break + # Extract video identifiers ids_in_page = [] - - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - if mobj.group(1) not in ids_in_page: - ids_in_page.append(mobj.group(1)) - + for entry in response['feed']['entry']: + ids_in_page.append(entry['id']['$t'].split('/')[-1]) video_ids.extend(ids_in_page) # A little optimization - if current page is not @@ -1075,7 +1724,7 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' - _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?' + _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?' _LOGIN_REQUIRED = True def _real_extract(self, url):