14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
# Base class shared by the YouTube extractors: bundles the endpoints and
# helpers for login, UI-language selection and age-gate confirmation.
39 class YoutubeBaseInfoExtractor(InfoExtractor):
40 """Provide base functions for Youtube extractors"""
# POST target used by _login() for Google account sign-in.
41 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
# Visited by _set_language() to persist hl=en / gl=US for later requests.
42 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
# Form target used by _confirm_age().
43 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name looked up in the user's .netrc for stored credentials.
44 _NETRC_MACHINE = 'youtube'
45 # If True it will raise an error if no login info is provided
46 _LOGIN_REQUIRED = False
48 def report_lang(self):
49 """Report attempt to set language."""
50 self.to_screen(u'Setting language')
# NOTE(review): this excerpt is gapped (original lines 54-55 are missing,
# presumably the try/report lines) -- code below kept byte-identical.
52 def _set_language(self):
# Hitting _LANG_URL makes YouTube persist English/US settings via cookies.
53 request = compat_urllib_request.Request(self._LANG_URL)
56 compat_urllib_request.urlopen(request).read()
57 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Non-fatal: extraction can proceed without the language cookie.
58 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# Login helper body (the def line is outside this excerpt). Fetches the
# sign-in page, extracts the GALX token, POSTs the credentials form and
# warns (does not raise) on failure unless _LOGIN_REQUIRED handled above.
63 (username, password) = self._get_login_info()
64 # No authentication to be performed
66 if self._LOGIN_REQUIRED:
67 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
70 request = compat_urllib_request.Request(self._LOGIN_URL)
72 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# Hidden form token that must be echoed back in the login POST.
77 galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
78 login_page, u'Login GALX parameter')
82 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
86 u'PersistentCookie': u'yes',
88 u'bgresponse': u'js_disabled',
89 u'checkConnection': u'',
90 u'checkedDomains': u'youtube',
95 u'signIn': u'Sign in',
97 u'service': u'youtube',
101 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
103 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
104 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
105 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
108 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the sign-in form is still present in the response, login failed.
109 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
110 self._downloader.report_warning(u'unable to log in: bad username or password')
112 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
113 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Submits the age-verification form at _AGE_URL. Unlike _set_language and
# _login, a failure here raises, because age-gated extraction cannot proceed.
117 def _confirm_age(self):
120 'action_confirm': 'Confirm',
122 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
124 self.report_age_confirmation()
125 compat_urllib_request.urlopen(request).read().decode('utf-8')
126 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
127 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# One-time setup hook: sets language then logs in; bails out early when any
# step fails (the early-return bodies fall outside this excerpt).
130 def _real_initialize(self):
131 if self._downloader is None:
133 if not self._set_language():
135 if not self._login():
# Main single-video extractor. _VALID_URL matches watch/embed/short URLs and
# captures the 11-character video ID; the format tables below rank itags.
140 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
141 IE_DESC = u'YouTube.com'
144 (?:https?://)? # http(s):// (optional)
145 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
146 tube\.majestyc\.net/|
147 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
148 (?:.*?\#/)? # handle anchor (#/) redirect urls
149 (?: # the various things that can precede the ID:
150 (?:(?:v|embed|e)/) # v/ or embed/ or e/
151 |(?: # or the v= param in all its forms
152 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
153 (?:\?|\#!?) # the params delimiter ? or # or #!
154 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
158 |youtu\.be/ # just youtu.be/xxxx
160 )? # all until now is optional -> you can pass the naked ID
161 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
162 (?(1).+)? # if we found the ID, everything can follow
# Used by _real_extract to unwrap redirect URLs (e.g. age verification).
164 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
165 # Listed in order of quality
166 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
167 # Apple HTTP Live Streaming
168 '96', '95', '94', '93', '92', '132', '151',
170 '85', '84', '102', '83', '101', '82', '100',
172 '138', '137', '248', '136', '247', '135', '246',
173 '245', '244', '134', '243', '133', '242', '160',
175 '141', '172', '140', '171', '139',
# Same itags re-ranked so free (webm) formats win ties; selected by the
# 'prefer_free_formats' option in _get_video_url_list.
177 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
178 # Apple HTTP Live Streaming
179 '96', '95', '94', '93', '92', '132', '151',
181 '85', '102', '84', '101', '83', '100', '82',
183 '138', '248', '137', '247', '136', '246', '245',
184 '244', '135', '243', '134', '242', '133', '160',
186 '172', '141', '171', '140', '139',
# Container name -> itags in descending quality; used to resolve format
# requests like '-f mp4' in _get_video_url_list.
188 _video_formats_map = {
189 'flv': ['35', '34', '6', '5'],
190 '3gp': ['36', '17', '13'],
191 'mp4': ['38', '37', '22', '18'],
192 'webm': ['46', '45', '44', '43'],
194 _video_extensions = {
216 # Apple HTTP Live Streaming
250 _video_dimensions = {
# Self-test fixtures consumed by the test harness.
332 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
333 u"file": u"BaW_jenozKc.mp4",
335 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
336 u"uploader": u"Philipp Hagemeister",
337 u"uploader_id": u"phihag",
338 u"upload_date": u"20121002",
339 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
343 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
344 u"file": u"UxxajLWwzqY.mp4",
345 u"note": u"Test generic use_cipher_signature video (#897)",
347 u"upload_date": u"20120506",
348 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
349 u"description": u"md5:5b292926389560516e384ac437c0ec07",
350 u"uploader": u"Icona Pop",
351 u"uploader_id": u"IconaPop"
355 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
356 u"file": u"07FYdnEawAQ.mp4",
357 u"note": u"Test VEVO video with age protection (#956)",
359 u"upload_date": u"20130703",
360 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
361 u"description": u"md5:64249768eec3bc4276236606ea996373",
362 u"uploader": u"justintimberlakeVEVO",
363 u"uploader_id": u"justintimberlakeVEVO"
def suitable(cls, url):
    """Return True when *url* should be handled by this single-video IE.

    Playlist URLs can also match _VALID_URL, so the playlist extractor
    gets first refusal before the verbose-mode regex match is attempted.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
375 def __init__(self, *args, **kwargs):
376 super(YoutubeIE, self).__init__(*args, **kwargs)
# Per-instance cache: (player_url, signature_length) -> decrypt function,
# filled lazily by _decrypt_signature.
377 self._player_cache = {}
379 def report_video_webpage_download(self, video_id):
380 """Report attempt to download video webpage."""
381 self.to_screen(u'%s: Downloading video webpage' % video_id)
383 def report_video_info_webpage_download(self, video_id):
384 """Report attempt to download video info webpage."""
385 self.to_screen(u'%s: Downloading video info webpage' % video_id)
387 def report_information_extraction(self, video_id):
388 """Report attempt to extract video information."""
389 self.to_screen(u'%s: Extracting video information' % video_id)
391 def report_unavailable_format(self, video_id, format):
392 """Report that the requested format is not available for this video."""
393 self.to_screen(u'%s: Format %s not available' % (video_id, format))
395 def report_rtmp_download(self):
396 """Indicate the download will use the RTMP protocol."""
397 self.to_screen(u'RTMP download detected')
# Build (or load from the on-disk cache) a Python function that deciphers a
# signature of length *slen* using the JS or SWF player at *player_url*.
# The cached representation is a JSON list of source indices, so the cached
# function is a pure permutation: s -> ''.join(s[i] for i in spec).
399 def _extract_signature_function(self, video_id, player_url, slen):
# Player URLs end in '-<id>.<ext>'; ext selects the JS or SWF parser.
400 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
402 player_type = id_m.group('ext')
403 player_id = id_m.group('id')
405 # Read from filesystem cache
406 func_id = '%s_%s_%d' % (player_type, player_id, slen)
# Guard against path traversal in the cache filename.
407 assert os.path.basename(func_id) == func_id
408 cache_dir = get_cachedir(self._downloader.params)
410 cache_enabled = cache_dir is not None
412 cache_fn = os.path.join(os.path.expanduser(cache_dir),
416 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
417 cache_spec = json.load(cachef)
418 return lambda s: u''.join(s[i] for i in cache_spec)
420 pass # No cache available
422 if player_type == 'js':
423 code = self._download_webpage(
424 player_url, video_id,
425 note=u'Downloading %s player %s' % (player_type, player_id),
426 errnote=u'Download of %s failed' % player_url)
427 res = self._parse_sig_js(code)
428 elif player_type == 'swf':
429 urlh = self._request_webpage(
430 player_url, video_id,
431 note=u'Downloading %s player %s' % (player_type, player_id),
432 errnote=u'Download of %s failed' % player_url)
434 res = self._parse_sig_swf(code)
436 assert False, 'Invalid player type %r' % player_type
# Probe the extracted function with the identity string 0..slen-1 to
# capture the permutation it applies, then persist it as JSON.
440 test_string = u''.join(map(compat_chr, range(slen)))
441 cache_res = res(test_string)
442 cache_spec = [ord(c) for c in cache_res]
444 os.makedirs(os.path.dirname(cache_fn))
445 except OSError as ose:
446 if ose.errno != errno.EEXIST:
448 write_json_file(cache_spec, cache_fn)
# Cache write failures are only warnings; the in-memory function works.
450 tb = traceback.format_exc()
451 self._downloader.report_warning(
452 u'Writing cache to %r failed: %s' % (cache_fn, tb))
# Debug aid (--youtube-print-sig-code): probe *func* with the identity
# string and print equivalent Python slice expressions for the permutation.
456 def _print_sig_code(self, func, slen):
457 def gen_sig_code(idxs):
# Render a run of indices with constant step as a single s[a:b:c] slice.
458 def _genslice(start, end, step):
459 starts = u'' if start == 0 else str(start)
460 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
461 steps = u'' if step == 1 else (u':%d' % step)
462 return u's[%s%s%s]' % (starts, ends, steps)
465 start = '(Never used)' # Quelch pyflakes warnings - start will be
466 # set as soon as step is set
467 for i, prev in zip(idxs[1:], idxs[:-1]):
471 yield _genslice(start, prev, step)
474 if i - prev in [-1, 1]:
479 yield u's[%d]' % prev
483 yield _genslice(start, i, step)
# Apply func to the identity permutation 0..slen-1 to recover the mapping.
485 test_string = u''.join(map(compat_chr, range(slen)))
486 cache_res = func(test_string)
487 cache_spec = [ord(c) for c in cache_res]
488 expr_code = u' + '.join(gen_sig_code(cache_spec))
489 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
490 self.to_screen(u'Extracted signature function:\n' + code)
# Minimal interpreter for the player's JS signature function: finds the
# function assigned to 'signature=', then evaluates its statements with a
# tiny recursive-descent evaluator (assignments, member access, indexing,
# modulo, nested function calls). Returns a callable s -> deciphered s.
492 def _parse_sig_js(self, jscode):
493 funcname = self._search_regex(
494 r'signature=([a-zA-Z]+)', jscode,
495 u'Initial JS player signature function name')
# NOTE(review): string.lowercase exists only on Python 2 (Python 3 renamed
# it string.ascii_lowercase) -- confirm which interpreters must run this.
500 return string.lowercase.index(varname)
502 def interpret_statement(stmt, local_vars, allow_recursion=20):
503 if allow_recursion < 0:
504 raise ExtractorError(u'Recursion limit reached')
506 if stmt.startswith(u'var '):
507 stmt = stmt[len(u'var '):]
# Assignment: out=<expr> or out[index]=<expr>.
508 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
509 r'=(?P<expr>.*)$', stmt)
511 if ass_m.groupdict().get('index'):
513 lvar = local_vars[ass_m.group('out')]
514 idx = interpret_expression(ass_m.group('index'),
515 local_vars, allow_recursion)
516 assert isinstance(idx, int)
519 expr = ass_m.group('expr')
522 local_vars[ass_m.group('out')] = val
524 expr = ass_m.group('expr')
525 elif stmt.startswith(u'return '):
527 expr = stmt[len(u'return '):]
529 raise ExtractorError(
530 u'Cannot determine left side of statement in %r' % stmt)
532 v = interpret_expression(expr, local_vars, allow_recursion)
535 def interpret_expression(expr, local_vars, allow_recursion):
540 return local_vars[expr]
# Member access: only the handful of methods the player uses are modelled.
542 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
544 member = m.group('member')
545 val = local_vars[m.group('in')]
546 if member == 'split("")':
548 if member == 'join("")':
550 if member == 'length':
552 if member == 'reverse()':
554 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
556 idx = interpret_expression(
557 slice_m.group('idx'), local_vars, allow_recursion-1)
# Indexing: in[idx].
561 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
563 val = local_vars[m.group('in')]
564 idx = interpret_expression(m.group('idx'), local_vars,
# Binary operator (only % is supported).
568 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
570 a = interpret_expression(m.group('a'),
571 local_vars, allow_recursion)
572 b = interpret_expression(m.group('b'),
573 local_vars, allow_recursion)
# Function call: helper functions are extracted lazily and memoized.
577 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
579 fname = m.group('func')
580 if fname not in functions:
581 functions[fname] = extract_function(fname)
582 argvals = [int(v) if v.isdigit() else local_vars[v]
583 for v in m.group('args').split(',')]
584 return functions[fname](argvals)
585 raise ExtractorError(u'Unsupported JS expression %r' % expr)
587 def extract_function(funcname):
589 r'function ' + re.escape(funcname) +
590 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
592 argnames = func_m.group('args').split(',')
595 local_vars = dict(zip(argnames, args))
596 for stmt in func_m.group('code').split(';'):
597 res = interpret_statement(stmt, local_vars)
601 initial_function = extract_function(funcname)
# The returned callable adapts a single string to the arg-list convention.
602 return lambda s: initial_function([s])
# Parses the SWF player binary, locates the 'SignatureDecipher' ActionScript
# class inside the ABC (AVM2 bytecode) block, and interprets just enough
# AVM2 opcodes to run its 'decipher' method in Python. Statement order is
# load-bearing throughout: the ABC sections must be consumed sequentially.
604 def _parse_sig_swf(self, file_contents):
# SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib); bytes 1..2 are 'WS'.
605 if file_contents[1:3] != b'WS':
606 raise ExtractorError(
607 u'Not an SWF file; header is %r' % file_contents[:3])
608 if file_contents[:1] == b'C':
# 'CWS': everything after the 8-byte header is zlib-compressed.
609 content = zlib.decompress(file_contents[8:])
611 raise NotImplementedError(u'Unsupported compression format %r' %
# Generator over SWF tags: 16-bit header packs code (10 bits) and length
# (6 bits); length 0x3f means a 32-bit extended length follows.
614 def extract_tags(content):
616 while pos < len(content):
617 header16 = struct.unpack('<H', content[pos:pos+2])[0]
619 tag_code = header16 >> 6
620 tag_len = header16 & 0x3f
622 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
624 assert pos+tag_len <= len(content)
625 yield (tag_code, content[pos:pos+tag_len])
629 for tag_code, tag in extract_tags(content)
# Skip the DoABC tag's flags + NUL-terminated name; bytecode follows.
631 p = code_tag.index(b'\0', 4) + 1
632 code_reader = io.BytesIO(code_tag[p:])
634 # Parse ABC (AVM2 ByteCode)
# Variable-length integer: 7 bits per byte, high bit = continuation.
635 def read_int(reader=None):
643 b = struct.unpack('<B', buf)[0]
644 res = res | ((b & 0x7f) << shift)
# u30: variable-length unsigned int restricted to 30 bits.
650 def u30(reader=None):
651 res = read_int(reader)
652 assert res & 0xf0000000 == 0
# s32: variable-length int, sign-extended from bit 31.
656 def s32(reader=None):
658 if v & 0x80000000 != 0:
659 v = - ((v ^ 0xffffffff) + 1)
662 def read_string(reader=None):
666 resb = reader.read(slen)
667 assert len(resb) == slen
668 return resb.decode('utf-8')
670 def read_bytes(count, reader=None):
673 resb = reader.read(count)
674 assert len(resb) == count
677 def read_byte(reader=None):
678 resb = read_bytes(1, reader=reader)
679 res = struct.unpack('<B', resb)[0]
682 # minor_version + major_version
# Constant pool: ints, uints, doubles are skipped; strings are kept since
# the interpreter needs them for pushstring. Index 0 is the empty entry.
687 for _c in range(1, int_count):
690 for _c in range(1, uint_count):
693 read_bytes((double_count-1) * 8)
695 constant_strings = [u'']
696 for _c in range(1, string_count):
698 constant_strings.append(s)
699 namespace_count = u30()
700 for _c in range(1, namespace_count):
704 for _c in range(1, ns_set_count):
706 for _c2 in range(count):
708 multiname_count = u30()
717 0x0e: 2, # MultinameA
718 0x1b: 1, # MultinameL
719 0x1c: 1, # MultinameLA
722 for _c in range(1, multiname_count):
724 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
726 u30() # namespace_idx
728 multinames.append(constant_strings[name_idx])
730 multinames.append('[MULTINAME kind: %d]' % kind)
731 for _c2 in range(MULTINAME_SIZES[kind]):
# Method signatures: only the two flags the interpreter cares about are
# retained; the rest of each method_info record is consumed and dropped.
736 MethodInfo = collections.namedtuple(
738 ['NEED_ARGUMENTS', 'NEED_REST'])
740 for method_id in range(method_count):
743 for _ in range(param_count):
745 u30() # name index (always 0 for youtube)
747 if flags & 0x08 != 0:
750 for c in range(option_count):
753 if flags & 0x80 != 0:
754 # Param names present
755 for _ in range(param_count):
757 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
758 method_infos.append(mi)
761 metadata_count = u30()
762 for _c in range(metadata_count):
765 for _c2 in range(item_count):
# Reads one trait record; for method traits, records name -> method index.
769 def parse_traits_info():
770 trait_name_idx = u30()
771 kind_full = read_byte()
772 kind = kind_full & 0x0f
773 attrs = kind_full >> 4
775 if kind in [0x00, 0x06]: # Slot or Const
777 u30() # type_name_idx
781 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
784 methods[multinames[trait_name_idx]] = method_idx
785 elif kind == 0x04: # Class
788 elif kind == 0x05: # Function
791 methods[function_idx] = multinames[trait_name_idx]
793 raise ExtractorError(u'Unsupported trait kind %d' % kind)
795 if attrs & 0x4 != 0: # Metadata present
796 metadata_count = u30()
797 for _c3 in range(metadata_count):
798 u30() # metadata index
# Scan instance_info records for the class named 'SignatureDecipher'.
803 TARGET_CLASSNAME = u'SignatureDecipher'
804 searched_idx = multinames.index(TARGET_CLASSNAME)
805 searched_class_id = None
807 for class_id in range(class_count):
809 if name_idx == searched_idx:
810 # We found the class we're looking for!
811 searched_class_id = class_id
812 u30() # super_name idx
814 if flags & 0x08 != 0: # Protected namespace is present
815 u30() # protected_ns_idx
817 for _c2 in range(intrf_count):
821 for _c2 in range(trait_count):
824 if searched_class_id is None:
825 raise ExtractorError(u'Target class %r not found' %
# class_info records: collect the target class's method name/index maps.
830 for class_id in range(class_count):
833 for _c2 in range(trait_count):
834 trait_methods = parse_traits_info()
835 if class_id == searched_class_id:
836 method_names.update(trait_methods.items())
837 method_idxs.update(dict(
839 for name, idx in trait_methods.items()))
843 for _c in range(script_count):
846 for _c2 in range(trait_count):
# Method bodies: keep only the bytecode of the methods we identified.
850 method_body_count = u30()
851 Method = collections.namedtuple('Method', ['code', 'local_count'])
853 for _c in range(method_body_count):
857 u30() # init_scope_depth
858 u30() # max_scope_depth
860 code = read_bytes(code_length)
861 if method_idx in method_idxs:
862 m = Method(code, local_count)
863 methods[method_idxs[method_idx]] = m
864 exception_count = u30()
865 for _c2 in range(exception_count):
872 for _c2 in range(trait_count):
# Sanity: the whole DoABC payload must have been consumed exactly.
875 assert p + code_reader.tell() == len(code_tag)
876 assert len(methods) == len(method_idxs)
878 method_pyfunctions = {}
# Compile an AVM2 method into a Python callable; a small stack machine
# supporting only the opcodes the decipher routine actually uses.
880 def extract_function(func_name):
881 if func_name in method_pyfunctions:
882 return method_pyfunctions[func_name]
883 if func_name not in methods:
884 raise ExtractorError(u'Cannot find function %r' % func_name)
885 m = methods[func_name]
888 registers = ['(this)'] + list(args) + [None] * m.local_count
890 coder = io.BytesIO(m.code)
892 opcode = struct.unpack('!B', coder.read(1))[0]
893 if opcode == 36: # pushbyte
894 v = struct.unpack('!B', coder.read(1))[0]
896 elif opcode == 44: # pushstring
898 stack.append(constant_strings[idx])
899 elif opcode == 48: # pushscope
900 # We don't implement the scope register, so we'll just
901 # ignore the popped value
903 elif opcode == 70: # callproperty
905 mname = multinames[index]
906 arg_count = u30(coder)
907 args = list(reversed(
908 [stack.pop() for _ in range(arg_count)]))
910 if mname == u'split':
911 assert len(args) == 1
912 assert isinstance(args[0], compat_str)
913 assert isinstance(obj, compat_str)
917 res = obj.split(args[0])
919 elif mname == u'slice':
920 assert len(args) == 1
921 assert isinstance(args[0], int)
922 assert isinstance(obj, list)
925 elif mname == u'join':
926 assert len(args) == 1
927 assert isinstance(args[0], compat_str)
928 assert isinstance(obj, list)
929 res = args[0].join(obj)
931 elif mname in method_pyfunctions:
932 stack.append(method_pyfunctions[mname](args))
934 raise NotImplementedError(
935 u'Unsupported property %r on %r'
937 elif opcode == 72: # returnvalue
940 elif opcode == 79: # callpropvoid
942 mname = multinames[index]
943 arg_count = u30(coder)
944 args = list(reversed(
945 [stack.pop() for _ in range(arg_count)]))
947 if mname == u'reverse':
948 assert isinstance(obj, list)
951 raise NotImplementedError(
952 u'Unsupported (void) property %r on %r'
954 elif opcode == 93: # findpropstrict
956 mname = multinames[index]
957 res = extract_function(mname)
959 elif opcode == 97: # setproperty
964 assert isinstance(obj, list)
965 assert isinstance(idx, int)
967 elif opcode == 98: # getlocal
969 stack.append(registers[index])
970 elif opcode == 99: # setlocal
973 registers[index] = value
974 elif opcode == 102: # getproperty
976 pname = multinames[index]
977 if pname == u'length':
979 assert isinstance(obj, list)
980 stack.append(len(obj))
981 else: # Assume attribute access
983 assert isinstance(idx, int)
985 assert isinstance(obj, list)
986 stack.append(obj[idx])
987 elif opcode == 128: # coerce
989 elif opcode == 133: # coerce_s
990 assert isinstance(stack[-1], (type(None), compat_str))
991 elif opcode == 164: # modulo
994 res = value1 % value2
996 elif opcode == 208: # getlocal_0
997 stack.append(registers[0])
998 elif opcode == 209: # getlocal_1
999 stack.append(registers[1])
1000 elif opcode == 210: # getlocal_2
1001 stack.append(registers[2])
1002 elif opcode == 211: # getlocal_3
1003 stack.append(registers[3])
1004 elif opcode == 214: # setlocal_2
1005 registers[2] = stack.pop()
1006 elif opcode == 215: # setlocal_3
1007 registers[3] = stack.pop()
1009 raise NotImplementedError(
1010 u'Unsupported opcode %d' % opcode)
1012 method_pyfunctions[func_name] = resfunc
# Entry point of the SWF class; adapt it to the single-string convention.
1015 initial_function = extract_function(u'decipher')
1016 return lambda s: initial_function([s])
1018 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1019 """Turn the encrypted s field into a working signature"""
1021 if player_url is not None:
# Protocol-relative player URLs need an explicit scheme.
1022 if player_url.startswith(u'//'):
1023 player_url = u'https:' + player_url
# Cache key: a player decrypts each signature length differently.
1025 player_id = (player_url, len(s))
1026 if player_id not in self._player_cache:
1027 func = self._extract_signature_function(
1028 video_id, player_url, len(s)
1030 self._player_cache[player_id] = func
1031 func = self._player_cache[player_id]
1032 if self._downloader.params.get('youtube_print_sig_code'):
1033 self._print_sig_code(func, len(s))
# On any extraction failure, warn and fall through to the static table.
1036 tb = traceback.format_exc()
1037 self._downloader.report_warning(
1038 u'Automatic signature extraction failed: ' + tb)
1040 self._downloader.report_warning(
1041 u'Warning: Falling back to static signature algorithm')
1043 return self._static_decrypt_signature(
1044 s, video_id, player_url, age_gate)
# Hard-coded fallback permutations keyed on len(s) (the selecting
# 'if len(s) == N:' guard lines fall outside this excerpt). Each return
# is a fixed reshuffle of the signature characters for one length.
1046 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1048 # The videos with age protection use another player, so the
1049 # algorithms can be different.
1051 return s[2:63] + s[82] + s[64:82] + s[63]
1054 return s[86:29:-1] + s[88] + s[28:5:-1]
1056 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1058 return s[84:27:-1] + s[86] + s[26:5:-1]
1060 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1062 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1064 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1066 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1068 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1070 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1072 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1074 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1076 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1078 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1080 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1082 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1085 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
# Returns {lang_code: timedtext URL} for the video's manual subtitles,
# or an empty mapping (with a warning) when none exist.
1087 def _get_available_subtitles(self, video_id, webpage):
1089 sub_list = self._download_webpage(
1090 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1091 video_id, note=False)
1092 except ExtractorError as err:
1093 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each <track> element carries a display name and a language code.
1095 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1100 params = compat_urllib_parse.urlencode({
1103 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
1104 'name': l[0].encode('utf-8'),
1106 url = u'http://www.youtube.com/api/timedtext?' + params
1107 sub_lang_list[lang] = url
1108 if not sub_lang_list:
1109 self._downloader.report_warning(u'video doesn\'t have subtitles')
1111 return sub_lang_list
1113 def _get_available_automatic_caption(self, video_id, webpage):
1114 """We need the webpage for getting the captions url, pass it as an
1115 argument to speed up the process."""
1116 sub_format = self._downloader.params.get('subtitlesformat', 'srt')
1117 self.to_screen(u'%s: Looking for automatic captions' % video_id)
# The ttsurl/timestamp needed for ASR captions live in the inline
# ytplayer.config JSON on the watch page.
1118 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1119 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1121 self._downloader.report_warning(err_msg)
1123 player_config = json.loads(mobj.group(1))
1125 args = player_config[u'args']
1126 caption_url = args[u'ttsurl']
1127 timestamp = args[u'timestamp']
1128 # We get the available subtitles
1129 list_params = compat_urllib_parse.urlencode({
1134 list_url = caption_url + '&' + list_params
1135 list_page = self._download_webpage(list_url, video_id)
1136 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
# Only the auto-generated (kind="asr") track can be machine-translated.
1137 original_lang_node = caption_list.find('track')
1138 if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
1139 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1141 original_lang = original_lang_node.attrib['lang_code']
# Build one translated-caption URL per available target language.
1144 for lang_node in caption_list.findall('target'):
1145 sub_lang = lang_node.attrib['lang_code']
1146 params = compat_urllib_parse.urlencode({
1147 'lang': original_lang,
1153 sub_lang_list[sub_lang] = caption_url + '&' + params
1154 return sub_lang_list
1155 # An extractor error can be raise by the download process if there are
1156 # no automatic captions but there are subtitles
1157 except (KeyError, ExtractorError):
1158 self._downloader.report_warning(err_msg)
# Prints one line per itag: extension, dimensions and any special tag
# (the loop header over *formats* falls outside this excerpt).
1161 def _print_formats(self, formats):
1162 print('Available formats:')
1164 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1165 self._video_dimensions.get(x, '???'),
1166 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
# Pulls the 11-character video ID out of *url* via _VALID_URL (compiled
# with re.VERBOSE since the pattern contains comments and whitespace).
1168 def _extract_id(self, url):
1169 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1171 raise ExtractorError(u'Invalid URL: %s' % url)
1172 video_id = mobj.group(2)
1175 def _get_video_url_list(self, url_map):
1177 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1178 with the requested formats.
1180 req_format = self._downloader.params.get('format', None)
1181 format_limit = self._downloader.params.get('format_limit', None)
# Quality ranking depends on whether free (webm) formats are preferred.
1182 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1183 if format_limit is not None and format_limit in available_formats:
# Cap quality at format_limit by truncating the ranked list.
1184 format_list = available_formats[available_formats.index(format_limit):]
1186 format_list = available_formats
1187 existing_formats = [x for x in format_list if x in url_map]
1188 if len(existing_formats) == 0:
1189 raise ExtractorError(u'no known formats available for video')
1190 if self._downloader.params.get('listformats', None):
1191 self._print_formats(existing_formats)
1193 if req_format is None or req_format == 'best':
1194 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1195 elif req_format == 'worst':
1196 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1197 elif req_format in ('-1', 'all'):
1198 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1200 # Specific formats. We pick the first in a slash-delimeted sequence.
1201 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1202 # available in the specified format. For example,
1203 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1204 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1205 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1206 req_formats = req_format.split('/')
1207 video_url_list = None
1208 for rf in req_formats:
1210 video_url_list = [(rf, url_map[rf])]
# Container-name request: resolve via _video_formats_map, best itag first.
1212 if rf in self._video_formats_map:
1213 for srf in self._video_formats_map[rf]:
1215 video_url_list = [(srf, url_map[srf])]
1220 if video_url_list is None:
1221 raise ExtractorError(u'requested format not available')
1222 return video_url_list
# Downloads an HLS manifest and returns {itag: stream URL}, deriving the
# itag from the '/itag/<N>/' path component of each variant URL.
1224 def _extract_from_m3u8(self, manifest_url, video_id):
1226 def _get_urls(_manifest):
1227 lines = _manifest.split('\n')
# Keep only non-empty, non-comment (#EXT...) lines: the variant URLs.
1228 urls = filter(lambda l: l and not l.startswith('#'),
1231 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1232 formats_urls = _get_urls(manifest)
1233 for format_url in formats_urls:
1234 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1235 url_map[itag] = format_url
def _extract_annotations(self, video_id):
    """Download and return the annotations XML document for *video_id*."""
    url = ('https://www.youtube.com/annotations_invideo'
           '?features=1&legacy=1&video_id=%s' % video_id)
    return self._download_webpage(
        url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
# Main extraction pipeline: unwrap redirects, fetch the watch page, handle
# the age gate, query get_video_info, then pull uploader/title/thumbnail/
# date/description/subtitles/duration/annotations from the responses.
# NOTE(review): this method continues past the end of the excerpt.
1242 def _real_extract(self, url):
1243 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1244 mobj = re.search(self._NEXT_URL_RE, url)
1246 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1247 video_id = self._extract_id(url)
1250 self.report_video_webpage_download(video_id)
1251 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1252 request = compat_urllib_request.Request(url)
1254 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1255 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1256 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1258 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1260 # Attempt to extract SWF player URL
1261 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1262 if mobj is not None:
# The URL appears JSON-escaped in the page; strip the backslashes.
1263 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1268 self.report_video_info_webpage_download(video_id)
1269 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1270 self.report_age_confirmation()
1272 # We simulate the access to the video from www.youtube.com/v/{video_id}
1273 # this can be viewed without login into Youtube
1274 data = compat_urllib_parse.urlencode({'video_id': video_id,
1278 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1282 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1283 video_info_webpage = self._download_webpage(video_info_url, video_id,
1285 errnote='unable to download video info webpage')
1286 video_info = compat_parse_qs(video_info_webpage)
# Non-age-gated path: try several 'el' variants until one yields a token.
1289 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1290 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1291 % (video_id, el_type))
1292 video_info_webpage = self._download_webpage(video_info_url, video_id,
1294 errnote='unable to download video info webpage')
1295 video_info = compat_parse_qs(video_info_webpage)
1296 if 'token' in video_info:
1298 if 'token' not in video_info:
1299 if 'reason' in video_info:
1300 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1302 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1304 # Check for "rental" videos
1305 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1306 raise ExtractorError(u'"rental" videos not supported')
1308 # Start extracting information
1309 self.report_information_extraction(video_id)
# uploader
1312 if 'author' not in video_info:
1313 raise ExtractorError(u'Unable to extract uploader name')
1314 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader_id (best effort: warn rather than fail)
1317 video_uploader_id = None
1318 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1319 if mobj is not None:
1320 video_uploader_id = mobj.group(1)
1322 self._downloader.report_warning(u'unable to extract uploader nickname')
# title
1325 if 'title' in video_info:
1326 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1328 self._downloader.report_warning(u'Unable to extract video title')
1332 # We try first to get a high quality image:
1333 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1334 video_webpage, re.DOTALL)
1335 if m_thumb is not None:
1336 video_thumbnail = m_thumb.group(1)
1337 elif 'thumbnail_url' not in video_info:
1338 self._downloader.report_warning(u'unable to extract video thumbnail')
1339 video_thumbnail = None
1340 else: # don't panic if we can't find it
1341 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date (normalized to YYYYMMDD by unified_strdate)
1345 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1346 if mobj is not None:
1347 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1348 upload_date = unified_strdate(upload_date)
# description: page element first, <meta> description as fallback
1351 video_description = get_element_by_id("eow-description", video_webpage)
1352 if video_description:
1353 video_description = clean_html(video_description)
1355 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1357 video_description = unescapeHTML(fd_mobj.group(1))
1359 video_description = u''
# subtitles
1362 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1364 if self._downloader.params.get('listsubtitles', False):
1365 self._list_available_subtitles(video_id, video_webpage)
1368 if 'length_seconds' not in video_info:
1369 self._downloader.report_warning(u'unable to extract video duration')
1372 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# annotations (opt-in via --write-annotations)
1375 video_annotations = None
1376 if self._downloader.params.get('writeannotations', False):
1377 video_annotations = self._extract_annotations(video_id)
1379 # Decide which formats to download
1382 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1384 raise ValueError('Could not find vevo ID')
1385 info = json.loads(mobj.group(1))
1387 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1388 # this signatures are encrypted
1389 if 'url_encoded_fmt_stream_map' not in args:
1390 raise ValueError(u'No stream_map present') # caught below
1391 re_signature = re.compile(r'[&,]s=')
1392 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1394 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1395 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1396 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1398 if 'adaptive_fmts' in video_info:
1399 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1401 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1405 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1406 self.report_rtmp_download()
1407 video_url_list = [(None, video_info['conn'][0])]
1408 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1409 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1410 if 'rtmpe%3Dyes' in encoded_url_map:
1411 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1413 for url_data_str in encoded_url_map.split(','):
1414 url_data = compat_parse_qs(url_data_str)
1415 if 'itag' in url_data and 'url' in url_data:
1416 url = url_data['url'][0]
1417 if 'sig' in url_data:
1418 url += '&signature=' + url_data['sig'][0]
1419 elif 's' in url_data:
1420 encrypted_sig = url_data['s'][0]
1421 if self._downloader.params.get('verbose'):
1423 if player_url is None:
1424 player_version = 'unknown'
1426 player_version = self._search_regex(
1427 r'-(.+)\.swf$', player_url,
1428 u'flash player', fatal=False)
1429 player_desc = 'flash player %s' % player_version
1431 player_version = self._search_regex(
1432 r'html5player-(.+?)\.js', video_webpage,
1433 'html5 player', fatal=False)
1434 player_desc = u'html5 player %s' % player_version
1436 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1437 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1438 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1441 jsplayer_url_json = self._search_regex(
1442 r'"assets":.+?"js":\s*("[^"]+")',
1443 video_webpage, u'JS player URL')
1444 player_url = json.loads(jsplayer_url_json)
1446 signature = self._decrypt_signature(
1447 encrypted_sig, video_id, player_url, age_gate)
1448 url += '&signature=' + signature
1449 if 'ratebypass' not in url:
1450 url += '&ratebypass=yes'
1451 url_map[url_data['itag'][0]] = url
1452 video_url_list = self._get_video_url_list(url_map)
1453 if not video_url_list:
1455 elif video_info.get('hlsvp'):
1456 manifest_url = video_info['hlsvp'][0]
1457 url_map = self._extract_from_m3u8(manifest_url, video_id)
1458 video_url_list = self._get_video_url_list(url_map)
1459 if not video_url_list:
1463 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1466 for itag, video_real_url in video_url_list:
1468 video_extension = self._video_extensions.get(itag, 'flv')
1470 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1471 self._video_dimensions.get(itag, '???'),
1472 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1476 'url': video_real_url,
1477 'uploader': video_uploader,
1478 'uploader_id': video_uploader_id,
1479 'upload_date': upload_date,
1480 'title': video_title,
1481 'ext': video_extension,
1482 'format': video_format,
1484 'thumbnail': video_thumbnail,
1485 'description': video_description,
1486 'player_url': player_url,
1487 'subtitles': video_subtitles,
1488 'duration': video_duration,
1489 'age_limit': 18 if age_gate else 0,
1490 'annotations': video_annotations,
1491 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
class YoutubePlaylistIE(InfoExtractor):
    # Extracts all videos of a playlist through the GData v2 API,
    # paging through the feed _MAX_RESULTS entries at a time.
    IE_DESC = u'YouTube.com playlists'
    # Verbose regex: group 1 matches ids from query-string style URLs,
    # group 2 matches a bare playlist id.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is needed because _VALID_URL is written across
        # multiple whitespace-indented lines.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)
        # Either the query-string group or the bare-id group matched.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                # --no-playlist: hand the single video over to YoutubeIE.
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            # NOTE(review): the message hard-codes a 'PL' prefix although
            # playlist_id may already start with PL/EC/UU/FL — confirm intended.
            self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            # The GData API rejects start-index values of 1000 and above.
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            raise ExtractorError(u'Got a malformed response from YouTube API')
        playlist_title = response['feed']['title']['$t']
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
        for entry in response['feed']['entry']:
            # yt$position is the 1-based index of the video in the playlist.
            index = entry['yt$position']['$t']
            if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Sort by playlist position, then keep only the watch URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    # Extracts all videos of a channel, either directly from the channel
    # page (autogenerated channels) or by paging through the
    # c4_browse_ajax JSON endpoint.
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker found in load_more_widget_html while more pages remain.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect the ids of all /watch?v= links on the page, keeping
        # first-seen order and dropping duplicates.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Autogenerated channels carry this label in their header markup.
        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
            autogenerated = True
            autogenerated = False

        # The videos are contained in a single page
        # the ajax pages can't be used, they are empty
        video_ids = self.extract_videos_from_page(channel_page)
        # Download all channel pages using the json-based channel_ajax query
        for pagenum in itertools.count(1):
            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            page = self._download_webpage(url, channel_id,
                                          u'Downloading page #%s' % pagenum)

            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            # Stop once the widget no longer advertises further pages.
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    # Extracts all uploads of a user via the GData API, requesting
    # _GDATA_PAGE_SIZE video ids per query.
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum result size the GData uploads feed returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
            response = json.loads(page)
        except ValueError as err:
            raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS

        # Extract video identifiers
        for entry in response['feed']['entry']:
            # The entry id is a URL ending in /VIDEO_ID; keep the tail.
            ids_in_page.append(entry['id']['$t'].split('/')[-1])
        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    # Implements the "ytsearch" keyword through the GData videos feed,
    # fetching results 50 at a time until n results are collected.
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API.
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
        api_response = json.loads(data)['data']

        if not 'items' in api_response:
            raise ExtractorError(u'[youtube] No video results')

        new_ids = list(video['id'] for video in api_response['items'])
        video_ids += new_ids

        # The service may know fewer matches than requested; never ask
        # for more than it reports as available.
        limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Search extractor that returns the newest uploads first.

    Behaves exactly like YoutubeSearchIE except that the GData query
    carries ``orderby=published`` and it is triggered by the
    ``ytsearchdate`` prefix instead of ``ytsearch``.
    """
    IE_DESC = u'YouTube.com searches, newest videos first'
    _SEARCH_KEY = 'ytsearchdate'
    # Same endpoint as the parent class plus the orderby=published flag.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube multi-season show pages.

    A show page links to one playlist per season; each season is
    delegated to YoutubePlaylistIE.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            results.append(self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    # Feeds are tied to an account, so credentials are mandatory.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Build the feed_ajax URL template; the escaped '%%s' leaves a
        # single %s placeholder for the paging offset that _real_extract
        # fills in per page.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet drops duplicate ids while keeping first-seen order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the final page of the feed.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's subscriptions feed.

    Triggered by a youtube.com/feed/subscriptions URL or by the
    ":ytsubs" / ":ytsubscriptions" pseudo-URL.
    """
    # Added the missing space before "(requires" so the description is
    # consistent with the sibling feed extractors.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's recommended-videos feed.

    Triggered by a youtube.com/feed/recommended URL or by the
    ":ytrec" / ":ytrecommended" pseudo-URL.
    """
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's "Watch Later" list.

    Uses the personal feed_ajax action since the list belongs to the
    account rather than being generated by the system.
    """
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _PERSONAL_FEED = True
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos.

    Resolves the favourites page to the playlist backing it and hands
    the playlist id over to YoutubePlaylistIE.
    """
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are per-account, so login is mandatory.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The favourites page embeds its backing playlist id as list=...
        favourites_playlist_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_playlist_id, 'YoutubePlaylist')
1831 class YoutubeTruncatedURLIE(InfoExtractor):
1832 IE_NAME = 'youtube:truncated_url'
1833 IE_DESC = False # Do not list
1834 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1836 def _real_extract(self, url):
1837 raise ExtractorError(
1838 u'Did you forget to quote the URL? Remember that & is a meta '
1839 u'character in most shells, so you want to put the URL in quotes, '
1841 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1842 u' (or simply youtube-dl BaW_jenozKc ).',