15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
21 compat_urllib_request,
28 get_element_by_attribute,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Google/YouTube endpoints used by the login/language/age helpers below.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's .netrc file.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Visit the language URL so subsequent requests use English.

        NOTE(review): the try/return structure of this method appears
        truncated in this copy of the file -- confirm against upstream.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        self._download_webpage(self._LANG_URL, None, False)
        except ExtractorError as err:
            # Not fatal: extraction can proceed without the language cookie.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err.cause))

    # NOTE(review): the `def _login(self):` header is missing from this copy;
    # the lines below are the body of the login helper -- confirm upstream.
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
        login_page = self._download_webpage(self._LOGIN_URL, None, False,
                                            u'Unable to fetch login page')
        # Hidden anti-CSRF token that must be echoed back in the login form.
        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')
        # Fields of the Google sign-in form (the enclosing dict literal is
        # truncated in this copy).
        u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(request, None, False)
        # If the sign-in form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err.cause))

    def _confirm_age(self):
        """POST the age-verification form so age-gated videos are reachable."""
        # NOTE(review): the `age_form = {` opening appears truncated here.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        self._download_webpage(request, None, False, u'Unable to confirm age')

    def _real_initialize(self):
        """Set language, log in and confirm age before any extraction.

        NOTE(review): the bodies of these guards are truncated in this copy.
        """
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # NOTE(review): several class-level literals below (_VALID_URL, the format
    # tables, _video_extensions/_video_dimensions and the _TESTS entries) are
    # truncated in this copy -- opening/closing delimiters are missing.
    # Confirm against the upstream file before editing.
    _VALID_URL = r"""(?x)^
        (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
        (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
        tube\.majestyc\.net/|
        youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?: # the various things that can precede the ID:
        (?:(?:v|embed|e)/) # v/ or embed/ or e/
        |(?: # or the v= param in all its forms
        (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
        (?:\?|\#!?) # the params delimiter ? or # or #!
        (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
        |youtu\.be/ # just youtu.be/xxxx
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                     # Apple HTTP Live Streaming
                                     '96', '95', '94', '93', '92', '132', '151',
                                     '85', '102', '84', '101', '83', '100', '82',
                                     '138', '248', '137', '247', '136', '246', '245',
                                     '244', '135', '243', '134', '242', '133', '160',
                                     '172', '141', '171', '140', '139',
    # Container name -> itags of that container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    _video_extensions = {
        # Apple HTTP Live Streaming
    _video_dimensions = {
    # Entries of the self-test table (_TESTS); list header truncated here.
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:5b292926389560516e384ac437c0ec07",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
        u"file": u"yZIXLfi8CZQ.mp4",
        u"note": u"Embed-only video (#1746)",
        u"upload_date": u"20120608",
        u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
        u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
        u"uploader": u"SET India",
        u"uploader_id": u"setindia"
def suitable(cls, url):
    """Return True when this extractor should handle *url*.

    Playlist links are explicitly left to YoutubePlaylistIE.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and the per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps (player_url, signature_length) -> deciphering function.
    self._player_cache = dict()
def report_video_info_webpage_download(self, video_id):
    """Report attempt to download video info webpage."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Report attempt to extract video information."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for the video."""
    # (The upstream docstring said "Report extracted video URL", which was a
    # copy-paste mistake; this method reports an unavailable format.)
    notice = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(notice)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    notice = u'RTMP download detected'
    self.to_screen(notice)
def _extract_signature_function(self, video_id, player_url, slen):
    """Build (and cache on disk) the deciphering function for a player.

    NOTE(review): several control-flow lines (try/except headers, else
    branches) appear truncated in this copy -- confirm against upstream.
    """
    # Derive the player's id and type ('js' or 'swf') from its URL.
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # func_id becomes part of a file name; it must not contain separators.
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    cache_fn = os.path.join(os.path.expanduser(cache_dir),
    with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
        cache_spec = json.load(cachef)
    # The cached spec is a permutation: output position i takes character
    # cache_spec[i] of the scrambled signature.
    return lambda s: u''.join(s[i] for i in cache_spec)
    pass  # No cache available

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        # NOTE(review): `code` is presumably read from `urlh` in lines
        # missing from this copy.
        res = self._parse_sig_swf(code)
    assert False, 'Invalid player type %r' % player_type

    # Record the permutation by running the function on a probe string.
    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = res(test_string)
    cache_spec = [ord(c) for c in cache_res]
    os.makedirs(os.path.dirname(cache_fn))
    except OSError as ose:
        # An already-existing cache directory is fine.
        if ose.errno != errno.EEXIST:
    write_json_file(cache_spec, cache_fn)
    # Cache-write failures only produce a warning, never abort extraction.
    tb = traceback.format_exc()
    self._downloader.report_warning(
        u'Writing cache to %r failed: %s' % (cache_fn, tb))
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to the extracted signature function.

    NOTE(review): parts of the run-length detection loop are truncated in
    this copy -- confirm against upstream.
    """
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # Render s[start:end+step:step] with the shortest spelling.
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            yield _genslice(start, prev, step)
            if i - prev in [-1, 1]:
            yield u's[%d]' % prev
        yield _genslice(start, i, step)

    # Run the function on a probe string and reconstruct the permutation.
    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
    """Interpret the JS player code and return the deciphering function.

    Implements a tiny interpreter for the subset of JavaScript that the
    YouTube player's signature routine uses.
    NOTE(review): several statements/branch headers are truncated in this
    copy -- confirm against upstream.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    # NOTE(review): a helper definition is truncated here; this return maps
    # a single-letter variable name to an index.
        return string.lowercase.index(varname)

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        # Guard against runaway recursion in pathological player code.
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')
        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        # Assignment: either to a plain variable or to an indexed element.
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
        if ass_m.groupdict().get('index'):
            lvar = local_vars[ass_m.group('out')]
            idx = interpret_expression(ass_m.group('index'),
                                       local_vars, allow_recursion)
            assert isinstance(idx, int)
            expr = ass_m.group('expr')
            local_vars[ass_m.group('out')] = val
            expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            expr = stmt[len(u'return '):]
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)
        v = interpret_expression(expr, local_vars, allow_recursion)

    def interpret_expression(expr, local_vars, allow_recursion):
        # Plain variable reference.
        return local_vars[expr]
        # Member access: var.split("") / join("") / length / reverse() / slice().
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
        member = m.group('member')
        val = local_vars[m.group('in')]
        if member == 'split("")':
        if member == 'join("")':
        if member == 'length':
        if member == 'reverse()':
        slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
        idx = interpret_expression(
            slice_m.group('idx'), local_vars, allow_recursion-1)
        # Indexing: var[idx].
        r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
        val = local_vars[m.group('in')]
        idx = interpret_expression(m.group('idx'), local_vars,
        # Binary modulo expression.
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
        a = interpret_expression(m.group('a'),
                                 local_vars, allow_recursion)
        b = interpret_expression(m.group('b'),
                                 local_vars, allow_recursion)
        # Function call; helper functions are extracted lazily on first use.
        r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
        fname = m.group('func')
        if fname not in functions:
            functions[fname] = extract_function(fname)
        argvals = [int(v) if v.isdigit() else local_vars[v]
                   for v in m.group('args').split(',')]
        return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        # Locate the function's source and build a Python callable that
        # interprets its statements one by one.
        r'function ' + re.escape(funcname) +
        r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
        argnames = func_m.group('args').split(',')
        local_vars = dict(zip(argnames, args))
        for stmt in func_m.group('code').split(';'):
            res = interpret_statement(stmt, local_vars)

    initial_function = extract_function(funcname)
    # The entry point takes the scrambled signature as its single argument.
    return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
    """Parse a SWF player and return its `decipher` function as a callable.

    Walks the SWF container, locates the DoABC code tag, parses the ABC
    (AVM2 bytecode) constant pools and method bodies, then interprets the
    bytecode of the SignatureDecipher class.
    NOTE(review): many statements/loop headers are truncated in this copy
    of the file -- confirm against upstream before editing.
    """
    # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed).
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        # zlib stream starts after the 8-byte SWF header.
        content = zlib.decompress(file_contents[8:])
    raise NotImplementedError(u'Unsupported compression format %r' %

    def extract_tags(content):
        """Yield (tag_code, tag_body) for each SWF tag in `content`."""
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            # Short tag header: upper 10 bits code, lower 6 bits length.
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            # Length 0x3f signals a long tag with a 32-bit length field.
            tag_len = struct.unpack('<I', content[pos:pos+4])[0]
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])

    # Find the DoABC tag; skip its flags + name to reach the bytecode.
    for tag_code, tag in extract_tags(content)
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length u32: 7 bits per byte, high bit = continue.
        b = struct.unpack('<B', buf)[0]
        res = res | ((b & 0x7f) << shift)

    def u30(reader=None):
        # u30 is a u32 whose top 2 bits must be zero.
        res = read_int(reader)
        assert res & 0xf0000000 == 0

    def s32(reader=None):
        # Sign-extend the variable-length value.
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)

    def read_string(reader=None):
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        resb = reader.read(count)
        assert len(resb) == count

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]

    # minor_version + major_version
    # Constant pools: ints, uints, doubles (skipped), then strings.
    for _c in range(1, int_count):
    for _c in range(1, uint_count):
    read_bytes((double_count-1) * 8)
    # Index 0 of every constant pool is implicit (empty string here).
    constant_strings = [u'']
    for _c in range(1, string_count):
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
    for _c in range(1, ns_set_count):
        for _c2 in range(count):
    multiname_count = u30()
    # Number of extra u30 fields to skip for each unsupported multiname kind.
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    for _c in range(1, multiname_count):
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        u30()  # namespace_idx
        multinames.append(constant_strings[name_idx])
        # Placeholder for kinds we do not resolve to a string.
        multinames.append('[MULTINAME kind: %d]' % kind)
        for _c2 in range(MULTINAME_SIZES[kind]):

    # Method signatures: only the two flags we care about are retained.
    MethodInfo = collections.namedtuple(
        ['NEED_ARGUMENTS', 'NEED_REST'])
    for method_id in range(method_count):
        for _ in range(param_count):
        u30()  # name index (always 0 for youtube)
        if flags & 0x08 != 0:
            # Optional parameters present: skip the option table.
            for c in range(option_count):
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata entries: parsed only to advance the reader.
    metadata_count = u30()
    for _c in range(metadata_count):
        for _c2 in range(item_count):

    def parse_traits_info():
        """Parse one trait entry; record method traits by name."""
        trait_name_idx = u30()
        kind_full = read_byte()
        # Low nibble: trait kind; high nibble: attribute flags.
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # type_name_idx
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
        elif kind == 0x05:  # Function
            methods[function_idx] = multinames[trait_name_idx]
        raise ExtractorError(u'Unsupported trait kind %d' % kind)
        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

    # Locate the class that holds the deciphering code.
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    for class_id in range(class_count):
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        for _c2 in range(intrf_count):
        for _c2 in range(trait_count):
    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %

    # Collect the target class's method names and method-body indices.
    for class_id in range(class_count):
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
        if class_id == searched_class_id:
            method_names.update(trait_methods.items())
            method_idxs.update(dict(
                for name, idx in trait_methods.items()))

    # Script traits: parsed only to advance the reader.
    for _c in range(script_count):
        for _c2 in range(trait_count):

    # Method bodies: keep code + register count for the methods we need.
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    for _c in range(method_body_count):
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
        for _c2 in range(trait_count):

    # Sanity: the whole tag was consumed and every wanted body was found.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        """Compile (memoized) one ABC method into a Python callable."""
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]
        # Register 0 is `this`; then the arguments; then locals.
        registers = ['(this)'] + list(args) + [None] * m.local_count
        coder = io.BytesIO(m.code)
        # Stack-machine interpreter over the opcodes the player uses.
        opcode = struct.unpack('!B', coder.read(1))[0]
        if opcode == 36:  # pushbyte
            v = struct.unpack('!B', coder.read(1))[0]
        elif opcode == 44:  # pushstring
            stack.append(constant_strings[idx])
        elif opcode == 48:  # pushscope
            # We don't implement the scope register, so we'll just
            # ignore the popped value
        elif opcode == 70:  # callproperty
            mname = multinames[index]
            arg_count = u30(coder)
            args = list(reversed(
                [stack.pop() for _ in range(arg_count)]))
            if mname == u'split':
                assert len(args) == 1
                assert isinstance(args[0], compat_str)
                assert isinstance(obj, compat_str)
                res = obj.split(args[0])
            elif mname == u'slice':
                assert len(args) == 1
                assert isinstance(args[0], int)
                assert isinstance(obj, list)
            elif mname == u'join':
                assert len(args) == 1
                assert isinstance(args[0], compat_str)
                assert isinstance(obj, list)
                res = args[0].join(obj)
            elif mname in method_pyfunctions:
                # Call into another method of the same class.
                stack.append(method_pyfunctions[mname](args))
            raise NotImplementedError(
                u'Unsupported property %r on %r'
        elif opcode == 72:  # returnvalue
        elif opcode == 79:  # callpropvoid
            mname = multinames[index]
            arg_count = u30(coder)
            args = list(reversed(
                [stack.pop() for _ in range(arg_count)]))
            if mname == u'reverse':
                assert isinstance(obj, list)
            raise NotImplementedError(
                u'Unsupported (void) property %r on %r'
        elif opcode == 93:  # findpropstrict
            mname = multinames[index]
            res = extract_function(mname)
        elif opcode == 97:  # setproperty
            assert isinstance(obj, list)
            assert isinstance(idx, int)
        elif opcode == 98:  # getlocal
            stack.append(registers[index])
        elif opcode == 99:  # setlocal
            registers[index] = value
        elif opcode == 102:  # getproperty
            pname = multinames[index]
            if pname == u'length':
                assert isinstance(obj, list)
                stack.append(len(obj))
            else:  # Assume attribute access
                assert isinstance(idx, int)
                assert isinstance(obj, list)
                stack.append(obj[idx])
        elif opcode == 128:  # coerce
        elif opcode == 133:  # coerce_s
            assert isinstance(stack[-1], (type(None), compat_str))
        elif opcode == 164:  # modulo
            res = value1 % value2
        elif opcode == 208:  # getlocal_0
            stack.append(registers[0])
        elif opcode == 209:  # getlocal_1
            stack.append(registers[1])
        elif opcode == 210:  # getlocal_2
            stack.append(registers[2])
        elif opcode == 211:  # getlocal_3
            stack.append(registers[3])
        elif opcode == 214:  # setlocal_2
            registers[2] = stack.pop()
        elif opcode == 215:  # setlocal_3
            registers[3] = stack.pop()
        raise NotImplementedError(
            u'Unsupported opcode %d' % opcode)
        method_pyfunctions[func_name] = resfunc

    initial_function = extract_function(u'decipher')
    # The entry point takes the scrambled signature as its single argument.
    return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature"""
    # NOTE(review): the try/except structure around the automatic extraction
    # appears truncated in this copy -- confirm against upstream.
    if player_url is not None:
        # Protocol-relative player URLs: assume https.
        if player_url.startswith(u'//'):
            player_url = u'https:' + player_url
        # Extracted functions are cached per (player URL, signature length).
        player_id = (player_url, len(s))
        if player_id not in self._player_cache:
            func = self._extract_signature_function(
                video_id, player_url, len(s)
            self._player_cache[player_id] = func
        func = self._player_cache[player_id]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(func, len(s))
        tb = traceback.format_exc()
        self._downloader.report_warning(
            u'Automatic signature extraction failed: ' + tb)
        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')
    # Last resort: hard-coded per-length deciphering recipes.
    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Decipher `s` with hard-coded recipes keyed on the signature length.

    NOTE(review): the `if len(s) == N:` guard lines that select between the
    returns below are truncated in this copy -- confirm against upstream.
    Each return re-orders slices of the scrambled signature string.
    """
    # The videos with age protection use another player, so the
    # algorithms can be different.
    return s[2:63] + s[82] + s[64:82] + s[63]
    return s[86:29:-1] + s[88] + s[28:5:-1]
    return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
    return s[84:27:-1] + s[86] + s[26:5:-1]
    return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
    return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
    return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
    return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
    return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
    return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
    return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
    return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
    return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
    return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
    return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
    return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

    # No recipe matched the signature length.
    raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
def _get_available_subtitles(self, video_id, webpage):
    """Return a {language: url} map of the video's subtitle tracks.

    NOTE(review): the try header and parts of the per-track loop are
    truncated in this copy -- confirm against upstream.
    """
    sub_list = self._download_webpage(
        'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
        video_id, note=False)
    except ExtractorError as err:
        # Missing subtitles are not fatal; warn and continue.
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
    # Each match is (track name, language code).
    lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    params = compat_urllib_parse.urlencode({
        'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
        'name': l[0].encode('utf-8'),
    url = u'http://www.youtube.com/api/timedtext?' + params
    sub_lang_list[lang] = url
    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
    return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process."""
    # NOTE(review): the try header and some branch lines are truncated in
    # this copy -- confirm against upstream.
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    # The caption endpoint is published inside the page's player config JSON.
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    self._downloader.report_warning(err_msg)
    player_config = json.loads(mobj.group(1))
    args = player_config[u'args']
    caption_url = args[u'ttsurl']
    timestamp = args[u'timestamp']
    # We get the available subtitles
    list_params = compat_urllib_parse.urlencode({
    list_url = caption_url + '&' + list_params
    caption_list = self._download_xml(list_url, video_id)
    original_lang_node = caption_list.find('track')
    # Only ASR ("automatic speech recognition") tracks can be translated.
    if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
        self._downloader.report_warning(u'Video doesn\'t have automatic captions')
    original_lang = original_lang_node.attrib['lang_code']
    # Build one translated-caption URL per available target language.
    for lang_node in caption_list.findall('target'):
        sub_lang = lang_node.attrib['lang_code']
        params = compat_urllib_parse.urlencode({
            'lang': original_lang,
        sub_lang_list[sub_lang] = caption_url + '&' + params
    return sub_lang_list
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
def _print_formats(self, formats):
    """Print an itag / extension / resolution table for `formats`."""
    print('Available formats:')
    # NOTE(review): the `for x in formats:` loop header appears truncated
    # in this copy -- confirm against upstream.
    print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                self._video_dimensions.get(x, '???'),
                                ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
    """Extract and return the 11-character video id from `url`.

    Raises ExtractorError when `url` does not match _VALID_URL.
    """
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    # Restored guard: the original raise was unreachable without it.
    if mobj is None:
        raise ExtractorError(u'Invalid URL: %s' % url)
    # Group 2 of _VALID_URL is the ([0-9A-Za-z_-]{11}) video id group.
    video_id = mobj.group(2)
    return video_id
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.

    NOTE(review): some else/branch lines are truncated in this copy --
    confirm against upstream.
    """
    req_format = self._downloader.params.get('format', None)
    format_limit = self._downloader.params.get('format_limit', None)
    # Quality-ordered candidate itags; a separate table is used when the
    # user prefers free (non-proprietary) formats.
    available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
    if format_limit is not None and format_limit in available_formats:
        # Drop everything of higher quality than the requested ceiling.
        format_list = available_formats[available_formats.index(format_limit):]
        format_list = available_formats
    existing_formats = [x for x in format_list if x in url_map]
    if len(existing_formats) == 0:
        raise ExtractorError(u'no known formats available for video')
    if self._downloader.params.get('listformats', None):
        self._print_formats(existing_formats)
    if req_format is None or req_format == 'best':
        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
    elif req_format == 'worst':
        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
    elif req_format in ('-1', 'all'):
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # Specific formats. We pick the first in a slash-delimeted sequence.
        # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
        # available in the specified format. For example,
        # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
        # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
            video_url_list = [(rf, url_map[rf])]
            if rf in self._video_formats_map:
                # A container name expands to its itags, best quality first.
                for srf in self._video_formats_map[rf]:
                    video_url_list = [(srf, url_map[srf])]
    if video_url_list is None:
        raise ExtractorError(u'requested format not available')
    return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
    """Build an {itag: url} map from an HLS (m3u8) formats manifest.

    NOTE(review): the `url_map` initialisation and the return statement
    appear truncated in this copy -- confirm against upstream.
    """
    def _get_urls(_manifest):
        lines = _manifest.split('\n')
        # Keep only non-blank, non-comment lines (the media URLs).
        urls = filter(lambda l: l and not l.startswith('#'),
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    formats_urls = _get_urls(manifest)
    for format_url in formats_urls:
        # The itag is embedded in the per-format URL path.
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
def _extract_annotations(self, video_id):
    """Download and return the annotations document for *video_id*."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
1239 def _real_extract(self, url):
1240 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1241 mobj = re.search(self._NEXT_URL_RE, url)
1243 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1244 video_id = self._extract_id(url)
1247 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1248 video_webpage = self._download_webpage(url, video_id)
1250 # Attempt to extract SWF player URL
1251 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1252 if mobj is not None:
1253 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1258 self.report_video_info_webpage_download(video_id)
1259 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1260 self.report_age_confirmation()
1262 # We simulate the access to the video from www.youtube.com/v/{video_id}
1263 # this can be viewed without login into Youtube
1264 data = compat_urllib_parse.urlencode({'video_id': video_id,
1265 'el': 'player_embedded',
1268 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1272 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1273 video_info_webpage = self._download_webpage(video_info_url, video_id,
1275 errnote='unable to download video info webpage')
1276 video_info = compat_parse_qs(video_info_webpage)
1279 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1280 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1281 % (video_id, el_type))
1282 video_info_webpage = self._download_webpage(video_info_url, video_id,
1284 errnote='unable to download video info webpage')
1285 video_info = compat_parse_qs(video_info_webpage)
1286 if 'token' in video_info:
1288 if 'token' not in video_info:
1289 if 'reason' in video_info:
1290 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1292 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1294 if 'view_count' in video_info:
1295 view_count = int(video_info['view_count'][0])
1299 # Check for "rental" videos
1300 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1301 raise ExtractorError(u'"rental" videos not supported')
1303 # Start extracting information
1304 self.report_information_extraction(video_id)
1307 if 'author' not in video_info:
1308 raise ExtractorError(u'Unable to extract uploader name')
1309 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1312 video_uploader_id = None
1313 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1314 if mobj is not None:
1315 video_uploader_id = mobj.group(1)
1317 self._downloader.report_warning(u'unable to extract uploader nickname')
1320 if 'title' in video_info:
1321 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1323 self._downloader.report_warning(u'Unable to extract video title')
1327 # We try first to get a high quality image:
1328 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1329 video_webpage, re.DOTALL)
1330 if m_thumb is not None:
1331 video_thumbnail = m_thumb.group(1)
1332 elif 'thumbnail_url' not in video_info:
1333 self._downloader.report_warning(u'unable to extract video thumbnail')
1334 video_thumbnail = None
1335 else: # don't panic if we can't find it
1336 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1340 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1341 if mobj is not None:
1342 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1343 upload_date = unified_strdate(upload_date)
1346 video_description = get_element_by_id("eow-description", video_webpage)
1347 if video_description:
1348 video_description = re.sub(r'''(?x)
1350 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1352 (?:[a-zA-Z-]+="[^"]+"\s+)*?
1353 class="yt-uix-redirect-link"\s*>
1356 ''', r'\1', video_description)
1357 video_description = clean_html(video_description)
1359 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1361 video_description = unescapeHTML(fd_mobj.group(1))
1363 video_description = u''
1365 def _extract_count(klass):
1366 count = self._search_regex(r'class="%s">([\d,]+)</span>' % re.escape(klass), video_webpage, klass, fatal=False)
1367 if count is not None:
1368 return int(count.replace(',', ''))
1370 like_count = _extract_count(u'likes-count')
1371 dislike_count = _extract_count(u'dislikes-count')
1374 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1376 if self._downloader.params.get('listsubtitles', False):
1377 self._list_available_subtitles(video_id, video_webpage)
1380 if 'length_seconds' not in video_info:
1381 self._downloader.report_warning(u'unable to extract video duration')
1384 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1387 video_annotations = None
1388 if self._downloader.params.get('writeannotations', False):
1389 video_annotations = self._extract_annotations(video_id)
1391 # Decide which formats to download
1394 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1396 raise ValueError('Could not find vevo ID')
1397 info = json.loads(mobj.group(1))
1399 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1400 # this signatures are encrypted
1401 if 'url_encoded_fmt_stream_map' not in args:
1402 raise ValueError(u'No stream_map present') # caught below
1403 re_signature = re.compile(r'[&,]s=')
1404 m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
1406 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1407 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1408 m_s = re_signature.search(args.get('adaptive_fmts', u''))
1410 if 'adaptive_fmts' in video_info:
1411 video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
1413 video_info['adaptive_fmts'] = [args['adaptive_fmts']]
1417 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1418 self.report_rtmp_download()
1419 video_url_list = [(None, video_info['conn'][0])]
1420 elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
1421 encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
1422 if 'rtmpe%3Dyes' in encoded_url_map:
1423 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1425 for url_data_str in encoded_url_map.split(','):
1426 url_data = compat_parse_qs(url_data_str)
1427 if 'itag' in url_data and 'url' in url_data:
1428 url = url_data['url'][0]
1429 if 'sig' in url_data:
1430 url += '&signature=' + url_data['sig'][0]
1431 elif 's' in url_data:
1432 encrypted_sig = url_data['s'][0]
1433 if self._downloader.params.get('verbose'):
1435 if player_url is None:
1436 player_version = 'unknown'
1438 player_version = self._search_regex(
1439 r'-(.+)\.swf$', player_url,
1440 u'flash player', fatal=False)
1441 player_desc = 'flash player %s' % player_version
1443 player_version = self._search_regex(
1444 r'html5player-(.+?)\.js', video_webpage,
1445 'html5 player', fatal=False)
1446 player_desc = u'html5 player %s' % player_version
1448 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1449 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1450 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1453 jsplayer_url_json = self._search_regex(
1454 r'"assets":.+?"js":\s*("[^"]+")',
1455 video_webpage, u'JS player URL')
1456 player_url = json.loads(jsplayer_url_json)
1458 signature = self._decrypt_signature(
1459 encrypted_sig, video_id, player_url, age_gate)
1460 url += '&signature=' + signature
1461 if 'ratebypass' not in url:
1462 url += '&ratebypass=yes'
1463 url_map[url_data['itag'][0]] = url
1464 video_url_list = self._get_video_url_list(url_map)
1465 if not video_url_list:
1467 elif video_info.get('hlsvp'):
1468 manifest_url = video_info['hlsvp'][0]
1469 url_map = self._extract_from_m3u8(manifest_url, video_id)
1470 video_url_list = self._get_video_url_list(url_map)
1471 if not video_url_list:
1475 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1478 for itag, video_real_url in video_url_list:
1480 video_extension = self._video_extensions.get(itag, 'flv')
1482 video_format = '{0} - {1}{2}'.format(itag if itag else video_extension,
1483 self._video_dimensions.get(itag, '???'),
1484 ' ('+self._special_itags[itag]+')' if itag in self._special_itags else '')
1488 'url': video_real_url,
1489 'uploader': video_uploader,
1490 'uploader_id': video_uploader_id,
1491 'upload_date': upload_date,
1492 'title': video_title,
1493 'ext': video_extension,
1494 'format': video_format,
1496 'thumbnail': video_thumbnail,
1497 'description': video_description,
1498 'player_url': player_url,
1499 'subtitles': video_subtitles,
1500 'duration': video_duration,
1501 'age_limit': 18 if age_gate else 0,
1502 'annotations': video_annotations,
1503 'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
1504 'view_count': view_count,
1505 'like_count': like_count,
1506 'dislike_count': dislike_count,
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com playlists'
    # Two capture groups: group(1) for playlist ids embedded in query-style
    # URLs, group(2) for bare playlist ids (PL/EC/UU/FL/RD prefixed).
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with (?x)-style whitespace, so re.VERBOSE
        # is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap a list of video ids into url_result dicts for the Youtube IE."""
        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
                for vid_id in ids]

    def _extract_mix(self, playlist_id):
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
        title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
            get_element_by_attribute('class', 'title ', webpage))
        title = clean_html(title_span)
        video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage))
        url_results = self._ids_to_results(ids)

        return self.playlist_result(url_results, playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            else:
                self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)

        # Extract the video ids from the playlist pages
        ids = []

        for page_num in itertools.count(1):
            url = self._TEMPLATE_URL % (playlist_id, page_num)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            matches = re.finditer(self._VIDEO_RE, page)
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
            ids.extend(new_ids)

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                break

        playlist_title = self._og_search_title(page)

        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeChannelIE(InfoExtractor):
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, in order, without duplicates."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(url, channel_id)
        # Auto-generated channels mark themselves in the page markup.
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id)
                       for video_id in video_ids]
        return self.playlist_result(url_entries, channel_id)
class YoutubeUserIE(InfoExtractor):
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex would is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        url_results = [
            self.url_result(video_id, 'Youtube', video_id=video_id)
            for video_id in video_ids]
        return self.playlist_result(url_results, playlist_title=username)
class YoutubeSearchIE(SearchInfoExtractor):
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # Upper bound enforced by SearchInfoExtractor on "ytsearchN:" queries.
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the reported total is exhausted.
        while (50 * pagenum) < limit:
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            data = self._download_webpage(result_url, u'query "%s"' % query,
                u'Downloading page %s' % pagenum, u'Unable to download API page')
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result(video_id, 'Youtube', video_id=video_id)
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeSearchDateIE(YoutubeSearchIE):
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    # Same gdata endpoint as the parent class, with orderby=published added
    # so results come back newest-first.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
    _SEARCH_KEY = 'ytsearchdate'
    IE_DESC = u'YouTube.com searches, newest videos first'
class YoutubeShowIE(InfoExtractor):
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Resolve a show page into one playlist result per season."""
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is exposed as its own playlist link.
        season_paths = [m.group(1)
                        for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(
                self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Leaves one %s placeholder for the paging token (note the %%s).
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        for i in itertools.count(1):
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids)
            # A null paging token signals the last page of the feed.
            if info['paging'] is None:
                break
            paging = info['paging']
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    # Matches the feed URL or the ":ytsubs"/":ytsubscriptions" pseudo-URL.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Matches the feed URL or the ":ytrec"/":ytrecommended" pseudo-URL.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    # Watch-later is per-user, so the personal-feed ajax action is required.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Fixed: this was a plain u'' literal, unlike every sibling extractor.
    # The '\.' escapes only work by accident in a non-raw string and trigger
    # invalid-escape warnings on newer Pythons; use a raw string.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    # History is per-user, so the personal-feed ajax action is required.
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch History'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Locate the user's favourites playlist id and hand off to the playlist IE."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the backing playlist in a list= parameter.
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')
1860 class YoutubeTruncatedURLIE(InfoExtractor):
1861 IE_NAME = 'youtube:truncated_url'
1862 IE_DESC = False # Do not list
1863 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1865 def _real_extract(self, url):
1866 raise ExtractorError(
1867 u'Did you forget to quote the URL? Remember that & is a meta '
1868 u'character in most shells, so you want to put the URL in quotes, '
1870 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1871 u' (or simply youtube-dl BaW_jenozKc ).',