14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Endpoint serving Google's account login form (used by the login flow).
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Forces English/US pages so scraped markup and dates are predictable.
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Age-verification confirmation endpoint.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name used to look up credentials in the user's .netrc file.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce on screen that the page language is about to be set."""
    notice = u'Setting language'
    self.to_screen(notice)
52 def _set_language(self):
53 request = compat_urllib_request.Request(self._LANG_URL)
56 compat_urllib_request.urlopen(request).read()
57 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
58 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
63 (username, password) = self._get_login_info()
64 # No authentication to be performed
66 if self._LOGIN_REQUIRED:
67 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
70 request = compat_urllib_request.Request(self._LOGIN_URL)
72 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
73 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
74 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
79 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
82 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
88 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
92 u'PersistentCookie': u'yes',
94 u'bgresponse': u'js_disabled',
95 u'checkConnection': u'',
96 u'checkedDomains': u'youtube',
102 u'signIn': u'Sign in',
104 u'service': u'youtube',
108 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
110 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
111 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
112 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
115 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
116 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
117 self._downloader.report_warning(u'unable to log in: bad username or password')
119 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
120 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
124 def _confirm_age(self):
127 'action_confirm': 'Confirm',
129 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
131 self.report_age_confirmation()
132 compat_urllib_request.urlopen(request).read().decode('utf-8')
133 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
134 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
137 def _real_initialize(self):
138 if self._downloader is None:
140 if not self._set_language():
142 if not self._login():
147 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
148 IE_DESC = u'YouTube.com'
151 (?:https?://)? # http(s):// (optional)
152 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
153 tube\.majestyc\.net/|
154 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
155 (?:.*?\#/)? # handle anchor (#/) redirect urls
156 (?: # the various things that can precede the ID:
157 (?:(?:v|embed|e)/) # v/ or embed/ or e/
158 |(?: # or the v= param in all its forms
159 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
160 (?:\?|\#!?) # the params delimiter ? or # or #!
161 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
165 |youtu\.be/ # just youtu.be/xxxx
167 )? # all until now is optional -> you can pass the naked ID
168 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
169 (?(1).+)? # if we found the ID, everything can follow
171 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
172 # Listed in order of quality
173 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
174 # Apple HTTP Live Streaming
175 '96', '95', '94', '93', '92', '132', '151',
177 '85', '84', '102', '83', '101', '82', '100',
179 '138', '137', '248', '136', '247', '135', '246',
180 '245', '244', '134', '243', '133', '242', '160',
182 '141', '172', '140', '171', '139',
184 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
185 # Apple HTTP Live Streaming
186 '96', '95', '94', '93', '92', '132', '151',
188 '85', '102', '84', '101', '83', '100', '82',
190 '138', '248', '137', '247', '136', '246', '245',
191 '244', '135', '243', '134', '242', '133', '160',
193 '172', '141', '171', '140', '139',
195 _video_formats_map = {
196 'flv': ['35', '34', '6', '5'],
197 '3gp': ['36', '17', '13'],
198 'mp4': ['38', '37', '22', '18'],
199 'webm': ['46', '45', '44', '43'],
201 _video_extensions = {
223 # Apple HTTP Live Streaming
255 _video_dimensions = {
337 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
338 u"file": u"BaW_jenozKc.mp4",
340 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
341 u"uploader": u"Philipp Hagemeister",
342 u"uploader_id": u"phihag",
343 u"upload_date": u"20121002",
344 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
348 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
349 u"file": u"1ltcDfZMA3U.flv",
350 u"note": u"Test VEVO video (#897)",
352 u"upload_date": u"20070518",
353 u"title": u"Maps - It Will Find You",
354 u"description": u"Music video by Maps performing It Will Find You.",
355 u"uploader": u"MuteUSA",
356 u"uploader_id": u"MuteUSA"
360 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
361 u"file": u"UxxajLWwzqY.mp4",
362 u"note": u"Test generic use_cipher_signature video (#897)",
364 u"upload_date": u"20120506",
365 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
366 u"description": u"md5:5b292926389560516e384ac437c0ec07",
367 u"uploader": u"Icona Pop",
368 u"uploader_id": u"IconaPop"
372 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
373 u"file": u"07FYdnEawAQ.mp4",
374 u"note": u"Test VEVO video with age protection (#956)",
376 u"upload_date": u"20130703",
377 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
378 u"description": u"md5:64249768eec3bc4276236606ea996373",
379 u"uploader": u"justintimberlakeVEVO",
380 u"uploader_id": u"justintimberlakeVEVO"
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Defer playlist URLs to YoutubePlaylistIE; otherwise match the
    # (verbose-mode) video URL pattern.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def __init__(self, *args, **kwargs):
    # Forward construction to the base extractor; _player_cache memoizes
    # extracted signature-decryption functions keyed by
    # (player_url, signature length) — see _decrypt_signature.
    super(YoutubeIE, self).__init__(*args, **kwargs)
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage is being downloaded."""
    notice = u'%s: Downloading video webpage' % video_id
    self.to_screen(notice)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is being downloaded."""
    notice = u'%s: Downloading video info webpage' % video_id
    self.to_screen(notice)
def report_information_extraction(self, video_id):
    """Announce that video information extraction has started."""
    notice = u'%s: Extracting video information' % video_id
    self.to_screen(notice)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available."""
    self.to_screen(u'%s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Announce that the download will proceed over the RTMP protocol."""
    notice = u'RTMP download detected'
    self.to_screen(notice)
416 def _extract_signature_function(self, video_id, player_url, slen):
417 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
419 player_type = id_m.group('ext')
420 player_id = id_m.group('id')
422 # Read from filesystem cache
423 func_id = '%s_%s_%d' % (player_type, player_id, slen)
424 assert os.path.basename(func_id) == func_id
425 cache_dir = get_cachedir(self._downloader.params)
427 cache_enabled = cache_dir is not None
429 cache_fn = os.path.join(os.path.expanduser(cache_dir),
433 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
434 cache_spec = json.load(cachef)
435 return lambda s: u''.join(s[i] for i in cache_spec)
437 pass # No cache available
439 if player_type == 'js':
440 code = self._download_webpage(
441 player_url, video_id,
442 note=u'Downloading %s player %s' % (player_type, player_id),
443 errnote=u'Download of %s failed' % player_url)
444 res = self._parse_sig_js(code)
445 elif player_type == 'swf':
446 urlh = self._request_webpage(
447 player_url, video_id,
448 note=u'Downloading %s player %s' % (player_type, player_id),
449 errnote=u'Download of %s failed' % player_url)
451 res = self._parse_sig_swf(code)
453 assert False, 'Invalid player type %r' % player_type
457 test_string = u''.join(map(compat_chr, range(slen)))
458 cache_res = res(test_string)
459 cache_spec = [ord(c) for c in cache_res]
461 os.makedirs(os.path.dirname(cache_fn))
462 except OSError as ose:
463 if ose.errno != errno.EEXIST:
465 write_json_file(cache_spec, cache_fn)
467 tb = traceback.format_exc()
468 self._downloader.report_warning(
469 u'Writing cache to %r failed: %s' % (cache_fn, tb))
473 def _print_sig_code(self, func, slen):
474 def gen_sig_code(idxs):
475 def _genslice(start, end, step):
476 starts = u'' if start == 0 else str(start)
477 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
478 steps = u'' if step == 1 else (u':%d' % step)
479 return u's[%s%s%s]' % (starts, ends, steps)
482 start = '(Never used)' # Quelch pyflakes warnings - start will be
483 # set as soon as step is set
484 for i, prev in zip(idxs[1:], idxs[:-1]):
488 yield _genslice(start, prev, step)
491 if i - prev in [-1, 1]:
496 yield u's[%d]' % prev
500 yield _genslice(start, i, step)
502 test_string = u''.join(map(compat_chr, range(slen)))
503 cache_res = func(test_string)
504 cache_spec = [ord(c) for c in cache_res]
505 expr_code = u' + '.join(gen_sig_code(cache_spec))
506 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
507 self.to_screen(u'Extracted signature function:\n' + code)
509 def _parse_sig_js(self, jscode):
510 funcname = self._search_regex(
511 r'signature=([a-zA-Z]+)', jscode,
512 u'Initial JS player signature function name')
517 return string.lowercase.index(varname)
519 def interpret_statement(stmt, local_vars, allow_recursion=20):
520 if allow_recursion < 0:
521 raise ExtractorError(u'Recursion limit reached')
523 if stmt.startswith(u'var '):
524 stmt = stmt[len(u'var '):]
525 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
526 r'=(?P<expr>.*)$', stmt)
528 if ass_m.groupdict().get('index'):
530 lvar = local_vars[ass_m.group('out')]
531 idx = interpret_expression(ass_m.group('index'),
532 local_vars, allow_recursion)
533 assert isinstance(idx, int)
536 expr = ass_m.group('expr')
539 local_vars[ass_m.group('out')] = val
541 expr = ass_m.group('expr')
542 elif stmt.startswith(u'return '):
544 expr = stmt[len(u'return '):]
546 raise ExtractorError(
547 u'Cannot determine left side of statement in %r' % stmt)
549 v = interpret_expression(expr, local_vars, allow_recursion)
552 def interpret_expression(expr, local_vars, allow_recursion):
557 return local_vars[expr]
559 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
561 member = m.group('member')
562 val = local_vars[m.group('in')]
563 if member == 'split("")':
565 if member == 'join("")':
567 if member == 'length':
569 if member == 'reverse()':
571 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
573 idx = interpret_expression(
574 slice_m.group('idx'), local_vars, allow_recursion-1)
578 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
580 val = local_vars[m.group('in')]
581 idx = interpret_expression(m.group('idx'), local_vars,
585 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
587 a = interpret_expression(m.group('a'),
588 local_vars, allow_recursion)
589 b = interpret_expression(m.group('b'),
590 local_vars, allow_recursion)
594 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
596 fname = m.group('func')
597 if fname not in functions:
598 functions[fname] = extract_function(fname)
599 argvals = [int(v) if v.isdigit() else local_vars[v]
600 for v in m.group('args').split(',')]
601 return functions[fname](argvals)
602 raise ExtractorError(u'Unsupported JS expression %r' % expr)
604 def extract_function(funcname):
606 r'function ' + re.escape(funcname) +
607 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
609 argnames = func_m.group('args').split(',')
612 local_vars = dict(zip(argnames, args))
613 for stmt in func_m.group('code').split(';'):
614 res = interpret_statement(stmt, local_vars)
618 initial_function = extract_function(funcname)
619 return lambda s: initial_function([s])
621 def _parse_sig_swf(self, file_contents):
622 if file_contents[1:3] != b'WS':
623 raise ExtractorError(
624 u'Not an SWF file; header is %r' % file_contents[:3])
625 if file_contents[:1] == b'C':
626 content = zlib.decompress(file_contents[8:])
628 raise NotImplementedError(u'Unsupported compression format %r' %
631 def extract_tags(content):
633 while pos < len(content):
634 header16 = struct.unpack('<H', content[pos:pos+2])[0]
636 tag_code = header16 >> 6
637 tag_len = header16 & 0x3f
639 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
641 assert pos+tag_len <= len(content)
642 yield (tag_code, content[pos:pos+tag_len])
646 for tag_code, tag in extract_tags(content)
648 p = code_tag.index(b'\0', 4) + 1
649 code_reader = io.BytesIO(code_tag[p:])
651 # Parse ABC (AVM2 ByteCode)
652 def read_int(reader=None):
660 b = struct.unpack('<B', buf)[0]
661 res = res | ((b & 0x7f) << shift)
667 def u30(reader=None):
668 res = read_int(reader)
669 assert res & 0xf0000000 == 0
673 def s32(reader=None):
675 if v & 0x80000000 != 0:
676 v = - ((v ^ 0xffffffff) + 1)
679 def read_string(reader=None):
683 resb = reader.read(slen)
684 assert len(resb) == slen
685 return resb.decode('utf-8')
687 def read_bytes(count, reader=None):
690 resb = reader.read(count)
691 assert len(resb) == count
694 def read_byte(reader=None):
695 resb = read_bytes(1, reader=reader)
696 res = struct.unpack('<B', resb)[0]
699 # minor_version + major_version
704 for _c in range(1, int_count):
707 for _c in range(1, uint_count):
710 read_bytes((double_count-1) * 8)
712 constant_strings = [u'']
713 for _c in range(1, string_count):
715 constant_strings.append(s)
716 namespace_count = u30()
717 for _c in range(1, namespace_count):
721 for _c in range(1, ns_set_count):
723 for _c2 in range(count):
725 multiname_count = u30()
734 0x0e: 2, # MultinameA
735 0x1b: 1, # MultinameL
736 0x1c: 1, # MultinameLA
739 for _c in range(1, multiname_count):
741 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
743 u30() # namespace_idx
745 multinames.append(constant_strings[name_idx])
747 multinames.append('[MULTINAME kind: %d]' % kind)
748 for _c2 in range(MULTINAME_SIZES[kind]):
753 MethodInfo = collections.namedtuple(
755 ['NEED_ARGUMENTS', 'NEED_REST'])
757 for method_id in range(method_count):
760 for _ in range(param_count):
762 u30() # name index (always 0 for youtube)
764 if flags & 0x08 != 0:
767 for c in range(option_count):
770 if flags & 0x80 != 0:
771 # Param names present
772 for _ in range(param_count):
774 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
775 method_infos.append(mi)
778 metadata_count = u30()
779 for _c in range(metadata_count):
782 for _c2 in range(item_count):
786 def parse_traits_info():
787 trait_name_idx = u30()
788 kind_full = read_byte()
789 kind = kind_full & 0x0f
790 attrs = kind_full >> 4
792 if kind in [0x00, 0x06]: # Slot or Const
794 u30() # type_name_idx
798 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
801 methods[multinames[trait_name_idx]] = method_idx
802 elif kind == 0x04: # Class
805 elif kind == 0x05: # Function
808 methods[function_idx] = multinames[trait_name_idx]
810 raise ExtractorError(u'Unsupported trait kind %d' % kind)
812 if attrs & 0x4 != 0: # Metadata present
813 metadata_count = u30()
814 for _c3 in range(metadata_count):
815 u30() # metadata index
820 TARGET_CLASSNAME = u'SignatureDecipher'
821 searched_idx = multinames.index(TARGET_CLASSNAME)
822 searched_class_id = None
824 for class_id in range(class_count):
826 if name_idx == searched_idx:
827 # We found the class we're looking for!
828 searched_class_id = class_id
829 u30() # super_name idx
831 if flags & 0x08 != 0: # Protected namespace is present
832 u30() # protected_ns_idx
834 for _c2 in range(intrf_count):
838 for _c2 in range(trait_count):
841 if searched_class_id is None:
842 raise ExtractorError(u'Target class %r not found' %
847 for class_id in range(class_count):
850 for _c2 in range(trait_count):
851 trait_methods = parse_traits_info()
852 if class_id == searched_class_id:
853 method_names.update(trait_methods.items())
854 method_idxs.update(dict(
856 for name, idx in trait_methods.items()))
860 for _c in range(script_count):
863 for _c2 in range(trait_count):
867 method_body_count = u30()
868 Method = collections.namedtuple('Method', ['code', 'local_count'])
870 for _c in range(method_body_count):
874 u30() # init_scope_depth
875 u30() # max_scope_depth
877 code = read_bytes(code_length)
878 if method_idx in method_idxs:
879 m = Method(code, local_count)
880 methods[method_idxs[method_idx]] = m
881 exception_count = u30()
882 for _c2 in range(exception_count):
889 for _c2 in range(trait_count):
892 assert p + code_reader.tell() == len(code_tag)
893 assert len(methods) == len(method_idxs)
895 method_pyfunctions = {}
897 def extract_function(func_name):
898 if func_name in method_pyfunctions:
899 return method_pyfunctions[func_name]
900 if func_name not in methods:
901 raise ExtractorError(u'Cannot find function %r' % func_name)
902 m = methods[func_name]
905 registers = ['(this)'] + list(args) + [None] * m.local_count
907 coder = io.BytesIO(m.code)
909 opcode = struct.unpack('!B', coder.read(1))[0]
910 if opcode == 36: # pushbyte
911 v = struct.unpack('!B', coder.read(1))[0]
913 elif opcode == 44: # pushstring
915 stack.append(constant_strings[idx])
916 elif opcode == 48: # pushscope
917 # We don't implement the scope register, so we'll just
918 # ignore the popped value
920 elif opcode == 70: # callproperty
922 mname = multinames[index]
923 arg_count = u30(coder)
924 args = list(reversed(
925 [stack.pop() for _ in range(arg_count)]))
927 if mname == u'split':
928 assert len(args) == 1
929 assert isinstance(args[0], compat_str)
930 assert isinstance(obj, compat_str)
934 res = obj.split(args[0])
936 elif mname == u'slice':
937 assert len(args) == 1
938 assert isinstance(args[0], int)
939 assert isinstance(obj, list)
942 elif mname == u'join':
943 assert len(args) == 1
944 assert isinstance(args[0], compat_str)
945 assert isinstance(obj, list)
946 res = args[0].join(obj)
948 elif mname in method_pyfunctions:
949 stack.append(method_pyfunctions[mname](args))
951 raise NotImplementedError(
952 u'Unsupported property %r on %r'
954 elif opcode == 72: # returnvalue
957 elif opcode == 79: # callpropvoid
959 mname = multinames[index]
960 arg_count = u30(coder)
961 args = list(reversed(
962 [stack.pop() for _ in range(arg_count)]))
964 if mname == u'reverse':
965 assert isinstance(obj, list)
968 raise NotImplementedError(
969 u'Unsupported (void) property %r on %r'
971 elif opcode == 93: # findpropstrict
973 mname = multinames[index]
974 res = extract_function(mname)
976 elif opcode == 97: # setproperty
981 assert isinstance(obj, list)
982 assert isinstance(idx, int)
984 elif opcode == 98: # getlocal
986 stack.append(registers[index])
987 elif opcode == 99: # setlocal
990 registers[index] = value
991 elif opcode == 102: # getproperty
993 pname = multinames[index]
994 if pname == u'length':
996 assert isinstance(obj, list)
997 stack.append(len(obj))
998 else: # Assume attribute access
1000 assert isinstance(idx, int)
1002 assert isinstance(obj, list)
1003 stack.append(obj[idx])
1004 elif opcode == 128: # coerce
1006 elif opcode == 133: # coerce_s
1007 assert isinstance(stack[-1], (type(None), compat_str))
1008 elif opcode == 164: # modulo
1009 value2 = stack.pop()
1010 value1 = stack.pop()
1011 res = value1 % value2
1013 elif opcode == 208: # getlocal_0
1014 stack.append(registers[0])
1015 elif opcode == 209: # getlocal_1
1016 stack.append(registers[1])
1017 elif opcode == 210: # getlocal_2
1018 stack.append(registers[2])
1019 elif opcode == 211: # getlocal_3
1020 stack.append(registers[3])
1021 elif opcode == 214: # setlocal_2
1022 registers[2] = stack.pop()
1023 elif opcode == 215: # setlocal_3
1024 registers[3] = stack.pop()
1026 raise NotImplementedError(
1027 u'Unsupported opcode %d' % opcode)
1029 method_pyfunctions[func_name] = resfunc
1032 initial_function = extract_function(u'decipher')
1033 return lambda s: initial_function([s])
1035 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1036 """Turn the encrypted s field into a working signature"""
1038 if player_url is not None:
1040 player_id = (player_url, len(s))
1041 if player_id not in self._player_cache:
1042 func = self._extract_signature_function(
1043 video_id, player_url, len(s)
1045 self._player_cache[player_id] = func
1046 func = self._player_cache[player_id]
1047 if self._downloader.params.get('youtube_print_sig_code'):
1048 self._print_sig_code(func, len(s))
1051 tb = traceback.format_exc()
1052 self._downloader.report_warning(
1053 u'Automatic signature extraction failed: ' + tb)
1055 self._downloader.report_warning(
1056 u'Warning: Falling back to static signature algorithm')
1058 return self._static_decrypt_signature(
1059 s, video_id, player_url, age_gate)
1061 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1063 # The videos with age protection use another player, so the
1064 # algorithms can be different.
1066 return s[2:63] + s[82] + s[64:82] + s[63]
1069 return s[86:29:-1] + s[88] + s[28:5:-1]
1071 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1073 return s[84:27:-1] + s[86] + s[26:5:-1]
1075 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1077 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1079 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1081 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1083 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1085 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1087 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1089 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1091 return s[80:37:-1] + s[7] + s[36:7:-1] + s[0] + s[6:0:-1] + s[37]
1093 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1095 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1097 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1100 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1102 def _get_available_subtitles(self, video_id):
1104 sub_list = self._download_webpage(
1105 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1106 video_id, note=False)
1107 except ExtractorError as err:
1108 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1110 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1115 params = compat_urllib_parse.urlencode({
1118 'fmt': self._downloader.params.get('subtitlesformat'),
1120 url = u'http://www.youtube.com/api/timedtext?' + params
1121 sub_lang_list[lang] = url
1122 if not sub_lang_list:
1123 self._downloader.report_warning(u'video doesn\'t have subtitles')
1125 return sub_lang_list
1127 def _get_available_automatic_caption(self, video_id, webpage):
1128 """We need the webpage for getting the captions url, pass it as an
1129 argument to speed up the process."""
1130 sub_format = self._downloader.params.get('subtitlesformat')
1131 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1132 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1133 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1135 self._downloader.report_warning(err_msg)
1137 player_config = json.loads(mobj.group(1))
1139 args = player_config[u'args']
1140 caption_url = args[u'ttsurl']
1141 timestamp = args[u'timestamp']
1142 # We get the available subtitles
1143 list_params = compat_urllib_parse.urlencode({
1148 list_url = caption_url + '&' + list_params
1149 list_page = self._download_webpage(list_url, video_id)
1150 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1151 original_lang_node = caption_list.find('track')
1152 if original_lang_node.attrib.get('kind') != 'asr' :
1153 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1155 original_lang = original_lang_node.attrib['lang_code']
1158 for lang_node in caption_list.findall('target'):
1159 sub_lang = lang_node.attrib['lang_code']
1160 params = compat_urllib_parse.urlencode({
1161 'lang': original_lang,
1167 sub_lang_list[sub_lang] = caption_url + '&' + params
1168 return sub_lang_list
1169 # An extractor error can be raise by the download process if there are
1170 # no automatic captions but there are subtitles
1171 except (KeyError, ExtractorError):
1172 self._downloader.report_warning(err_msg)
1175 def _print_formats(self, formats):
1176 print('Available formats:')
1178 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1179 self._video_dimensions.get(x, '???'),
1180 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1182 def _extract_id(self, url):
1183 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1185 raise ExtractorError(u'Invalid URL: %s' % url)
1186 video_id = mobj.group(2)
1189 def _get_video_url_list(self, url_map):
1191 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1192 with the requested formats.
1194 req_format = self._downloader.params.get('format', None)
1195 format_limit = self._downloader.params.get('format_limit', None)
1196 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1197 if format_limit is not None and format_limit in available_formats:
1198 format_list = available_formats[available_formats.index(format_limit):]
1200 format_list = available_formats
1201 existing_formats = [x for x in format_list if x in url_map]
1202 if len(existing_formats) == 0:
1203 raise ExtractorError(u'no known formats available for video')
1204 if self._downloader.params.get('listformats', None):
1205 self._print_formats(existing_formats)
1207 if req_format is None or req_format == 'best':
1208 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1209 elif req_format == 'worst':
1210 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1211 elif req_format in ('-1', 'all'):
1212 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1214 # Specific formats. We pick the first in a slash-delimeted sequence.
1215 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1216 # available in the specified format. For example,
1217 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1218 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1219 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1220 req_formats = req_format.split('/')
1221 video_url_list = None
1222 for rf in req_formats:
1224 video_url_list = [(rf, url_map[rf])]
1226 if rf in self._video_formats_map:
1227 for srf in self._video_formats_map[rf]:
1229 video_url_list = [(srf, url_map[srf])]
1234 if video_url_list is None:
1235 raise ExtractorError(u'requested format not available')
1236 return video_url_list
1238 def _extract_from_m3u8(self, manifest_url, video_id):
1240 def _get_urls(_manifest):
1241 lines = _manifest.split('\n')
1242 urls = filter(lambda l: l and not l.startswith('#'),
1245 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1246 formats_urls = _get_urls(manifest)
1247 for format_url in formats_urls:
1248 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1249 url_map[itag] = format_url
1252 def _real_extract(self, url):
1253 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1254 mobj = re.search(self._NEXT_URL_RE, url)
1256 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1257 video_id = self._extract_id(url)
1260 self.report_video_webpage_download(video_id)
1261 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1262 request = compat_urllib_request.Request(url)
1264 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1265 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1266 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1268 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1270 # Attempt to extract SWF player URL
1271 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1272 if mobj is not None:
1273 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1278 self.report_video_info_webpage_download(video_id)
1279 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1280 self.report_age_confirmation()
1282 # We simulate the access to the video from www.youtube.com/v/{video_id}
1283 # this can be viewed without login into Youtube
1284 data = compat_urllib_parse.urlencode({'video_id': video_id,
1288 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1292 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1293 video_info_webpage = self._download_webpage(video_info_url, video_id,
1295 errnote='unable to download video info webpage')
1296 video_info = compat_parse_qs(video_info_webpage)
1299 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1300 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1301 % (video_id, el_type))
1302 video_info_webpage = self._download_webpage(video_info_url, video_id,
1304 errnote='unable to download video info webpage')
1305 video_info = compat_parse_qs(video_info_webpage)
1306 if 'token' in video_info:
1308 if 'token' not in video_info:
1309 if 'reason' in video_info:
1310 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1312 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1314 # Check for "rental" videos
1315 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1316 raise ExtractorError(u'"rental" videos not supported')
1318 # Start extracting information
1319 self.report_information_extraction(video_id)
1322 if 'author' not in video_info:
1323 raise ExtractorError(u'Unable to extract uploader name')
1324 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1327 video_uploader_id = None
1328 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1329 if mobj is not None:
1330 video_uploader_id = mobj.group(1)
1332 self._downloader.report_warning(u'unable to extract uploader nickname')
1335 if 'title' in video_info:
1336 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1338 self._downloader.report_warning(u'Unable to extract video title')
1342 # We try first to get a high quality image:
1343 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1344 video_webpage, re.DOTALL)
1345 if m_thumb is not None:
1346 video_thumbnail = m_thumb.group(1)
1347 elif 'thumbnail_url' not in video_info:
1348 self._downloader.report_warning(u'unable to extract video thumbnail')
1349 video_thumbnail = None
1350 else: # don't panic if we can't find it
1351 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1355 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1356 if mobj is not None:
1357 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1358 upload_date = unified_strdate(upload_date)
1361 video_description = get_element_by_id("eow-description", video_webpage)
1362 if video_description:
1363 video_description = clean_html(video_description)
1365 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1367 video_description = unescapeHTML(fd_mobj.group(1))
1369 video_description = u''
1372 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1374 if self._downloader.params.get('listsubtitles', False):
1375 self._list_available_subtitles(video_id, video_webpage)
1378 if 'length_seconds' not in video_info:
1379 self._downloader.report_warning(u'unable to extract video duration')
1382 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1384 # Decide which formats to download
1387 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1389 raise ValueError('Could not find vevo ID')
1390 info = json.loads(mobj.group(1))
1392 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1393 # this signatures are encrypted
1394 if 'url_encoded_fmt_stream_map' not in args:
1395 raise ValueError(u'No stream_map present') # caught below
1396 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1398 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1399 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1400 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1402 if 'url_encoded_fmt_stream_map' in video_info:
1403 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1405 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1406 elif 'adaptive_fmts' in video_info:
1407 if 'url_encoded_fmt_stream_map' in video_info:
1408 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1410 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1414 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1415 self.report_rtmp_download()
1416 video_url_list = [(None, video_info['conn'][0])]
1417 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1418 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1419 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1421 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1422 url_data = compat_parse_qs(url_data_str)
1423 if 'itag' in url_data and 'url' in url_data:
1424 url = url_data['url'][0]
1425 if 'sig' in url_data:
1426 url += '&signature=' + url_data['sig'][0]
1427 elif 's' in url_data:
1428 encrypted_sig = url_data['s'][0]
1429 if self._downloader.params.get('verbose'):
1431 if player_url is None:
1432 player_version = 'unknown'
1434 player_version = self._search_regex(
1435 r'-(.+)\.swf$', player_url,
1436 u'flash player', fatal=False)
1437 player_desc = 'flash player %s' % player_version
1439 player_version = self._search_regex(
1440 r'html5player-(.+?)\.js', video_webpage,
1441 'html5 player', fatal=False)
1442 player_desc = u'html5 player %s' % player_version
1444 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1445 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1446 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1449 jsplayer_url_json = self._search_regex(
1450 r'"assets":.+?"js":\s*("[^"]+")',
1451 video_webpage, u'JS player URL')
1452 player_url = json.loads(jsplayer_url_json)
1454 signature = self._decrypt_signature(
1455 encrypted_sig, video_id, player_url, age_gate)
1456 url += '&signature=' + signature
1457 if 'ratebypass' not in url:
1458 url += '&ratebypass=yes'
1459 url_map[url_data['itag'][0]] = url
1460 video_url_list = self._get_video_url_list(url_map)
1461 if not video_url_list:
1463 elif video_info.get('hlsvp'):
1464 manifest_url = video_info['hlsvp'][0]
1465 url_map = self._extract_from_m3u8(manifest_url, video_id)
1466 video_url_list = self._get_video_url_list(url_map)
1467 if not video_url_list:
1471 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1474 for format_param, video_real_url in video_url_list:
1476 video_extension = self._video_extensions.get(format_param, 'flv')
1478 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1479 self._video_dimensions.get(format_param, '???'),
1480 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1484 'url': video_real_url,
1485 'uploader': video_uploader,
1486 'uploader_id': video_uploader_id,
1487 'upload_date': upload_date,
1488 'title': video_title,
1489 'ext': video_extension,
1490 'format': video_format,
1491 'thumbnail': video_thumbnail,
1492 'description': video_description,
1493 'player_url': player_url,
1494 'subtitles': video_subtitles,
1495 'duration': video_duration,
1496 'age_limit': 18 if age_gate else 0,
class YoutubePlaylistIE(InfoExtractor):
    # NOTE(review): several source lines in this region appear to have been
    # lost in transit: the closing of the _VALID_URL raw string, the
    # "if mobj is None:" guard before the raise, the "try:" that pairs with
    # the "except ValueError" below, the pagination "break" statements, and
    # the "videos.append((" wrapping the watch-URL expression.  Comments
    # only here -- the surviving code is left byte-identical; the missing
    # statements should be restored from upstream history.
    IE_DESC = u'YouTube.com playlists'
    # Verbose pattern (matched with re.VERBOSE in suitable/_real_extract):
    # one capture group takes playlist ids from query-string style URLs,
    # the other takes bare playlist ids.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData API endpoint; substitutions: playlist id, page size, 1-based
    # start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): presumably decorated with @classmethod upstream
        # (it takes cls) -- decorator not visible in this view.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the "if mobj is None:" guard for this raise seems
        # to be missing from this view.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # Either group may be empty; take whichever matched.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                # --no-playlist: hand the single video off to YoutubeIE.
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API caps start-index; results beyond are
                # unreachable.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            # NOTE(review): the "try:" pairing with the except below is
            # missing from this view.
            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                # NOTE(review): loop-exit statement missing here in this view.

            for entry in response['feed']['entry']:
                # 'yt$position' is the 1-based position within the playlist;
                # used below to restore order across pages.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    # NOTE(review): the enclosing "videos.append((" call is
                    # missing from this view.
                    'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Sort by playlist position, then drop the index, keeping only URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    # NOTE(review): a few statements are missing in this region of the file
    # (the "ids_in_page" initialisation and return in
    # extract_videos_from_page, the "if mobj is None:" guard, the
    # video_ids/pagenum initialisations, and the pagination loop's exit
    # statement).  Comments only -- code left byte-identical.
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is fetched as plain HTML from the channel's videos tab...
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker whose presence in a page means more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # ...subsequent pages come from the JSON c4_browse_ajax endpoint.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the distinct video ids linked from a channel page."""
        # NOTE(review): "ids_in_page = []" (and the trailing
        # "return ids_in_page") appear to be missing from this view.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard for this raise seems
        # to be missing from this view.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                # The ajax endpoint returns JSON with the HTML embedded in
                # 'content_html'.
                page = json.loads(page)
                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)
                # The load-more widget disappears on the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    # NOTE(review): loop-exit statement missing here in
                    # this view.

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    # NOTE(review): a few statements are missing in this region of the file
    # (the @classmethod decorator on suitable, the "if mobj is None:"
    # guard, the "try:" pairing with the except below, the
    # video_ids/ids_in_page initialisations, and the loop exits).
    # Comments only -- code left byte-identical.
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    # Negative lookaheads keep /watch and /feed URLs out of this extractor.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!watch(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 results, hence the paging below.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted by another youtube
        # extractor -- this regex is too permissive and would match those
        # URLs as well.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the "if mobj is None:" guard for this raise seems
        # to be missing from this view.
        raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got

        for pagenum in itertools.count(0):
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            # NOTE(review): the "try:" pairing with the except below is
            # missing from this view.
            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # Entry id is a URL; the video id is its last path segment.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    # NOTE(review): a few statements are missing in this region of the file
    # (the video_ids/pagenum/limit initialisations in _get_n_results, the
    # "try:" pairing with the except below, and the pagenum increment).
    # Comments only -- code left byte-identical.
    IE_DESC = u'YouTube.com searches'
    # jsonc search endpoint; substitutions: quoted query, 1-based start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Pages hold 50 results each; keep fetching until we have enough or
        # the API reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): the "try:" pairing with the except below is
            # missing from this view.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # NOTE(review): "'items' not in api_response" would be the
            # idiomatic spelling here.
            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API says exists.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows.

    A show page links to one playlist per season; every matching playlist
    link is handed off to YoutubePlaylistIE.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is published as its own playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        season_results = []
        for season in season_matches:
            season_results.append(self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist'))
        return season_results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so a logged-in session is mandatory.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # NOTE(review): presumably decorated with @property upstream -- it
        # is used as "self._FEED_TEMPLATE % paging" below; decorator not
        # visible in this view.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        # %%s survives this substitution so the caller can insert the
        # paging offset later.
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        # NOTE(review): orphaned return -- this looks like the body of a
        # missing IE_NAME property definition.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # NOTE(review): method body missing from this view.

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet de-duplicates while keeping first-seen order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the final page.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions feed."""
    # IE_DESC is user-facing (--list-extractors); added the missing space
    # before "(requires authentication)" for consistency with the sibling
    # feed extractors (YoutubeRecommendedIE, YoutubeWatchLaterIE).
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Feed parameters consumed by the YoutubeFeedsInfoExtractor base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's "Watch Later" list."""
    # Watch-later is account-specific, so the personal-feed ajax action is
    # used instead of the system-feed one.
    _PERSONAL_FEED = True
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are account-specific, so a logged-in session is mandatory.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist;
        # extract it and delegate the heavy lifting to YoutubePlaylistIE.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')
1832 class YoutubeTruncatedURLIE(InfoExtractor):
1833 IE_NAME = 'youtube:truncated_url'
1834 IE_DESC = False # Do not list
1835 _VALID_URL = r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$'
1837 def _real_extract(self, url):
1838 raise ExtractorError(
1839 u'Did you forget to quote the URL? Remember that & is a meta '
1840 u'character in most shells, so you want to put the URL in quotes, '
1842 u'\'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\''
1843 u' (or simply youtube-dl BaW_jenozKc ).',