14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # NOTE(review): this is a sampled excerpt — interleaved source lines
    # (try statements, return statements, some form fields) are missing.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Hit the language-selection URL; only the cookies it sets matter,
        # the response body is discarded.
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Best-effort: failure to pin the language is only a warning.
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # Credentials come from CLI options or the 'youtube' .netrc entry.
        (username, password) = self._get_login_info()
        # No authentication to be performed
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Anti-forgery tokens embedded in the ServiceLogin form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
            # Form fields mirror what a browser would submit to ServiceLogin.
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, auth failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # Submit the age-verification form so age-gated pages become reachable.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Unlike language/login, failing here is fatal for extraction.
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Runs once before any extraction: set language, log in, confirm age.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # NOTE(review): sampled excerpt — parts of the verbose URL regex, the
    # itag tables, the extension/dimension maps and the test fixtures are
    # elided between the lines reproduced below.
        (?:https?://)?                                       # http(s):// (optional)
        (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/|
           youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
         (?:.*?\#/)?                                         # handle anchor (#/) redirect urls
         (?:                                                 # the various things that can precede the ID:
             (?:(?:v|embed|e)/)                              # v/ or embed/ or e/
             |(?:                                            # or the v= param in all its forms
                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?   # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                 (?:\?|\#!?)                                 # the params delimiter ? or # or #!
                 (?:.*?&)?                                   # any other preceding param (like /?s=tuff&v=xxxx)
         |youtu\.be/                                         # just youtu.be/xxxx
         )?                                                  # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags, but free (WebM) containers ranked above MP4 at equal quality.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                                     # Apple HTTP Live Streaming
                                     '96', '95', '94', '93', '92', '132', '151',
                                     '85', '102', '84', '101', '83', '100', '82',
                                     '138', '248', '137', '247', '136', '246', '245',
                                     '244', '135', '243', '134', '242', '133', '160',
                                     '172', '141', '171', '140', '139',
    # itags grouped by container, best quality first.
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    _video_extensions = {
        # Apple HTTP Live Streaming
    _video_dimensions = {
    # Test fixtures: download tests compare these against the live site.
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:5b292926389560516e384ac437c0ec07",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
def suitable(cls, url):
    """Return True if this extractor should handle *url*.

    Playlist URLs are explicitly excluded so YoutubePlaylistIE wins.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and its per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps player URL -> extracted signature-decryption function.
    self._player_cache = dict()
def report_video_webpage_download(self, video_id):
    """Log that the watch page for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % (video_id,)
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the get_video_info page for *video_id* is being fetched."""
    message = u'%s: Downloading video info webpage' % (video_id,)
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Log the start of metadata extraction for *video_id*."""
    message = u'%s: Extracting video information' % (video_id,)
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Log that the requested *format* is not available for this video."""
    details = (video_id, format)
    self.to_screen(u'%s: Format %s not available' % details)
def report_rtmp_download(self):
    """Log that the video will be fetched over the RTMP protocol."""
    notice = u'RTMP download detected'
    self.to_screen(notice)
def _extract_signature_function(self, video_id, player_url, slen):
    """Build (and cache on disk) a signature-decryption function.

    The function is derived from the JS or SWF player at *player_url* and
    is specific to the signature length *slen*.
    NOTE(review): sampled excerpt — try/else lines are missing below.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Guard against path traversal via a crafted player URL.
    assert os.path.basename(func_id) == func_id
    xdg_cache_home = os.environ.get('XDG_CACHE_HOME')
        userCacheDir = os.path.join(xdg_cache_home, 'youtube-dl')
        userCacheDir = os.path.join(os.path.expanduser('~'), '.cache', 'youtube-dl')
    cache_dir = self._downloader.params.get('cachedir', userCacheDir)

    cache_enabled = cache_dir is not None
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
            # A cached spec is the permutation of character indices to apply.
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
            pass  # No cache available

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_swf(code)
        assert False, 'Invalid player type %r' % player_type

        # Serialise the function as the index permutation it applies to
        # the probe string 0..slen-1.
        test_string = u''.join(map(compat_chr, range(slen)))
        cache_res = res(test_string)
        cache_spec = [ord(c) for c in cache_res]
            os.makedirs(os.path.dirname(cache_fn))
        except OSError as ose:
            if ose.errno != errno.EEXIST:
        write_json_file(cache_spec, cache_fn)
        # Cache-write failures must never abort the download itself.
        tb = traceback.format_exc()
        self._downloader.report_warning(
            u'Writing cache to %r failed: %s' % (cache_fn, tb))
def _print_sig_code(self, func, slen):
    """Print Python source equivalent to the signature function *func*,
    suitable for pasting into _static_decrypt_signature."""
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # Render a python slice expression, omitting redundant parts.
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        # Walk consecutive index pairs, coalescing arithmetic runs
        # into slices and emitting single lookups otherwise.
        for i, prev in zip(idxs[1:], idxs[:-1]):
                yield _genslice(start, prev, step)
            if i - prev in [-1, 1]:
            yield u's[%d]' % prev
            yield _genslice(start, i, step)

    # Recover the index permutation by running func on 0..slen-1.
    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
    """Extract the signature function from the HTML5 player JS.

    Implements a tiny interpreter for the JavaScript subset the player
    uses: assignments, member access, slices, modulo, function calls.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

        # NOTE(review): string.lowercase is Python 2 only (Python 3 has
        # string.ascii_lowercase) — confirm against the module's compat layer.
        return string.lowercase.index(varname)

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        # Depth guard against pathological / adversarial player code.
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')

        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
            if ass_m.groupdict().get('index'):
                # Indexed assignment: out[index] = expr
                lvar = local_vars[ass_m.group('out')]
                idx = interpret_expression(ass_m.group('index'),
                                           local_vars, allow_recursion)
                assert isinstance(idx, int)
                expr = ass_m.group('expr')
                local_vars[ass_m.group('out')] = val
                expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            expr = stmt[len(u'return '):]
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)

    def interpret_expression(expr, local_vars, allow_recursion):
            return local_vars[expr]

        # Member access: var.split("") / join("") / length / reverse() / slice(i)
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
            if member == 'join("")':
            if member == 'length':
            if member == 'reverse()':
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)

            # Indexing: var[idx]
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,

        # Binary modulo: a % b
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)

            # Function call: name(arg, ...) — functions are extracted lazily.
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        # Locate the function body in the JS and wrap it as a Python callable.
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
        argnames = func_m.group('args').split(',')
            local_vars = dict(zip(argnames, args))
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)

    initial_function = extract_function(funcname)
    # The extracted function takes an argument list; adapt to 1-arg callable.
    return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
    """Extract the u'decipher' routine from the Flash player binary.

    Parses just enough of the SWF container and the embedded ABC
    (ActionScript Byte Code) to locate the SignatureDecipher class and
    interpret its methods with a minimal AVM2 stack machine.
    NOTE(review): sampled excerpt — many interleaved lines are missing.
    """
    # SWF magic is 'FWS' (uncompressed) or 'CWS' (zlib-compressed body).
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
        raise NotImplementedError(u'Unsupported compression format %r' %

    def extract_tags(content):
        # Yield (tag_code, payload) pairs of the SWF tag stream.
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
                # Long-form tag: real length follows in the next 4 bytes.
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])

        for tag_code, tag in extract_tags(content)
    # Skip flags + NUL-terminated name; the ABC blob follows.
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length u32: 7 payload bits per byte, high bit continues.
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)

    def u30(reader=None):
        res = read_int(reader)
        assert res & 0xf0000000 == 0

    def s32(reader=None):
        if v & 0x80000000 != 0:
            # Sign-extend from 32 bits (two's complement).
            v = - ((v ^ 0xffffffff) + 1)

    def read_string(reader=None):
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        resb = reader.read(count)
        assert len(resb) == count

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]

    # minor_version + major_version

    # Constant pool; entry 0 of each table is implicit, hence range(1, n).
    for _c in range(1, int_count):
    for _c in range(1, uint_count):
    read_bytes((double_count-1) * 8)

    constant_strings = [u'']
    for _c in range(1, string_count):
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
    for _c in range(1, ns_set_count):
        for _c2 in range(count):
    multiname_count = u30()
        # Extra u30 fields to skip per multiname kind.
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    for _c in range(1, multiname_count):
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            u30()  # namespace_idx
            multinames.append(constant_strings[name_idx])
            # Placeholder for kinds we do not resolve to a string.
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):

    # Method signatures.
    MethodInfo = collections.namedtuple(
        ['NEED_ARGUMENTS', 'NEED_REST'])
    for method_id in range(method_count):
        for _ in range(param_count):
        u30()  # name index (always 0 for youtube)
        if flags & 0x08 != 0:
            # HAS_OPTIONAL: skip the default-value table.
            for c in range(option_count):
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata (skipped).
    metadata_count = u30()
    for _c in range(metadata_count):
        for _c2 in range(item_count):

    def parse_traits_info():
        # Parse one trait; records method-name bindings we care about.
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # type_name_idx
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
        elif kind == 0x05:  # Function
            methods[function_idx] = multinames[trait_name_idx]
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

    # Classes: find the signature-decipher class by name.
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    for class_id in range(class_count):
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        for _c2 in range(intrf_count):
        for _c2 in range(trait_count):

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %

    # Second pass: collect the target class's method names and indices.
    for class_id in range(class_count):
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    for name, idx in trait_methods.items()))

    # Scripts (skipped).
    for _c in range(script_count):
        for _c2 in range(trait_count):

    # Method bodies: keep bytecode only for the methods we identified.
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    for _c in range(method_body_count):
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
        for _c2 in range(trait_count):

    # Sanity checks: the whole ABC blob was consumed, all bodies found.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        # Memoized translation of an ABC method into a Python callable
        # implemented as a minimal AVM2 stack-machine interpreter.
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

            # Register 0 is 'this'; then args, then locals.
            registers = ['(this)'] + list(args) + [None] * m.local_count
            coder = io.BytesIO(m.code)
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                elif opcode == 44:  # pushstring
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                elif opcode == 70:  # callproperty
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        res = obj.split(args[0])
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                    elif mname in method_pyfunctions:
                        # Call into another already-translated ABC method.
                        stack.append(method_pyfunctions[mname](args))
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                elif opcode == 72:  # returnvalue
                elif opcode == 79:  # callpropvoid
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                elif opcode == 93:  # findpropstrict
                    mname = multinames[index]
                    res = extract_function(mname)
                elif opcode == 97:  # setproperty
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                elif opcode == 98:  # getlocal
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    pname = multinames[index]
                    if pname == u'length':
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        assert isinstance(idx, int)
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc

    initial_function = extract_function(u'decipher')
    # The ABC method takes an argument list; adapt to a 1-arg callable.
    return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature"""
    # Prefer the per-player extracted function (cached by player URL);
    # fall back to the static table below on any failure.
    if player_url is not None:
            if player_url not in self._player_cache:
                func = self._extract_signature_function(
                    video_id, player_url, len(s)
                self._player_cache[player_url] = func
            func = self._player_cache[player_url]
            if self._downloader.params.get('youtube_print_sig_code'):
                self._print_sig_code(func, len(s))
            # Extraction failure is non-fatal: warn and fall through.
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Automatic signature extraction failed: ' + tb)

    self._downloader.report_warning(
        u'Warning: Falling back to static signature algorithm')
    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Hard-coded permutations, dispatched on len(s).

    NOTE(review): sampled excerpt — the `if/elif len(s) == N:` guards
    between the return expressions below are missing from this view.
    """
        # The videos with age protection use another player, so the
        # algorithms can be different.
        return s[2:63] + s[82] + s[64:82] + s[63]

    return s[86:29:-1] + s[88] + s[28:5:-1]
    return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
    return s[84:27:-1] + s[86] + s[26:5:-1]
    return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
    return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
    return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
    return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
    return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
    return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
    return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
    return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
    return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
    return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
    return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
    return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

    # Unknown length: nothing we can do statically.
    raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
def _get_available_subtitles(self, video_id):
    """Return a dict mapping language code -> timedtext URL (may be empty)."""
        sub_list = self._download_webpage(
            'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
            video_id, note=False)
    except ExtractorError as err:
        # Best-effort: missing subtitles only produce a warning.
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
    lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        # Build one timedtext URL per advertised language.
        params = compat_urllib_parse.urlencode({
            'fmt': self._downloader.params.get('subtitlesformat'),
        url = u'http://www.youtube.com/api/timedtext?' + params
        sub_lang_list[lang] = url
    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
    return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process."""
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    # The caption base URL lives in the inline player config JSON.
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        self._downloader.report_warning(err_msg)
        player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
        list_url = caption_url + '&' + list_params
        list_page = self._download_webpage(list_url, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        # 'asr' marks the machine-generated (speech-recognition) track.
        original_lang_node = caption_list.find('track')
        if original_lang_node.attrib.get('kind') != 'asr' :
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
        original_lang = original_lang_node.attrib['lang_code']

        # One translated-caption URL per available target language.
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
def _print_formats(self, formats):
    """Print itag, container extension and dimensions for each format."""
    print('Available formats:')
        # Falls back to 'flv' / '???' for itags missing from the tables.
        print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                    self._video_dimensions.get(x, '???'),
                                    ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
    """Return the 11-character video id matched by _VALID_URL (group 2)."""
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)
    video_id = mobj.group(2)
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.
    """
    req_format = self._downloader.params.get('format', None)
    format_limit = self._downloader.params.get('format_limit', None)
    available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
    # --max-quality caps the list at the given itag.
    if format_limit is not None and format_limit in available_formats:
        format_list = available_formats[available_formats.index(format_limit):]
        format_list = available_formats
    existing_formats = [x for x in format_list if x in url_map]
    if len(existing_formats) == 0:
        raise ExtractorError(u'no known formats available for video')
    if self._downloader.params.get('listformats', None):
        self._print_formats(existing_formats)
    if req_format is None or req_format == 'best':
        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
    elif req_format == 'worst':
        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
    elif req_format in ('-1', 'all'):
        video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # Specific formats. We pick the first in a slash-delimeted sequence.
        # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
        # available in the specified format. For example,
        # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
        # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
                video_url_list = [(rf, url_map[rf])]
            # Container alias (mp4/flv/...): try its itags best-first.
            if rf in self._video_formats_map:
                for srf in self._video_formats_map[rf]:
                        video_url_list = [(srf, url_map[srf])]
    if video_url_list is None:
        raise ExtractorError(u'requested format not available')
    return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
    """Build an {itag: url} map from an HLS (m3u8) variant manifest."""
    def _get_urls(_manifest):
        # Non-comment, non-empty lines of an m3u8 are the variant URLs.
        lines = _manifest.split('\n')
        urls = filter(lambda l: l and not l.startswith('#'),
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    formats_urls = _get_urls(manifest)
    for format_url in formats_urls:
        # The itag is encoded in the variant URL path.
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
1254 def _real_extract(self, url):
1255 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1256 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1258 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1259 mobj = re.search(self._NEXT_URL_RE, url)
1261 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1262 video_id = self._extract_id(url)
1265 self.report_video_webpage_download(video_id)
1266 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1267 request = compat_urllib_request.Request(url)
1269 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1270 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1271 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1273 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1275 # Attempt to extract SWF player URL
1276 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1277 if mobj is not None:
1278 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1283 self.report_video_info_webpage_download(video_id)
1284 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1285 self.report_age_confirmation()
1287 # We simulate the access to the video from www.youtube.com/v/{video_id}
1288 # this can be viewed without login into Youtube
1289 data = compat_urllib_parse.urlencode({'video_id': video_id,
1293 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1297 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1298 video_info_webpage = self._download_webpage(video_info_url, video_id,
1300 errnote='unable to download video info webpage')
1301 video_info = compat_parse_qs(video_info_webpage)
1304 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1305 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1306 % (video_id, el_type))
1307 video_info_webpage = self._download_webpage(video_info_url, video_id,
1309 errnote='unable to download video info webpage')
1310 video_info = compat_parse_qs(video_info_webpage)
1311 if 'token' in video_info:
1313 if 'token' not in video_info:
1314 if 'reason' in video_info:
1315 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1317 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1319 # Check for "rental" videos
1320 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1321 raise ExtractorError(u'"rental" videos not supported')
1323 # Start extracting information
1324 self.report_information_extraction(video_id)
1327 if 'author' not in video_info:
1328 raise ExtractorError(u'Unable to extract uploader name')
1329 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1332 video_uploader_id = None
1333 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1334 if mobj is not None:
1335 video_uploader_id = mobj.group(1)
1337 self._downloader.report_warning(u'unable to extract uploader nickname')
1340 if 'title' not in video_info:
1341 raise ExtractorError(u'Unable to extract video title')
1342 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1345 # We try first to get a high quality image:
1346 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1347 video_webpage, re.DOTALL)
1348 if m_thumb is not None:
1349 video_thumbnail = m_thumb.group(1)
1350 elif 'thumbnail_url' not in video_info:
1351 self._downloader.report_warning(u'unable to extract video thumbnail')
1352 video_thumbnail = None
1353 else: # don't panic if we can't find it
1354 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1358 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1359 if mobj is not None:
1360 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1361 upload_date = unified_strdate(upload_date)
1364 video_description = get_element_by_id("eow-description", video_webpage)
1365 if video_description:
1366 video_description = clean_html(video_description)
1368 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1370 video_description = unescapeHTML(fd_mobj.group(1))
1372 video_description = u''
1375 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1377 if self._downloader.params.get('listsubtitles', False):
1378 self._list_available_subtitles(video_id, video_webpage)
1381 if 'length_seconds' not in video_info:
1382 self._downloader.report_warning(u'unable to extract video duration')
1385 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1387 # Decide which formats to download
1390 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1392 raise ValueError('Could not find vevo ID')
1393 info = json.loads(mobj.group(1))
1395 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1396 # this signatures are encrypted
1397 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1399 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1400 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1401 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1403 if 'url_encoded_fmt_stream_map' in video_info:
1404 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1406 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1407 elif 'adaptive_fmts' in video_info:
1408 if 'url_encoded_fmt_stream_map' in video_info:
1409 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1411 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1415 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1416 self.report_rtmp_download()
1417 video_url_list = [(None, video_info['conn'][0])]
1418 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1419 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1420 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1422 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1423 url_data = compat_parse_qs(url_data_str)
1424 if 'itag' in url_data and 'url' in url_data:
1425 url = url_data['url'][0]
1426 if 'sig' in url_data:
1427 url += '&signature=' + url_data['sig'][0]
1428 elif 's' in url_data:
1429 encrypted_sig = url_data['s'][0]
1430 if self._downloader.params.get('verbose'):
1432 if player_url is None:
1433 player_version = 'unknown'
1435 player_version = self._search_regex(
1436 r'-(.+)\.swf$', player_url,
1437 u'flash player', fatal=False)
1438 player_desc = 'flash player %s' % player_version
1440 player_version = self._search_regex(
1441 r'html5player-(.+?)\.js', video_webpage,
1442 'html5 player', fatal=False)
1443 player_desc = u'html5 player %s' % player_version
1445 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1446 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1447 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1450 jsplayer_url_json = self._search_regex(
1451 r'"assets":.+?"js":\s*("[^"]+")',
1452 video_webpage, u'JS player URL')
1453 player_url = json.loads(jsplayer_url_json)
1455 signature = self._decrypt_signature(
1456 encrypted_sig, video_id, player_url, age_gate)
1457 url += '&signature=' + signature
1458 if 'ratebypass' not in url:
1459 url += '&ratebypass=yes'
1460 url_map[url_data['itag'][0]] = url
1461 video_url_list = self._get_video_url_list(url_map)
1462 if not video_url_list:
1464 elif video_info.get('hlsvp'):
1465 manifest_url = video_info['hlsvp'][0]
1466 url_map = self._extract_from_m3u8(manifest_url, video_id)
1467 video_url_list = self._get_video_url_list(url_map)
1468 if not video_url_list:
1472 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1475 for format_param, video_real_url in video_url_list:
1477 video_extension = self._video_extensions.get(format_param, 'flv')
1479 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1480 self._video_dimensions.get(format_param, '???'),
1481 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1485 'url': video_real_url,
1486 'uploader': video_uploader,
1487 'uploader_id': video_uploader_id,
1488 'upload_date': upload_date,
1489 'title': video_title,
1490 'ext': video_extension,
1491 'format': video_format,
1492 'thumbnail': video_thumbnail,
1493 'description': video_description,
1494 'player_url': player_url,
1495 'subtitles': video_subtitles,
1496 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract every video of a YouTube playlist through the GData API,
    paging through the feed 50 entries at a time."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so the generic matcher in the
        # base class (which compiles without re.VERBOSE) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract the playlist id; either capture group may have matched.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # Download playlist videos from the API, one page per iteration.
        videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API rejects start-index values of 1000 and above.
                self._downloader.report_warning(u'Max number of results reached')
                break
            page = self._download_webpage(
                self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index),
                playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']))

        # Restore playlist order before emitting results.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos listed on a YouTube channel page, following the
    AJAX "load more" pagination until it is exhausted."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the unique video ids linked from ``page`` HTML, preserving
        order of first appearance.

        A companion set gives O(1) membership checks; the previous
        list-membership test made deduplication quadratic.
        """
        ids_in_page = []
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download first channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget HTML stops advertising "load more" on the last page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Renamed loop variable: the original shadowed the builtin ``id``.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match those too.
        other_ies = iter(klass for (name, klass) in globals().items()
                         if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers (last path component of the entry id).
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Handle "ytsearchN:query" searches through the GData API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Each API page returns at most 50 results; keep fetching until we
        # have enough or the reported total is exhausted.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # Idiom fix: "'items' not in ..." instead of "not 'items' in ...".
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # List comprehension instead of list(genexp); no builtin shadowing.
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # Never ask for more than the service says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a YouTube show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        match = re.match(self._VALID_URL, url)
        show_name = match.group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [self.url_result('https://www.youtube.com' + m.group(1), 'YoutubePlaylist')
                for m in seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so the login enforced by the base class is required.
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        # Pick the AJAX action matching the feed type; %%s keeps a literal
        # %s placeholder for the paging offset filled in by _real_extract.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            # Renamed loop variable: the original shadowed the builtin ``id``.
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Subscriptions feed; all fetching logic lives in YoutubeFeedsInfoExtractor."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed name slotted into the base class's feed_ajax URL template.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Recommended-videos feed; all fetching logic lives in YoutubeFeedsInfoExtractor."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed name slotted into the base class's feed_ajax URL template.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Watch-later list; all fetching logic lives in YoutubeFeedsInfoExtractor."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Feed name slotted into the base class's feed_ajax URL template.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch-later is account-specific, so use action_load_personal_feed.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are per-account; login is enforced by the base class.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds a "list=..." playlist id; extract it and
        # delegate the actual video extraction to YoutubePlaylistIE.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')