14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
37 class YoutubeBaseInfoExtractor(InfoExtractor):
38     """Provide base functions for Youtube extractors"""
# Endpoints for Google-account login, for forcing the en/US interface
# language (cookie side effect), and for confirming age-restricted access.
39     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
40     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
41     _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Machine name used to look up credentials in the user's .netrc file.
42     _NETRC_MACHINE = 'youtube'
43     # If True it will raise an error if no login info is provided
44     _LOGIN_REQUIRED = False
46     def report_lang(self):
47         """Report attempt to set language."""
48         self.to_screen(u'Setting language')
50     def _set_language(self):
# Fetch _LANG_URL purely for its cookie side effect; failure is
# non-fatal and only produces a warning.
# NOTE(review): the try: line opening this handler is elided in this excerpt.
51         request = compat_urllib_request.Request(self._LANG_URL)
54         compat_urllib_request.urlopen(request).read()
55         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56             self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# _login: authenticate against accounts.google.com with credentials from
# the command line or .netrc.  Missing credentials are fatal only when
# _LOGIN_REQUIRED is set; network errors merely warn.
# NOTE(review): the def line and several statements of _login are elided
# in this excerpt.
61         (username, password) = self._get_login_info()
62         # No authentication to be performed
64             if self._LOGIN_REQUIRED:
65                 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68         request = compat_urllib_request.Request(self._LOGIN_URL)
70             login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72             self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# GALX and dsh are hidden anti-forgery fields of the Google login form
# that must be scraped and echoed back in the POST below.
77         match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
80         match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Fields of the login form (login_form_strs); partially elided here.
86                 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
90                 u'PersistentCookie': u'yes',
92                 u'bgresponse': u'js_disabled',
93                 u'checkConnection': u'',
94                 u'checkedDomains': u'youtube',
100                 u'signIn': u'Sign in',
102                 u'service': u'youtube',
106         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
108         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110         request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
113             login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response page, the
# credentials were rejected.
114             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115                 self._downloader.report_warning(u'unable to log in: bad username or password')
117         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
122     def _confirm_age(self):
# POST the age-confirmation form; unlike login, a failure here raises.
125             'action_confirm': 'Confirm',
127         request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
129             self.report_age_confirmation()
130             compat_urllib_request.urlopen(request).read().decode('utf-8')
131         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132             raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
135     def _real_initialize(self):
# Set the language first, then log in; abort initialization early if
# either prerequisite step reports failure.
136         if self._downloader is None:
138         if not self._set_language():
140         if not self._login():
145 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
146     IE_DESC = u'YouTube.com'
# _VALID_URL (compiled with re.VERBOSE by users of this pattern) matches
# watch pages, embeds, youtu.be short links and bare 11-character IDs.
# NOTE(review): parts of the pattern are elided in this excerpt.
149                      (?:https?://)?                                       # http(s):// (optional)
150                      (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
151                         tube\.majestyc\.net/|
152                         youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
153                      (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
154                      (?:                                                  # the various things that can precede the ID:
155                          (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
156                          |(?:                                             # or the v= param in all its forms
157                              (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
158                              (?:\?|\#!?)                                  # the params delimiter ? or # or #!
159                              (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
163                      |youtu\.be/                                          # just youtu.be/xxxx
165                      )?                                                   # all until now is optional -> you can pass the naked ID
166                      ([0-9A-Za-z_-]{11})                                  # here is it! the YouTube video ID
167                      (?(1).+)?                                            # if we found the ID, everything can follow
# Captures the original URL from redirect pages (e.g. age verification).
169     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
170     # Listed in order of quality
171     _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
172                           # Apple HTTP Live Streaming
173                           '96', '95', '94', '93', '92', '132', '151',
175                           '85', '84', '102', '83', '101', '82', '100',
177                           '138', '137', '248', '136', '247', '135', '246',
178                           '245', '244', '134', '243', '133', '242', '160',
180                           '141', '172', '140', '171', '139',
# Same itags, but free (WebM) containers ranked above non-free ones.
182     _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
183                           # Apple HTTP Live Streaming
184                           '96', '95', '94', '93', '92', '132', '151',
186                           '85', '102', '84', '101', '83', '100', '82',
188                           '138', '248', '137', '247', '136', '246', '245',
189                           '244', '135', '243', '134', '242', '133', '160',
191                           '172', '141', '171', '140', '139',
# Maps container name -> itags, best quality first (used when a format
# is requested by extension, e.g. -f mp4).
193     _video_formats_map = {
194         'flv': ['35', '34', '6', '5'],
195         '3gp': ['36', '17', '13'],
196         'mp4': ['38', '37', '22', '18'],
197         'webm': ['46', '45', '44', '43'],
# itag -> file extension table (body elided in this excerpt).
199     _video_extensions = {
221         # Apple HTTP Live Streaming
# itag -> "WxH" resolution table (body elided in this excerpt).
253     _video_dimensions = {
# _TESTS: integration-test fixtures consumed by the test harness.
335             u"url":  u"http://www.youtube.com/watch?v=BaW_jenozKc",
336             u"file":  u"BaW_jenozKc.mp4",
338                 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339                 u"uploader": u"Philipp Hagemeister",
340                 u"uploader_id": u"phihag",
341                 u"upload_date": u"20121002",
342                 u"description": u"test chars:  \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
346             u"url":  u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347             u"file":  u"1ltcDfZMA3U.flv",
348             u"note": u"Test VEVO video (#897)",
350                 u"upload_date": u"20070518",
351                 u"title": u"Maps - It Will Find You",
352                 u"description": u"Music video by Maps performing It Will Find You.",
353                 u"uploader": u"MuteUSA",
354                 u"uploader_id": u"MuteUSA"
358             u"url":  u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359             u"file":  u"UxxajLWwzqY.mp4",
360             u"note": u"Test generic use_cipher_signature video (#897)",
362                 u"upload_date": u"20120506",
363                 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
364                 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
365                 u"uploader": u"Icona Pop",
366                 u"uploader_id": u"IconaPop"
370             u"url":  u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371             u"file":  u"07FYdnEawAQ.mp4",
372             u"note": u"Test VEVO video with age protection (#956)",
374                 u"upload_date": u"20130703",
375                 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376                 u"description": u"md5:64249768eec3bc4276236606ea996373",
377                 u"uploader": u"justintimberlakeVEVO",
378                 u"uploader_id": u"justintimberlakeVEVO"
382             u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383             u'file': u'TGi3HqYrWHE.mp4',
384             u'note': u'm3u8 video',
386                 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387                 u'description': u'- Men -  TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388                 u'uploader': u'olympic',
389                 u'upload_date': u'20120807',
390                 u'uploader_id': u'olympic',
# m3u8 streams cannot be checked by simple download in tests.
393                 u'skip_download': True,
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs are claimed by YoutubePlaylistIE, so reject them here.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and the per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps player URL -> extracted signature-decryption function, so each
    # player is downloaded and analyzed at most once per run.
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Log that the video webpage download is starting."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Log that the video info webpage download is starting."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Log that metadata extraction for the video has begun."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Log that the requested format is not available for this video.

    (The previous docstring, "Report extracted video URL.", was a
    copy-paste leftover and did not describe this method.)
    """
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Indicate the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
429     def _extract_signature_function(self, video_id, player_url, slen):
# Build a Python function that decrypts an encrypted signature of length
# slen for the given player.  The result is memoized on disk (unless the
# cache dir is u'NONE') as a JSON permutation spec.
# NOTE(review): several lines of this method are elided in this excerpt.
430         id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
432         player_type = id_m.group('ext')
433         player_id = id_m.group('id')
435         # Read from filesystem cache
436         func_id = '%s_%s_%d' % (player_type, player_id, slen)
# func_id doubles as the cache file name, so it must not contain path
# separators.
437         assert os.path.basename(func_id) == func_id
438         cache_dir = self._downloader.params.get('cachedir',
439                                                 u'~/.youtube-dl/cache')
441         if cache_dir != u'NONE':
442             cache_fn = os.path.join(os.path.expanduser(cache_dir),
446                 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
447                     cache_spec = json.load(cachef)
# The cached spec is a character permutation: output position i takes
# input character cache_spec[i].
448                 return lambda s: u''.join(s[i] for i in cache_spec)
450                 pass  # No cache available
452         if player_type == 'js':
453             code = self._download_webpage(
454                 player_url, video_id,
455                 note=u'Downloading %s player %s' % (player_type, player_id),
456                 errnote=u'Download of %s failed' % player_url)
457             res = self._parse_sig_js(code)
458         elif player_type == 'swf':
459             urlh = self._request_webpage(
460                 player_url, video_id,
461                 note=u'Downloading %s player %s' % (player_type, player_id),
462                 errnote=u'Download of %s failed' % player_url)
464             res = self._parse_sig_swf(code)
466             assert False, 'Invalid player type %r' % player_type
468         if cache_dir is not False:
# Apply the extracted function to the identity string to recover the
# permutation spec, then persist it; cache-write failures only warn.
470                 cache_res = res(map(compat_chr, range(slen)))
471                 cache_spec = [ord(c) for c in cache_res]
473                     os.makedirs(os.path.dirname(cache_fn))
474                 except OSError as ose:
# EEXIST is fine (directory already created); re-raise anything else.
475                     if ose.errno != errno.EEXIST:
477                 write_json_file(cache_spec, cache_fn)
479                 tb = traceback.format_exc()
480                 self._downloader.report_warning(
481                     u'Writing cache to %r failed: %s' % (cache_fn, tb))
485     def _print_sig_code(self, func, slen):
# Debug helper (--youtube-print-sig-code): print Python source that is
# equivalent to the extracted signature function, so it can be pasted
# into _static_decrypt_signature.
# NOTE(review): several lines of this method are elided in this excerpt.
486         def gen_sig_code(idxs):
487             def _genslice(start, end, step):
# Render a run of evenly spaced indices as a compact slice expression.
488                 starts = u'' if start == 0 else str(start)
489                 ends = u':%d' % (end+step)
490                 steps = u'' if step == 1 else (':%d' % step)
491                 return u's[%s%s%s]' % (starts, ends, steps)
494             start = '(Never used)'  # Quelch pyflakes warnings - start will be
495                                     # set as soon as step is set
# Walk consecutive index pairs, emitting a slice whenever the stride
# changes and a single-element access for isolated indices.
496             for i, prev in zip(idxs[1:], idxs[:-1]):
500                     yield _genslice(start, prev, step)
503                 if i - prev in [-1, 1]:
508                 yield u's[%d]' % prev
512                 yield _genslice(start, i, step)
# Recover the permutation by running func over the identity string.
514         cache_res = func(map(compat_chr, range(slen)))
515         cache_spec = [ord(c) for c in cache_res]
516         expr_code = u' + '.join(gen_sig_code(cache_spec))
517         code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
518         self.to_screen(u'Extracted signature:\n' + code)
520     def _parse_sig_js(self, jscode):
# Locate the JS signature function by name and build an interpreter for
# the small JavaScript subset the HTML5 players use (assignments,
# returns, split/join/length/reverse/slice, indexing, %, calls).
# NOTE(review): several lines of this method are elided in this excerpt.
521         funcname = self._search_regex(
522             r'signature=([a-zA-Z]+)', jscode,
523             u'Initial JS player signature function name')
528             return string.lowercase.index(varname)
530         def interpret_statement(stmt, local_vars, allow_recursion=20):
# allow_recursion bounds nested evaluation to avoid runaway recursion
# on malformed player code.
531             if allow_recursion < 0:
532                 raise ExtractorError(u'Recursion limit reached')
534             if stmt.startswith(u'var '):
535                 stmt = stmt[len(u'var '):]
536             ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
537                              r'=(?P<expr>.*)$', stmt)
# Indexed assignment (out[index] = expr) vs. plain assignment.
539                 if ass_m.groupdict().get('index'):
541                     lvar = local_vars[ass_m.group('out')]
542                     idx = interpret_expression(ass_m.group('index'),
543                                                local_vars, allow_recursion)
544                     assert isinstance(idx, int)
547                     expr = ass_m.group('expr')
550                         local_vars[ass_m.group('out')] = val
552                     expr = ass_m.group('expr')
553             elif stmt.startswith(u'return '):
555                 expr = stmt[len(u'return '):]
557                 raise ExtractorError(
558                     u'Cannot determine left side of statement in %r' % stmt)
560             v = interpret_expression(expr, local_vars, allow_recursion)
563         def interpret_expression(expr, local_vars, allow_recursion):
# A bare lowercase identifier is a local-variable reference.
568                 return local_vars[expr]
# Member access: variable.member
570             m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
572                 member = m.group('member')
573                 val = local_vars[m.group('in')]
574                 if member == 'split("")':
576                 if member == 'join("")':
578                 if member == 'length':
580                 if member == 'reverse()':
582                 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
584                     idx = interpret_expression(
585                         slice_m.group('idx'), local_vars, allow_recursion-1)
# Indexing: variable[expr]
589                 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
591                 val = local_vars[m.group('in')]
592                 idx = interpret_expression(m.group('idx'), local_vars,
# Modulo is the only binary arithmetic operator supported here.
596             m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
598                 a = interpret_expression(m.group('a'),
599                                          local_vars, allow_recursion)
600                 b = interpret_expression(m.group('b'),
601                                          local_vars, allow_recursion)
# Function call: lazily extract the callee by name, then invoke it.
605                 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
607                 fname = m.group('func')
608                 if fname not in functions:
609                     functions[fname] = extract_function(fname)
610                 argvals = [int(v) if v.isdigit() else local_vars[v]
611                            for v in m.group('args').split(',')]
612                 return functions[fname](argvals)
613             raise ExtractorError(u'Unsupported JS expression %r' % expr)
615         def extract_function(funcname):
# Pull the named function's source out of jscode and wrap it in a
# Python callable that interprets its statements one by one.
617                 r'function ' + re.escape(funcname) +
618                 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
620             argnames = func_m.group('args').split(',')
623                 local_vars = dict(zip(argnames, args))
624                 for stmt in func_m.group('code').split(';'):
625                     res = interpret_statement(stmt, local_vars)
629         initial_function = extract_function(funcname)
# The deciphering entry point takes the signature as its only argument.
630         return lambda s: initial_function([s])
632     def _parse_sig_swf(self, file_contents):
# Parse a Flash player binary: decompress the SWF, locate the ABC (AVM2
# bytecode) tag, read its constant pools and class/method tables, find
# the SignatureDecipher class, and build a tiny AVM2 interpreter for its
# methods.  Returns a function mapping an encrypted signature string to
# the deciphered one.
# NOTE(review): many lines of this method are elided in this excerpt.
633         if file_contents[1:3] != b'WS':
634             raise ExtractorError(
635                 u'Not an SWF file; header is %r' % file_contents[:3])
# 'C' in the first byte marks a zlib-compressed SWF body (after the
# 8-byte header).
636         if file_contents[:1] == b'C':
637             content = zlib.decompress(file_contents[8:])
639             raise NotImplementedError(u'Unsupported compression format %r' %
# --- SWF tag walker: yields (tag_code, tag_body) pairs ----------------
642         def extract_tags(content):
644             while pos < len(content):
645                 header16 = struct.unpack('<H', content[pos:pos+2])[0]
647                 tag_code = header16 >> 6
# A 6-bit length of 0x3f signals a long tag with a 32-bit length field.
648                 tag_len = header16 & 0x3f
650                     tag_len = struct.unpack('<I', content[pos:pos+4])[0]
652                 assert pos+tag_len <= len(content)
653                 yield (tag_code, content[pos:pos+tag_len])
657                     for tag_code, tag in extract_tags(content)
# Skip the ABC tag's flags and NUL-terminated name to reach the bytecode.
659         p = code_tag.index(b'\0', 4) + 1
660         code_reader = io.BytesIO(code_tag[p:])
662         # Parse ABC (AVM2 ByteCode)
# --- Primitive readers for the ABC variable-length encodings ----------
# read_int: LEB128-style variable-length unsigned int.
663         def read_int(reader=None):
671                 b = struct.unpack('<B', buf)[0]
672                 res = res | ((b & 0x7f) << shift)
# u30: variable-length int restricted to 30 bits.
678         def u30(reader=None):
679             res = read_int(reader)
680             assert res & 0xf0000000 == 0
# s32: variable-length int reinterpreted as signed 32-bit.
684         def s32(reader=None):
686             if v & 0x80000000 != 0:
687                 v = - ((v ^ 0xffffffff) + 1)
690         def read_string(reader=None):
694             resb = reader.read(slen)
695             assert len(resb) == slen
696             return resb.decode('utf-8')
698         def read_bytes(count, reader=None):
701             resb = reader.read(count)
702             assert len(resb) == count
705         def read_byte(reader=None):
706             resb = read_bytes(1, reader=reader)
707             res = struct.unpack('<B', resb)[0]
# --- Constant pools (note: pool index 0 is implicit, hence range(1, n))
710         # minor_version + major_version
715         for _c in range(1, int_count):
718         for _c in range(1, uint_count):
721         read_bytes((double_count-1) * 8)
723         constant_strings = [u'']
724         for _c in range(1, string_count):
726             constant_strings.append(s)
727         namespace_count = u30()
728         for _c in range(1, namespace_count):
732         for _c in range(1, ns_set_count):
734             for _c2 in range(count):
736         multiname_count = u30()
# Number of trailing u30 fields per multiname kind (consumed below).
745             0x0e: 2,  # MultinameA
746             0x1b: 1,  # MultinameL
747             0x1c: 1,  # MultinameLA
750         for _c in range(1, multiname_count):
752             assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
754                 u30()  # namespace_idx
756                 multinames.append(constant_strings[name_idx])
758                 multinames.append('[MULTINAME kind: %d]' % kind)
759                 for _c2 in range(MULTINAME_SIZES[kind]):
# --- Method signatures -------------------------------------------------
764         MethodInfo = collections.namedtuple(
766             ['NEED_ARGUMENTS', 'NEED_REST'])
768         for method_id in range(method_count):
771             for _ in range(param_count):
773             u30()  # name index (always 0 for youtube)
775             if flags & 0x08 != 0:
778                 for c in range(option_count):
781             if flags & 0x80 != 0:
782                 # Param names present
783                 for _ in range(param_count):
785             mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
786             method_infos.append(mi)
# --- Metadata (skipped, only consumed to advance the reader) ----------
789         metadata_count = u30()
790         for _c in range(metadata_count):
793             for _c2 in range(item_count):
# parse_traits_info: consume one trait record; record method-name ->
# method-index mappings for traits we care about.
797         def parse_traits_info():
798             trait_name_idx = u30()
799             kind_full = read_byte()
800             kind = kind_full & 0x0f
801             attrs = kind_full >> 4
803             if kind in [0x00, 0x06]:  # Slot or Const
805                 u30()  # type_name_idx
809             elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
812                 methods[multinames[trait_name_idx]] = method_idx
813             elif kind == 0x04:  # Class
816             elif kind == 0x05:  # Function
819                 methods[function_idx] = multinames[trait_name_idx]
821                 raise ExtractorError(u'Unsupported trait kind %d' % kind)
823             if attrs & 0x4 != 0:  # Metadata present
824                 metadata_count = u30()
825                 for _c3 in range(metadata_count):
826                     u30()  # metadata index
# --- Classes: find the id of the SignatureDecipher class --------------
831         TARGET_CLASSNAME = u'SignatureDecipher'
832         searched_idx = multinames.index(TARGET_CLASSNAME)
833         searched_class_id = None
835         for class_id in range(class_count):
837             if name_idx == searched_idx:
838                 # We found the class we're looking for!
839                 searched_class_id = class_id
840             u30()  # super_name idx
842             if flags & 0x08 != 0:  # Protected namespace is present
843                 u30()  # protected_ns_idx
845             for _c2 in range(intrf_count):
849             for _c2 in range(trait_count):
852         if searched_class_id is None:
853             raise ExtractorError(u'Target class %r not found' %
# Collect the traits of the target class only.
858         for class_id in range(class_count):
861             for _c2 in range(trait_count):
862                 trait_methods = parse_traits_info()
863                 if class_id == searched_class_id:
864                     method_names.update(trait_methods.items())
865                     method_idxs.update(dict(
867                         for name, idx in trait_methods.items()))
# --- Scripts (consumed only to keep the reader aligned) ---------------
871         for _c in range(script_count):
874             for _c2 in range(trait_count):
# --- Method bodies: keep the bytecode of the methods we resolved ------
878         method_body_count = u30()
879         Method = collections.namedtuple('Method', ['code', 'local_count'])
881         for _c in range(method_body_count):
885             u30()  # init_scope_depth
886             u30()  # max_scope_depth
888             code = read_bytes(code_length)
889             if method_idx in method_idxs:
890                 m = Method(code, local_count)
891                 methods[method_idxs[method_idx]] = m
892             exception_count = u30()
893             for _c2 in range(exception_count):
900             for _c2 in range(trait_count):
# Sanity checks: the whole tag must be consumed and every wanted method
# must have received a body.
903         assert p + code_reader.tell() == len(code_tag)
904         assert len(methods) == len(method_idxs)
906         method_pyfunctions = {}
# extract_function: compile one AVM2 method into a Python callable by
# interpreting its bytecode with an operand stack and registers.
908         def extract_function(func_name):
909             if func_name in method_pyfunctions:
910                 return method_pyfunctions[func_name]
911             if func_name not in methods:
912                 raise ExtractorError(u'Cannot find function %r' % func_name)
913             m = methods[func_name]
# Register 0 is the AVM2 'this'; a placeholder string stands in for it.
916                 registers = ['(this)'] + list(args) + [None] * m.local_count
918                 coder = io.BytesIO(m.code)
920                     opcode = struct.unpack('!B', coder.read(1))[0]
921                     if opcode == 36:  # pushbyte
922                         v = struct.unpack('!B', coder.read(1))[0]
924                     elif opcode == 44:  # pushstring
926                         stack.append(constant_strings[idx])
927                     elif opcode == 48:  # pushscope
928                         # We don't implement the scope register, so we'll just
929                         # ignore the popped value
931                     elif opcode == 70:  # callproperty
933                         mname = multinames[index]
934                         arg_count = u30(coder)
935                         args = list(reversed(
936                             [stack.pop() for _ in range(arg_count)]))
# Only the handful of String/Array methods the decipherer uses are
# implemented: split, slice, join, plus calls to sibling methods.
938                         if mname == u'split':
939                             assert len(args) == 1
940                             assert isinstance(args[0], compat_str)
941                             assert isinstance(obj, compat_str)
945                                 res = obj.split(args[0])
947                         elif mname == u'slice':
948                             assert len(args) == 1
949                             assert isinstance(args[0], int)
950                             assert isinstance(obj, list)
953                         elif mname == u'join':
954                             assert len(args) == 1
955                             assert isinstance(args[0], compat_str)
956                             assert isinstance(obj, list)
957                             res = args[0].join(obj)
959                         elif mname in method_pyfunctions:
960                             stack.append(method_pyfunctions[mname](args))
962                             raise NotImplementedError(
963                                 u'Unsupported property %r on %r'
965                     elif opcode == 72:  # returnvalue
968                     elif opcode == 79:  # callpropvoid
970                         mname = multinames[index]
971                         arg_count = u30(coder)
972                         args = list(reversed(
973                             [stack.pop() for _ in range(arg_count)]))
975                         if mname == u'reverse':
976                             assert isinstance(obj, list)
979                             raise NotImplementedError(
980                                 u'Unsupported (void) property %r on %r'
982                     elif opcode == 93:  # findpropstrict
984                         mname = multinames[index]
# Resolving a property strictly compiles the sibling method on demand.
985                         res = extract_function(mname)
987                     elif opcode == 97:  # setproperty
992                         assert isinstance(obj, list)
993                         assert isinstance(idx, int)
995                     elif opcode == 98:  # getlocal
997                         stack.append(registers[index])
998                     elif opcode == 99:  # setlocal
1001                         registers[index] = value
1002                     elif opcode == 102:  # getproperty
1004                         pname = multinames[index]
1005                         if pname == u'length':
1007                             assert isinstance(obj, list)
1008                             stack.append(len(obj))
1009                         else:  # Assume attribute access
1011                             assert isinstance(idx, int)
1013                             assert isinstance(obj, list)
1014                             stack.append(obj[idx])
1015                     elif opcode == 128:  # coerce
1017                     elif opcode == 133:  # coerce_s
1018                         assert isinstance(stack[-1], (type(None), compat_str))
1019                     elif opcode == 164:  # modulo
1020                         value2 = stack.pop()
1021                         value1 = stack.pop()
1022                         res = value1 % value2
1024                     elif opcode == 208:  # getlocal_0
1025                         stack.append(registers[0])
1026                     elif opcode == 209:  # getlocal_1
1027                         stack.append(registers[1])
1028                     elif opcode == 210:  # getlocal_2
1029                         stack.append(registers[2])
1030                     elif opcode == 211:  # getlocal_3
1031                         stack.append(registers[3])
1032                     elif opcode == 214:  # setlocal_2
1033                         registers[2] = stack.pop()
1034                     elif opcode == 215:  # setlocal_3
1035                         registers[3] = stack.pop()
1037                         raise NotImplementedError(
1038                             u'Unsupported opcode %d' % opcode)
1040             method_pyfunctions[func_name] = resfunc
# 'decipher' is the well-known entry point of SignatureDecipher; it
# takes the signature string as its single argument.
1043         initial_function = extract_function(u'decipher')
1044         return lambda s: initial_function([s])
1046     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1047         """Turn the encrypted s field into a working signature"""
# Preferred path: derive the algorithm from the actual player (cached per
# player URL); on any failure fall back to the static, hand-maintained
# length-keyed algorithms below.
# NOTE(review): several lines of this method are elided in this excerpt.
1049         if player_url is not None:
1051                 if player_url not in self._player_cache:
1052                     func = self._extract_signature_function(
1053                         video_id, player_url, len(s)
1055                     self._player_cache[player_url] = func
1056                 func = self._player_cache[player_url]
1057                 if self._downloader.params.get('youtube_print_sig_code'):
1058                     self._print_sig_code(func, len(s))
1061                 tb = traceback.format_exc()
1062                 self._downloader.report_warning(
1063                     u'Automatic signature extraction failed: ' + tb)
1065         self._downloader.report_warning(
1066             u'Warning: Falling back to static signature algorithm')
1067         return self._static_decrypt_signature(
1068             s, video_id, player_url, age_gate)
1070     def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
# Hand-maintained permutations selected by len(s); raises when no
# algorithm is known for the given length.
# NOTE(review): the selecting if/elif len(s) == N: lines are elided in
# this excerpt, so each return below belongs to one length bucket.
1072             # The videos with age protection use another player, so the
1073             # algorithms can be different.
1075                 return s[2:63] + s[82] + s[64:82] + s[63]
1078             return s[86:29:-1] + s[88] + s[28:5:-1]
1080             return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1082             return s[84:27:-1] + s[86] + s[26:5:-1]
1084             return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1086             return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1088             return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1090             return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1092             return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1094             return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1096             return s[81:36:-1] + s[0] + s[35:2:-1]
1098             return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1100             return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1102             return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1104             return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1106             return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1109             raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1111     def _decrypt_signature_age_gate(self, s):
1112         # The videos with age protection use another player, so the algorithms
1115             return s[2:63] + s[82] + s[64:82] + s[63]
1117             # Fallback to the other algorithms
# BUG(review): _decrypt_signature is declared as
# _decrypt_signature(self, s, video_id, player_url, age_gate=False), so
# this call is missing the required video_id and player_url arguments
# and would raise TypeError if this fallback branch is ever reached.
1118             return self._decrypt_signature(s)
1120     def _get_available_subtitles(self, video_id):
# Query the timedtext listing service and return a dict mapping subtitle
# language code -> timedtext URL; returns an empty mapping (with a
# warning) when the video has no subtitles.
# NOTE(review): several lines of this method are elided in this excerpt.
1122             sub_list = self._download_webpage(
1123                 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1124                 video_id, note=False)
1125         except ExtractorError as err:
1126             self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each <track> element carries a display name and a lang_code.
1128         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1133             params = compat_urllib_parse.urlencode({
1136                 'fmt': self._downloader.params.get('subtitlesformat'),
1138             url = u'http://www.youtube.com/api/timedtext?' + params
1139             sub_lang_list[lang] = url
1140         if not sub_lang_list:
1141             self._downloader.report_warning(u'video doesn\'t have subtitles')
1143         return sub_lang_list
1145     def _get_available_automatic_caption(self, video_id, webpage):
1146         """We need the webpage for getting the captions url, pass it as an
1147            argument to speed up the process."""
# Extract the ASR (automatic speech recognition) caption track from the
# embedded player config and build translated-caption URLs for every
# target language.  NOTE(review): some lines are elided in this excerpt.
1148         sub_format = self._downloader.params.get('subtitlesformat')
1149         self.to_screen(u'%s: Looking for automatic captions' % video_id)
1150         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1151         err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1153             self._downloader.report_warning(err_msg)
1155         player_config = json.loads(mobj.group(1))
1157             args = player_config[u'args']
1158             caption_url = args[u'ttsurl']
1159             timestamp = args[u'timestamp']
1160             # We get the available subtitles
1161             list_params = compat_urllib_parse.urlencode({
1166             list_url = caption_url + '&' + list_params
1167             list_page = self._download_webpage(list_url, video_id)
1168             caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
# Only kind="asr" tracks are automatic captions; anything else means
# the video has no automatic captions.
1169             original_lang_node = caption_list.find('track')
1170             if original_lang_node.attrib.get('kind') != 'asr' :
1171                 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1173             original_lang = original_lang_node.attrib['lang_code']
1176             for lang_node in caption_list.findall('target'):
1177                 sub_lang = lang_node.attrib['lang_code']
1178                 params = compat_urllib_parse.urlencode({
1179                     'lang': original_lang,
1185                 sub_lang_list[sub_lang] = caption_url + '&' + params
1186             return sub_lang_list
1187         # An extractor error can be raised by the download process if there are
1188         # no automatic captions but there are subtitles
1189         except (KeyError, ExtractorError):
1190             self._downloader.report_warning(err_msg)
1193     def _print_formats(self, formats):
# Print one line per itag with its extension, dimensions and any
# special annotation (for --list-formats).
# NOTE(review): the loop header over formats is elided in this excerpt.
1194         print('Available formats:')
1196             print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1197                                         self._video_dimensions.get(x, '???'),
1198                                         ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1200     def _extract_id(self, url):
# Extract the 11-character video ID (capture group 2 of _VALID_URL,
# which must be matched with re.VERBOSE).
# NOTE(review): the None check line and the return are elided in this excerpt.
1201         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1203             raise ExtractorError(u'Invalid URL: %s' % url)
1204         video_id = mobj.group(2)
1207     def _get_video_url_list(self, url_map):
1209         Transform a dictionary in the format {itag:url} to a list of (itag, url)
1210         with the requested formats.
# NOTE(review): some lines of this method are elided in this excerpt.
1212         req_format = self._downloader.params.get('format', None)
1213         format_limit = self._downloader.params.get('format_limit', None)
# Quality ranking depends on whether free (WebM) containers are preferred.
1214         available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1215         if format_limit is not None and format_limit in available_formats:
1216             format_list = available_formats[available_formats.index(format_limit):]
1218             format_list = available_formats
# Restrict to formats that are actually offered for this video.
1219         existing_formats = [x for x in format_list if x in url_map]
1220         if len(existing_formats) == 0:
1221             raise ExtractorError(u'no known formats available for video')
1222         if self._downloader.params.get('listformats', None):
1223             self._print_formats(existing_formats)
1225         if req_format is None or req_format == 'best':
1226             video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1227         elif req_format == 'worst':
1228             video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1229         elif req_format in ('-1', 'all'):
1230             video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1232             # Specific formats. We pick the first in a slash-delimited sequence.
1233             # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1234             # available in the specified format. For example,
1235             # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1236             # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1237             # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1238             req_formats = req_format.split('/')
1239             video_url_list = None
1240             for rf in req_formats:
1242                     video_url_list = [(rf, url_map[rf])]
# A container name expands to its itag list, best quality first.
1244                 if rf in self._video_formats_map:
1245                     for srf in self._video_formats_map[rf]:
1247                             video_url_list = [(srf, url_map[srf])]
1252             if video_url_list is None:
1253                 raise ExtractorError(u'requested format not available')
1254         return video_url_list
1256     def _extract_from_m3u8(self, manifest_url, video_id):
# Download an HLS master manifest and return a dict mapping itag -> 
# stream URL (the itag is embedded in each variant URL's path).
# NOTE(review): some lines of this method are elided in this excerpt.
1258         def _get_urls(_manifest):
# Manifest lines starting with '#' are directives; the rest are URLs.
1259             lines = _manifest.split('\n')
1260             urls = filter(lambda l: l and not l.startswith('#'),
1263         manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1264         formats_urls = _get_urls(manifest)
1265         for format_url in formats_urls:
1266             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1267             url_map[itag] = format_url
1270 def _real_extract(self, url):
1271 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1272 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1274 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1275 mobj = re.search(self._NEXT_URL_RE, url)
1277 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1278 video_id = self._extract_id(url)
1281 self.report_video_webpage_download(video_id)
1282 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1283 request = compat_urllib_request.Request(url)
1285 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1286 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1287 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1289 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1291 # Attempt to extract SWF player URL
1292 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1293 if mobj is not None:
1294 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1299 self.report_video_info_webpage_download(video_id)
1300 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1301 self.report_age_confirmation()
1303 # We simulate the access to the video from www.youtube.com/v/{video_id}
1304 # this can be viewed without login into Youtube
1305 data = compat_urllib_parse.urlencode({'video_id': video_id,
1309 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1313 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1314 video_info_webpage = self._download_webpage(video_info_url, video_id,
1316 errnote='unable to download video info webpage')
1317 video_info = compat_parse_qs(video_info_webpage)
1320 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1321 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1322 % (video_id, el_type))
1323 video_info_webpage = self._download_webpage(video_info_url, video_id,
1325 errnote='unable to download video info webpage')
1326 video_info = compat_parse_qs(video_info_webpage)
1327 if 'token' in video_info:
1329 if 'token' not in video_info:
1330 if 'reason' in video_info:
1331 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1333 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1335 # Check for "rental" videos
1336 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1337 raise ExtractorError(u'"rental" videos not supported')
1339 # Start extracting information
1340 self.report_information_extraction(video_id)
1343 if 'author' not in video_info:
1344 raise ExtractorError(u'Unable to extract uploader name')
1345 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1348 video_uploader_id = None
1349 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1350 if mobj is not None:
1351 video_uploader_id = mobj.group(1)
1353 self._downloader.report_warning(u'unable to extract uploader nickname')
1356 if 'title' not in video_info:
1357 raise ExtractorError(u'Unable to extract video title')
1358 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1361 # We try first to get a high quality image:
1362 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1363 video_webpage, re.DOTALL)
1364 if m_thumb is not None:
1365 video_thumbnail = m_thumb.group(1)
1366 elif 'thumbnail_url' not in video_info:
1367 self._downloader.report_warning(u'unable to extract video thumbnail')
1368 video_thumbnail = ''
1369 else: # don't panic if we can't find it
1370 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1374 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1375 if mobj is not None:
1376 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1377 upload_date = unified_strdate(upload_date)
1380 video_description = get_element_by_id("eow-description", video_webpage)
1381 if video_description:
1382 video_description = clean_html(video_description)
1384 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1386 video_description = unescapeHTML(fd_mobj.group(1))
1388 video_description = u''
1391 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1393 if self._downloader.params.get('listsubtitles', False):
1394 self._list_available_subtitles(video_id, video_webpage)
1397 if 'length_seconds' not in video_info:
1398 self._downloader.report_warning(u'unable to extract video duration')
1401 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1403 # Decide which formats to download
1406 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1408 raise ValueError('Could not find vevo ID')
1409 info = json.loads(mobj.group(1))
1411 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1412 # this signatures are encrypted
1413 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1415 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1416 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1417 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1419 if 'url_encoded_fmt_stream_map' in video_info:
1420 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1422 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1423 elif 'adaptive_fmts' in video_info:
1424 if 'url_encoded_fmt_stream_map' in video_info:
1425 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1427 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1431 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1432 self.report_rtmp_download()
1433 video_url_list = [(None, video_info['conn'][0])]
1434 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1435 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1436 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1438 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1439 url_data = compat_parse_qs(url_data_str)
1440 if 'itag' in url_data and 'url' in url_data:
1441 url = url_data['url'][0]
1442 if 'sig' in url_data:
1443 url += '&signature=' + url_data['sig'][0]
1444 elif 's' in url_data:
1445 encrypted_sig = url_data['s'][0]
1446 if self._downloader.params.get('verbose'):
1448 player_version = self._search_regex(
1450 player_url if player_url else None,
1451 'flash player', fatal=False)
1452 player_desc = 'flash player %s' % player_version
1454 player_version = self._search_regex(
1455 r'html5player-(.+?)\.js', video_webpage,
1456 'html5 player', fatal=False)
1457 player_desc = u'html5 player %s' % player_version
1459 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1460 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1461 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1464 jsplayer_url_json = self._search_regex(
1465 r'"assets":.+?"js":\s*("[^"]+")',
1466 video_webpage, u'JS player URL')
1467 player_url = json.loads(jsplayer_url_json)
1469 signature = self._decrypt_signature(
1470 encrypted_sig, video_id, player_url, age_gate)
1471 url += '&signature=' + signature
1472 if 'ratebypass' not in url:
1473 url += '&ratebypass=yes'
1474 url_map[url_data['itag'][0]] = url
1475 video_url_list = self._get_video_url_list(url_map)
1476 if not video_url_list:
1478 elif video_info.get('hlsvp'):
1479 manifest_url = video_info['hlsvp'][0]
1480 url_map = self._extract_from_m3u8(manifest_url, video_id)
1481 video_url_list = self._get_video_url_list(url_map)
1482 if not video_url_list:
1486 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1489 for format_param, video_real_url in video_url_list:
1491 video_extension = self._video_extensions.get(format_param, 'flv')
1493 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1494 self._video_dimensions.get(format_param, '???'),
1495 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1499 'url': video_real_url,
1500 'uploader': video_uploader,
1501 'uploader_id': video_uploader_id,
1502 'upload_date': upload_date,
1503 'title': video_title,
1504 'ext': video_extension,
1505 'format': video_format,
1506 'thumbnail': video_thumbnail,
1507 'description': video_description,
1508 'player_url': player_url,
1509 'subtitles': video_subtitles,
1510 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    # Extracts every video of a playlist through the YouTube GData feed API.
    # NOTE(review): several lines of this class are not visible in this view
    # (e.g. the closing of the _VALID_URL raw string, guard `if`s, the
    # `videos` accumulator and loop `break`s); comments describe only what
    # is shown here.
    IE_DESC = u'YouTube.com playlists'
    # Accepts both query-string playlist URLs (?p=/&a=/&list=) and bare
    # PL/EC/UU/FL-prefixed playlist ids; written for re.VERBOSE matching.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData endpoint: playlist id, page size, 1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        # Receives a URL and returns True if suitable for this IE.
        # _VALID_URL uses verbose whitespace, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Return a playlist result with one url_result entry per video.
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the guarding `if mobj is None:` is not visible here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # Page through the feed; GData start indexes are 1-based.
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The API refuses start indexes of 1000 and above.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            # NOTE(review): the `try:` header for this parse is not visible here.
            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
            # Collect (position, watch-URL) pairs so ordering can be restored.
            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
        # Sort by playlist position, then keep only the URLs.
        videos = [v[1] for v in sorted(videos)]
        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    # Lists every video of a channel by scraping the channel pages and the
    # c4_browse_ajax JSON continuation endpoint.
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First (HTML) page of the channel's video list.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker that signals more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # JSON endpoint used for the subsequent pages.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect unique video ids from /watch?v= links on a page.
        # NOTE(review): the `ids_in_page = []` initialisation and the final
        # `return` are not visible in this view.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the guarding `if mobj is None:` is not visible here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)
        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)
        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                # The continuation endpoint answers with JSON wrapping the HTML.
                page = json.loads(page)
                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)
                # Stop once the widget no longer offers a "load more" control.
                # NOTE(review): the loop `break` is not visible in this view.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
        # Hand each video off to the main Youtube extractor.
        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    # Lists all uploads of a user via the GData API, 50 ids per request.
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The API caps each response, so uploads are fetched page by page.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match their URLs
        # as well, so defer to every other *IE class first.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the guarding `if mobj is None:` is not visible here.
        raise ExtractorError(u'Invalid URL: %s' % url)
        username = mobj.group(1)
        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        for pagenum in itertools.count(0):
            # GData start indexes are 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
            # NOTE(review): the `try:` header for this parse is not visible here.
            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
            # Extract video identifiers
            # NOTE(review): the `ids_in_page = []` initialisation is not
            # visible in this view.
            for entry in response['feed']['entry']:
                # The video id is the last path component of the entry id URL.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)
            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again. NOTE(review): the `break` is not visible in this view.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    # Implements the "ytsearchN:" keyword on top of the jsonc GData API.
    IE_DESC = u'YouTube.com searches'
    # Endpoint takes the quoted query and a 1-based start index, 50 per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # NOTE(review): the initialisation of video_ids, pagenum and limit
        # is not visible in this view.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): the `try:` header for this request is not visible here.
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']
            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')
            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids
            # Never fetch more than the API reports as available.
            limit = min(n, api_response['totalItems'])
        # The last page may overshoot; trim to exactly n ids.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a YouTube show page into one playlist result per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is exposed on the page as its own
        # playlist link; collect the relative playlist paths.
        season_paths = [m.group(1)
                        for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        # Delegate each season to the playlist extractor.
        return [self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist')
                for path in season_paths]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # All feeds require a logged-in session.
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Build the feed_ajax URL, leaving a '%s' placeholder for paging.
        # NOTE(review): the @property decorator is not visible in this view.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        # NOTE(review): this return belongs to an IE_NAME property whose
        # `def` line is not visible in this view.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        """Perform pre-extraction setup (body not visible in this view)."""

    def _real_extract(self, url):
        # Page through the feed, collecting watch-URL entries until the
        # server reports no further paging value.
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            # Pull video ids from the embedded HTML fragment, de-duplicated
            # while preserving first-seen order.
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # NOTE(review): the loop `break` is not visible in this view.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's Watch Later list."""
    # Watch Later is a per-user feed, so the personal feed_ajax action is used.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are only visible when logged in.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of its backing playlist;
        # scrape it and delegate the actual extraction to YoutubePlaylistIE.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(favourites_playlist_id, 'YoutubePlaylist')