14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
37 class YoutubeBaseInfoExtractor(InfoExtractor):
38 """Provide base functions for Youtube extractors"""
39 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
40 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
41 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
42 _NETRC_MACHINE = 'youtube'
43 # If True it will raise an error if no login info is provided
44 _LOGIN_REQUIRED = False
46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
50 def _set_language(self):
51 request = compat_urllib_request.Request(self._LANG_URL)
54 compat_urllib_request.urlopen(request).read()
55 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
61 (username, password) = self._get_login_info()
62 # No authentication to be performed
64 if self._LOGIN_REQUIRED:
65 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68 request = compat_urllib_request.Request(self._LOGIN_URL)
70 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
77 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
80 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
86 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
90 u'PersistentCookie': u'yes',
92 u'bgresponse': u'js_disabled',
93 u'checkConnection': u'',
94 u'checkedDomains': u'youtube',
100 u'signIn': u'Sign in',
102 u'service': u'youtube',
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
113 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
114 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115 self._downloader.report_warning(u'unable to log in: bad username or password')
117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
122 def _confirm_age(self):
125 'action_confirm': 'Confirm',
127 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
129 self.report_age_confirmation()
130 compat_urllib_request.urlopen(request).read().decode('utf-8')
131 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
135 def _real_initialize(self):
136 if self._downloader is None:
138 if not self._set_language():
140 if not self._login():
145 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
146 IE_DESC = u'YouTube.com'
149 (?:https?://)? # http(s):// (optional)
150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
163 |youtu\.be/ # just youtu.be/xxxx
165 )? # all until now is optional -> you can pass the naked ID
166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
167 (?(1).+)? # if we found the ID, everything can follow
169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
170 # Listed in order of quality
171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
172 # Apple HTTP Live Streaming
173 '96', '95', '94', '93', '92', '132', '151',
175 '85', '84', '102', '83', '101', '82', '100',
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
180 '141', '172', '140', '171', '139',
182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
183 # Apple HTTP Live Streaming
184 '96', '95', '94', '93', '92', '132', '151',
186 '85', '102', '84', '101', '83', '100', '82',
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
191 '172', '141', '171', '140', '139',
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
199 _video_extensions = {
221 # Apple HTTP Live Streaming
253 _video_dimensions = {
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
364 u"description": u"md5:5b292926389560516e384ac437c0ec07",
365 u"uploader": u"Icona Pop",
366 u"uploader_id": u"IconaPop"
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
385 def suitable(cls, url):
386 """Receives a URL and returns True if suitable for this IE."""
387 if YoutubePlaylistIE.suitable(url): return False
388 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def __init__(self, *args, **kwargs):
        # Delegate all construction to the base extractor, then set up a
        # per-player cache mapping player URL -> signature-decryption
        # function so each player script is only analyzed once.
        super(YoutubeIE, self).__init__(*args, **kwargs)
        self._player_cache = {}
394 def report_video_webpage_download(self, video_id):
395 """Report attempt to download video webpage."""
396 self.to_screen(u'%s: Downloading video webpage' % video_id)
398 def report_video_info_webpage_download(self, video_id):
399 """Report attempt to download video info webpage."""
400 self.to_screen(u'%s: Downloading video info webpage' % video_id)
402 def report_information_extraction(self, video_id):
403 """Report attempt to extract video information."""
404 self.to_screen(u'%s: Extracting video information' % video_id)
406 def report_unavailable_format(self, video_id, format):
407 """Report extracted video URL."""
408 self.to_screen(u'%s: Format %s not available' % (video_id, format))
410 def report_rtmp_download(self):
411 """Indicate the download will use the RTMP protocol."""
412 self.to_screen(u'RTMP download detected')
414 def _extract_signature_function(self, video_id, player_url, slen):
415 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
417 player_type = id_m.group('ext')
418 player_id = id_m.group('id')
420 # Read from filesystem cache
421 func_id = '%s_%s_%d' % (player_type, player_id, slen)
422 assert os.path.basename(func_id) == func_id
423 cache_dir = self._downloader.params.get('cachedir',
424 u'~/.youtube-dl/cache')
426 cache_enabled = cache_dir is not None
428 cache_fn = os.path.join(os.path.expanduser(cache_dir),
432 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
433 cache_spec = json.load(cachef)
434 return lambda s: u''.join(s[i] for i in cache_spec)
436 pass # No cache available
438 if player_type == 'js':
439 code = self._download_webpage(
440 player_url, video_id,
441 note=u'Downloading %s player %s' % (player_type, player_id),
442 errnote=u'Download of %s failed' % player_url)
443 res = self._parse_sig_js(code)
444 elif player_type == 'swf':
445 urlh = self._request_webpage(
446 player_url, video_id,
447 note=u'Downloading %s player %s' % (player_type, player_id),
448 errnote=u'Download of %s failed' % player_url)
450 res = self._parse_sig_swf(code)
452 assert False, 'Invalid player type %r' % player_type
456 test_string = u''.join(map(compat_chr, range(slen)))
457 cache_res = res(test_string)
458 cache_spec = [ord(c) for c in cache_res]
460 os.makedirs(os.path.dirname(cache_fn))
461 except OSError as ose:
462 if ose.errno != errno.EEXIST:
464 write_json_file(cache_spec, cache_fn)
466 tb = traceback.format_exc()
467 self._downloader.report_warning(
468 u'Writing cache to %r failed: %s' % (cache_fn, tb))
472 def _print_sig_code(self, func, slen):
473 def gen_sig_code(idxs):
474 def _genslice(start, end, step):
475 starts = u'' if start == 0 else str(start)
476 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
477 steps = u'' if step == 1 else (u':%d' % step)
478 return u's[%s%s%s]' % (starts, ends, steps)
481 start = '(Never used)' # Quelch pyflakes warnings - start will be
482 # set as soon as step is set
483 for i, prev in zip(idxs[1:], idxs[:-1]):
487 yield _genslice(start, prev, step)
490 if i - prev in [-1, 1]:
495 yield u's[%d]' % prev
499 yield _genslice(start, i, step)
501 test_string = u''.join(map(compat_chr, range(slen)))
502 cache_res = func(test_string)
503 cache_spec = [ord(c) for c in cache_res]
504 expr_code = u' + '.join(gen_sig_code(cache_spec))
505 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
506 self.to_screen(u'Extracted signature function:\n' + code)
508 def _parse_sig_js(self, jscode):
509 funcname = self._search_regex(
510 r'signature=([a-zA-Z]+)', jscode,
511 u'Initial JS player signature function name')
516 return string.lowercase.index(varname)
518 def interpret_statement(stmt, local_vars, allow_recursion=20):
519 if allow_recursion < 0:
520 raise ExtractorError(u'Recursion limit reached')
522 if stmt.startswith(u'var '):
523 stmt = stmt[len(u'var '):]
524 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
525 r'=(?P<expr>.*)$', stmt)
527 if ass_m.groupdict().get('index'):
529 lvar = local_vars[ass_m.group('out')]
530 idx = interpret_expression(ass_m.group('index'),
531 local_vars, allow_recursion)
532 assert isinstance(idx, int)
535 expr = ass_m.group('expr')
538 local_vars[ass_m.group('out')] = val
540 expr = ass_m.group('expr')
541 elif stmt.startswith(u'return '):
543 expr = stmt[len(u'return '):]
545 raise ExtractorError(
546 u'Cannot determine left side of statement in %r' % stmt)
548 v = interpret_expression(expr, local_vars, allow_recursion)
551 def interpret_expression(expr, local_vars, allow_recursion):
556 return local_vars[expr]
558 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
560 member = m.group('member')
561 val = local_vars[m.group('in')]
562 if member == 'split("")':
564 if member == 'join("")':
566 if member == 'length':
568 if member == 'reverse()':
570 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
572 idx = interpret_expression(
573 slice_m.group('idx'), local_vars, allow_recursion-1)
577 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
579 val = local_vars[m.group('in')]
580 idx = interpret_expression(m.group('idx'), local_vars,
584 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
586 a = interpret_expression(m.group('a'),
587 local_vars, allow_recursion)
588 b = interpret_expression(m.group('b'),
589 local_vars, allow_recursion)
593 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
595 fname = m.group('func')
596 if fname not in functions:
597 functions[fname] = extract_function(fname)
598 argvals = [int(v) if v.isdigit() else local_vars[v]
599 for v in m.group('args').split(',')]
600 return functions[fname](argvals)
601 raise ExtractorError(u'Unsupported JS expression %r' % expr)
603 def extract_function(funcname):
605 r'function ' + re.escape(funcname) +
606 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
608 argnames = func_m.group('args').split(',')
611 local_vars = dict(zip(argnames, args))
612 for stmt in func_m.group('code').split(';'):
613 res = interpret_statement(stmt, local_vars)
617 initial_function = extract_function(funcname)
618 return lambda s: initial_function([s])
620 def _parse_sig_swf(self, file_contents):
621 if file_contents[1:3] != b'WS':
622 raise ExtractorError(
623 u'Not an SWF file; header is %r' % file_contents[:3])
624 if file_contents[:1] == b'C':
625 content = zlib.decompress(file_contents[8:])
627 raise NotImplementedError(u'Unsupported compression format %r' %
630 def extract_tags(content):
632 while pos < len(content):
633 header16 = struct.unpack('<H', content[pos:pos+2])[0]
635 tag_code = header16 >> 6
636 tag_len = header16 & 0x3f
638 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
640 assert pos+tag_len <= len(content)
641 yield (tag_code, content[pos:pos+tag_len])
645 for tag_code, tag in extract_tags(content)
647 p = code_tag.index(b'\0', 4) + 1
648 code_reader = io.BytesIO(code_tag[p:])
650 # Parse ABC (AVM2 ByteCode)
651 def read_int(reader=None):
659 b = struct.unpack('<B', buf)[0]
660 res = res | ((b & 0x7f) << shift)
666 def u30(reader=None):
667 res = read_int(reader)
668 assert res & 0xf0000000 == 0
672 def s32(reader=None):
674 if v & 0x80000000 != 0:
675 v = - ((v ^ 0xffffffff) + 1)
678 def read_string(reader=None):
682 resb = reader.read(slen)
683 assert len(resb) == slen
684 return resb.decode('utf-8')
686 def read_bytes(count, reader=None):
689 resb = reader.read(count)
690 assert len(resb) == count
693 def read_byte(reader=None):
694 resb = read_bytes(1, reader=reader)
695 res = struct.unpack('<B', resb)[0]
698 # minor_version + major_version
703 for _c in range(1, int_count):
706 for _c in range(1, uint_count):
709 read_bytes((double_count-1) * 8)
711 constant_strings = [u'']
712 for _c in range(1, string_count):
714 constant_strings.append(s)
715 namespace_count = u30()
716 for _c in range(1, namespace_count):
720 for _c in range(1, ns_set_count):
722 for _c2 in range(count):
724 multiname_count = u30()
733 0x0e: 2, # MultinameA
734 0x1b: 1, # MultinameL
735 0x1c: 1, # MultinameLA
738 for _c in range(1, multiname_count):
740 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
742 u30() # namespace_idx
744 multinames.append(constant_strings[name_idx])
746 multinames.append('[MULTINAME kind: %d]' % kind)
747 for _c2 in range(MULTINAME_SIZES[kind]):
752 MethodInfo = collections.namedtuple(
754 ['NEED_ARGUMENTS', 'NEED_REST'])
756 for method_id in range(method_count):
759 for _ in range(param_count):
761 u30() # name index (always 0 for youtube)
763 if flags & 0x08 != 0:
766 for c in range(option_count):
769 if flags & 0x80 != 0:
770 # Param names present
771 for _ in range(param_count):
773 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
774 method_infos.append(mi)
777 metadata_count = u30()
778 for _c in range(metadata_count):
781 for _c2 in range(item_count):
785 def parse_traits_info():
786 trait_name_idx = u30()
787 kind_full = read_byte()
788 kind = kind_full & 0x0f
789 attrs = kind_full >> 4
791 if kind in [0x00, 0x06]: # Slot or Const
793 u30() # type_name_idx
797 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
800 methods[multinames[trait_name_idx]] = method_idx
801 elif kind == 0x04: # Class
804 elif kind == 0x05: # Function
807 methods[function_idx] = multinames[trait_name_idx]
809 raise ExtractorError(u'Unsupported trait kind %d' % kind)
811 if attrs & 0x4 != 0: # Metadata present
812 metadata_count = u30()
813 for _c3 in range(metadata_count):
814 u30() # metadata index
819 TARGET_CLASSNAME = u'SignatureDecipher'
820 searched_idx = multinames.index(TARGET_CLASSNAME)
821 searched_class_id = None
823 for class_id in range(class_count):
825 if name_idx == searched_idx:
826 # We found the class we're looking for!
827 searched_class_id = class_id
828 u30() # super_name idx
830 if flags & 0x08 != 0: # Protected namespace is present
831 u30() # protected_ns_idx
833 for _c2 in range(intrf_count):
837 for _c2 in range(trait_count):
840 if searched_class_id is None:
841 raise ExtractorError(u'Target class %r not found' %
846 for class_id in range(class_count):
849 for _c2 in range(trait_count):
850 trait_methods = parse_traits_info()
851 if class_id == searched_class_id:
852 method_names.update(trait_methods.items())
853 method_idxs.update(dict(
855 for name, idx in trait_methods.items()))
859 for _c in range(script_count):
862 for _c2 in range(trait_count):
866 method_body_count = u30()
867 Method = collections.namedtuple('Method', ['code', 'local_count'])
869 for _c in range(method_body_count):
873 u30() # init_scope_depth
874 u30() # max_scope_depth
876 code = read_bytes(code_length)
877 if method_idx in method_idxs:
878 m = Method(code, local_count)
879 methods[method_idxs[method_idx]] = m
880 exception_count = u30()
881 for _c2 in range(exception_count):
888 for _c2 in range(trait_count):
891 assert p + code_reader.tell() == len(code_tag)
892 assert len(methods) == len(method_idxs)
894 method_pyfunctions = {}
896 def extract_function(func_name):
897 if func_name in method_pyfunctions:
898 return method_pyfunctions[func_name]
899 if func_name not in methods:
900 raise ExtractorError(u'Cannot find function %r' % func_name)
901 m = methods[func_name]
904 registers = ['(this)'] + list(args) + [None] * m.local_count
906 coder = io.BytesIO(m.code)
908 opcode = struct.unpack('!B', coder.read(1))[0]
909 if opcode == 36: # pushbyte
910 v = struct.unpack('!B', coder.read(1))[0]
912 elif opcode == 44: # pushstring
914 stack.append(constant_strings[idx])
915 elif opcode == 48: # pushscope
916 # We don't implement the scope register, so we'll just
917 # ignore the popped value
919 elif opcode == 70: # callproperty
921 mname = multinames[index]
922 arg_count = u30(coder)
923 args = list(reversed(
924 [stack.pop() for _ in range(arg_count)]))
926 if mname == u'split':
927 assert len(args) == 1
928 assert isinstance(args[0], compat_str)
929 assert isinstance(obj, compat_str)
933 res = obj.split(args[0])
935 elif mname == u'slice':
936 assert len(args) == 1
937 assert isinstance(args[0], int)
938 assert isinstance(obj, list)
941 elif mname == u'join':
942 assert len(args) == 1
943 assert isinstance(args[0], compat_str)
944 assert isinstance(obj, list)
945 res = args[0].join(obj)
947 elif mname in method_pyfunctions:
948 stack.append(method_pyfunctions[mname](args))
950 raise NotImplementedError(
951 u'Unsupported property %r on %r'
953 elif opcode == 72: # returnvalue
956 elif opcode == 79: # callpropvoid
958 mname = multinames[index]
959 arg_count = u30(coder)
960 args = list(reversed(
961 [stack.pop() for _ in range(arg_count)]))
963 if mname == u'reverse':
964 assert isinstance(obj, list)
967 raise NotImplementedError(
968 u'Unsupported (void) property %r on %r'
970 elif opcode == 93: # findpropstrict
972 mname = multinames[index]
973 res = extract_function(mname)
975 elif opcode == 97: # setproperty
980 assert isinstance(obj, list)
981 assert isinstance(idx, int)
983 elif opcode == 98: # getlocal
985 stack.append(registers[index])
986 elif opcode == 99: # setlocal
989 registers[index] = value
990 elif opcode == 102: # getproperty
992 pname = multinames[index]
993 if pname == u'length':
995 assert isinstance(obj, list)
996 stack.append(len(obj))
997 else: # Assume attribute access
999 assert isinstance(idx, int)
1001 assert isinstance(obj, list)
1002 stack.append(obj[idx])
1003 elif opcode == 128: # coerce
1005 elif opcode == 133: # coerce_s
1006 assert isinstance(stack[-1], (type(None), compat_str))
1007 elif opcode == 164: # modulo
1008 value2 = stack.pop()
1009 value1 = stack.pop()
1010 res = value1 % value2
1012 elif opcode == 208: # getlocal_0
1013 stack.append(registers[0])
1014 elif opcode == 209: # getlocal_1
1015 stack.append(registers[1])
1016 elif opcode == 210: # getlocal_2
1017 stack.append(registers[2])
1018 elif opcode == 211: # getlocal_3
1019 stack.append(registers[3])
1020 elif opcode == 214: # setlocal_2
1021 registers[2] = stack.pop()
1022 elif opcode == 215: # setlocal_3
1023 registers[3] = stack.pop()
1025 raise NotImplementedError(
1026 u'Unsupported opcode %d' % opcode)
1028 method_pyfunctions[func_name] = resfunc
1031 initial_function = extract_function(u'decipher')
1032 return lambda s: initial_function([s])
1034 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1035 """Turn the encrypted s field into a working signature"""
1037 if player_url is not None:
1039 if player_url not in self._player_cache:
1040 func = self._extract_signature_function(
1041 video_id, player_url, len(s)
1043 self._player_cache[player_url] = func
1044 func = self._player_cache[player_url]
1045 if self._downloader.params.get('youtube_print_sig_code'):
1046 self._print_sig_code(func, len(s))
1049 tb = traceback.format_exc()
1050 self._downloader.report_warning(
1051 u'Automatic signature extraction failed: ' + tb)
1053 self._downloader.report_warning(
1054 u'Warning: Falling back to static signature algorithm')
1056 return self._static_decrypt_signature(
1057 s, video_id, player_url, age_gate)
1059 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1061 # The videos with age protection use another player, so the
1062 # algorithms can be different.
1064 return s[2:63] + s[82] + s[64:82] + s[63]
1067 return s[86:29:-1] + s[88] + s[28:5:-1]
1069 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1071 return s[84:27:-1] + s[86] + s[26:5:-1]
1073 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1075 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1077 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1079 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1081 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1083 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1085 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1087 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1089 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1091 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1093 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1095 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1098 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1100 def _get_available_subtitles(self, video_id):
1102 sub_list = self._download_webpage(
1103 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1104 video_id, note=False)
1105 except ExtractorError as err:
1106 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1108 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1113 params = compat_urllib_parse.urlencode({
1116 'fmt': self._downloader.params.get('subtitlesformat'),
1118 url = u'http://www.youtube.com/api/timedtext?' + params
1119 sub_lang_list[lang] = url
1120 if not sub_lang_list:
1121 self._downloader.report_warning(u'video doesn\'t have subtitles')
1123 return sub_lang_list
1125 def _get_available_automatic_caption(self, video_id, webpage):
1126 """We need the webpage for getting the captions url, pass it as an
1127 argument to speed up the process."""
1128 sub_format = self._downloader.params.get('subtitlesformat')
1129 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1130 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1131 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1133 self._downloader.report_warning(err_msg)
1135 player_config = json.loads(mobj.group(1))
1137 args = player_config[u'args']
1138 caption_url = args[u'ttsurl']
1139 timestamp = args[u'timestamp']
1140 # We get the available subtitles
1141 list_params = compat_urllib_parse.urlencode({
1146 list_url = caption_url + '&' + list_params
1147 list_page = self._download_webpage(list_url, video_id)
1148 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1149 original_lang_node = caption_list.find('track')
1150 if original_lang_node.attrib.get('kind') != 'asr' :
1151 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1153 original_lang = original_lang_node.attrib['lang_code']
1156 for lang_node in caption_list.findall('target'):
1157 sub_lang = lang_node.attrib['lang_code']
1158 params = compat_urllib_parse.urlencode({
1159 'lang': original_lang,
1165 sub_lang_list[sub_lang] = caption_url + '&' + params
1166 return sub_lang_list
1167 # An extractor error can be raise by the download process if there are
1168 # no automatic captions but there are subtitles
1169 except (KeyError, ExtractorError):
1170 self._downloader.report_warning(err_msg)
1173 def _print_formats(self, formats):
1174 print('Available formats:')
1176 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1177 self._video_dimensions.get(x, '???'),
1178 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1180 def _extract_id(self, url):
1181 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1183 raise ExtractorError(u'Invalid URL: %s' % url)
1184 video_id = mobj.group(2)
1187 def _get_video_url_list(self, url_map):
1189 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1190 with the requested formats.
1192 req_format = self._downloader.params.get('format', None)
1193 format_limit = self._downloader.params.get('format_limit', None)
1194 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1195 if format_limit is not None and format_limit in available_formats:
1196 format_list = available_formats[available_formats.index(format_limit):]
1198 format_list = available_formats
1199 existing_formats = [x for x in format_list if x in url_map]
1200 if len(existing_formats) == 0:
1201 raise ExtractorError(u'no known formats available for video')
1202 if self._downloader.params.get('listformats', None):
1203 self._print_formats(existing_formats)
1205 if req_format is None or req_format == 'best':
1206 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1207 elif req_format == 'worst':
1208 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1209 elif req_format in ('-1', 'all'):
1210 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1212 # Specific formats. We pick the first in a slash-delimeted sequence.
1213 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1214 # available in the specified format. For example,
1215 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1216 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1217 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1218 req_formats = req_format.split('/')
1219 video_url_list = None
1220 for rf in req_formats:
1222 video_url_list = [(rf, url_map[rf])]
1224 if rf in self._video_formats_map:
1225 for srf in self._video_formats_map[rf]:
1227 video_url_list = [(srf, url_map[srf])]
1232 if video_url_list is None:
1233 raise ExtractorError(u'requested format not available')
1234 return video_url_list
1236 def _extract_from_m3u8(self, manifest_url, video_id):
1238 def _get_urls(_manifest):
1239 lines = _manifest.split('\n')
1240 urls = filter(lambda l: l and not l.startswith('#'),
1243 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1244 formats_urls = _get_urls(manifest)
1245 for format_url in formats_urls:
1246 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1247 url_map[itag] = format_url
1250 def _real_extract(self, url):
1251 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1252 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1254 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1255 mobj = re.search(self._NEXT_URL_RE, url)
1257 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1258 video_id = self._extract_id(url)
1261 self.report_video_webpage_download(video_id)
1262 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1263 request = compat_urllib_request.Request(url)
1265 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1266 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1267 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1269 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1271 # Attempt to extract SWF player URL
1272 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1273 if mobj is not None:
1274 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1279 self.report_video_info_webpage_download(video_id)
1280 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1281 self.report_age_confirmation()
1283 # We simulate the access to the video from www.youtube.com/v/{video_id}
1284 # this can be viewed without login into Youtube
1285 data = compat_urllib_parse.urlencode({'video_id': video_id,
1289 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1293 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1294 video_info_webpage = self._download_webpage(video_info_url, video_id,
1296 errnote='unable to download video info webpage')
1297 video_info = compat_parse_qs(video_info_webpage)
1300 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1301 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1302 % (video_id, el_type))
1303 video_info_webpage = self._download_webpage(video_info_url, video_id,
1305 errnote='unable to download video info webpage')
1306 video_info = compat_parse_qs(video_info_webpage)
1307 if 'token' in video_info:
1309 if 'token' not in video_info:
1310 if 'reason' in video_info:
1311 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1313 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1315 # Check for "rental" videos
1316 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1317 raise ExtractorError(u'"rental" videos not supported')
1319 # Start extracting information
1320 self.report_information_extraction(video_id)
1323 if 'author' not in video_info:
1324 raise ExtractorError(u'Unable to extract uploader name')
1325 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1328 video_uploader_id = None
1329 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1330 if mobj is not None:
1331 video_uploader_id = mobj.group(1)
1333 self._downloader.report_warning(u'unable to extract uploader nickname')
1336 if 'title' not in video_info:
1337 raise ExtractorError(u'Unable to extract video title')
1338 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1341 # We try first to get a high quality image:
1342 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1343 video_webpage, re.DOTALL)
1344 if m_thumb is not None:
1345 video_thumbnail = m_thumb.group(1)
1346 elif 'thumbnail_url' not in video_info:
1347 self._downloader.report_warning(u'unable to extract video thumbnail')
1348 video_thumbnail = None
1349 else: # don't panic if we can't find it
1350 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1354 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1355 if mobj is not None:
1356 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1357 upload_date = unified_strdate(upload_date)
1360 video_description = get_element_by_id("eow-description", video_webpage)
1361 if video_description:
1362 video_description = clean_html(video_description)
1364 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1366 video_description = unescapeHTML(fd_mobj.group(1))
1368 video_description = u''
1371 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1373 if self._downloader.params.get('listsubtitles', False):
1374 self._list_available_subtitles(video_id, video_webpage)
1377 if 'length_seconds' not in video_info:
1378 self._downloader.report_warning(u'unable to extract video duration')
1381 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1383 # Decide which formats to download
1386 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1388 raise ValueError('Could not find vevo ID')
1389 info = json.loads(mobj.group(1))
1391 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1392 # this signatures are encrypted
1393 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1395 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1396 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1397 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1399 if 'url_encoded_fmt_stream_map' in video_info:
1400 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1402 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1403 elif 'adaptive_fmts' in video_info:
1404 if 'url_encoded_fmt_stream_map' in video_info:
1405 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1407 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1411 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1412 self.report_rtmp_download()
1413 video_url_list = [(None, video_info['conn'][0])]
1414 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1415 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1416 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1418 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1419 url_data = compat_parse_qs(url_data_str)
1420 if 'itag' in url_data and 'url' in url_data:
1421 url = url_data['url'][0]
1422 if 'sig' in url_data:
1423 url += '&signature=' + url_data['sig'][0]
1424 elif 's' in url_data:
1425 encrypted_sig = url_data['s'][0]
1426 if self._downloader.params.get('verbose'):
1428 if player_url is None:
1429 player_version = 'unknown'
1431 player_version = self._search_regex(
1432 r'-(.+)\.swf$', player_url,
1433 u'flash player', fatal=False)
1434 player_desc = 'flash player %s' % player_version
1436 player_version = self._search_regex(
1437 r'html5player-(.+?)\.js', video_webpage,
1438 'html5 player', fatal=False)
1439 player_desc = u'html5 player %s' % player_version
1441 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1442 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1443 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1446 jsplayer_url_json = self._search_regex(
1447 r'"assets":.+?"js":\s*("[^"]+")',
1448 video_webpage, u'JS player URL')
1449 player_url = json.loads(jsplayer_url_json)
1451 signature = self._decrypt_signature(
1452 encrypted_sig, video_id, player_url, age_gate)
1453 url += '&signature=' + signature
1454 if 'ratebypass' not in url:
1455 url += '&ratebypass=yes'
1456 url_map[url_data['itag'][0]] = url
1457 video_url_list = self._get_video_url_list(url_map)
1458 if not video_url_list:
1460 elif video_info.get('hlsvp'):
1461 manifest_url = video_info['hlsvp'][0]
1462 url_map = self._extract_from_m3u8(manifest_url, video_id)
1463 video_url_list = self._get_video_url_list(url_map)
1464 if not video_url_list:
1468 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1471 for format_param, video_real_url in video_url_list:
1473 video_extension = self._video_extensions.get(format_param, 'flv')
1475 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1476 self._video_dimensions.get(format_param, '???'),
1477 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1481 'url': video_real_url,
1482 'uploader': video_uploader,
1483 'uploader_id': video_uploader_id,
1484 'upload_date': upload_date,
1485 'title': video_title,
1486 'ext': video_extension,
1487 'format': video_format,
1488 'thumbnail': video_thumbnail,
1489 'description': video_description,
1490 'player_url': player_url,
1491 'subtitles': video_subtitles,
1492 'duration': video_duration
# YoutubePlaylistIE: extracts every video of a YouTube playlist via the GData
# API, paging through the feed and returning a playlist_result of watch URLs.
# NOTE(review): indentation was lost in this chunk and the embedded original
# line numbers have gaps, so some statements (e.g. the `try:` that pairs with
# the `except ValueError` below, loop `break`s and list initialisations) are
# not visible here — confirm against the full file before refactoring.
1496 class YoutubePlaylistIE(InfoExtractor):
1497 IE_DESC = u'YouTube.com playlists'
# Verbose-mode regex (matched with re.VERBOSE below): accepts
# course/view_play_list/my_playlists/artist/playlist/watch URLs with a
# p=/a=/list= query argument, or a bare PL/EC/UU/FL playlist id.
1498 _VALID_URL = r"""(?:
1503 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1504 \? (?:.*?&)*? (?:p|a|list)=
1507 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
1510 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
# GData playlist feed endpoint, formatted with (playlist_id, max-results,
# start-index).
1512 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
1514 IE_NAME = u'youtube:playlist'
# suitable() is overridden because _VALID_URL needs the re.VERBOSE flag.
1517 def suitable(cls, url):
1518 """Receives a URL and returns True if suitable for this IE."""
1519 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1521 def _real_extract(self, url):
1522 # Extract playlist id
1523 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1525 raise ExtractorError(u'Invalid URL: %s' % url)
1527 # Download playlist videos from API
# Group 1 is the id from a query parameter, group 2 a bare playlist id.
1528 playlist_id = mobj.group(1) or mobj.group(2)
# Page through the feed, _MAX_RESULTS entries per request (1-based index).
1531 for page_num in itertools.count(1):
1532 start_index = self._MAX_RESULTS * (page_num - 1) + 1
# Warn once the 1000-result window is exhausted — presumably an API cap;
# TODO(review) confirm against the GData docs.
1533 if start_index >= 1000:
1534 self._downloader.report_warning(u'Max number of results reached')
1536 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
1537 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
# Parse the JSON response, surfacing a clear error on malformed data.
1540 response = json.loads(page)
1541 except ValueError as err:
1542 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1544 if 'feed' not in response:
1545 raise ExtractorError(u'Got a malformed response from YouTube API')
1546 playlist_title = response['feed']['title']['$t']
# An absent 'entry' key apparently marks the end of the playlist.
1547 if 'entry' not in response['feed']:
1548 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, url) pairs; the position drives the sort below.
1551 for entry in response['feed']['entry']:
1552 index = entry['yt$position']['$t']
1553 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
1556 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
# Sort by playlist position, then keep only the URLs.
1559 videos = [v[1] for v in sorted(videos)]
1561 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
1562 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# YoutubeChannelIE: lists every video of a channel by scraping the channel's
# /videos page, then following the c4_browse_ajax JSON endpoint for the
# remaining pages.
# NOTE(review): indentation is stripped here and the embedded line numbers
# have gaps — `ids_in_page = []`, the helper's `return`, loop counters and
# `break`s are elided from this view.
1565 class YoutubeChannelIE(InfoExtractor):
1566 IE_DESC = u'YouTube.com channels'
1567 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1568 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker present in the HTML/JSON while more pages remain.
1569 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1570 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1571 IE_NAME = u'youtube:channel'
# Scrape the distinct video ids (order-preserving) from a page of HTML.
1573 def extract_videos_from_page(self, page):
1575 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1576 if mobj.group(1) not in ids_in_page:
1577 ids_in_page.append(mobj.group(1))
1580 def _real_extract(self, url):
1581 # Extract channel id
1582 mobj = re.match(self._VALID_URL, url)
1584 raise ExtractorError(u'Invalid URL: %s' % url)
1586 # Download channel page
1587 channel_id = mobj.group(1)
# First page is plain HTML.
1591 url = self._TEMPLATE_URL % (channel_id, pagenum)
1592 page = self._download_webpage(url, channel_id,
1593 u'Downloading page #%s' % pagenum)
1595 # Extract video identifiers
1596 ids_in_page = self.extract_videos_from_page(page)
1597 video_ids.extend(ids_in_page)
1599 # Download any subsequent channel pages using the json-based channel_ajax query
1600 if self._MORE_PAGES_INDICATOR in page:
1601 for pagenum in itertools.count(1):
1602 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1603 page = self._download_webpage(url, channel_id,
1604 u'Downloading page #%s' % pagenum)
# Subsequent pages are JSON carrying rendered HTML fragments.
1606 page = json.loads(page)
1608 ids_in_page = self.extract_videos_from_page(page['content_html'])
1609 video_ids.extend(ids_in_page)
# Stop when the "load more" widget no longer offers another page.
1611 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1614 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1616 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1617 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
1618 return [self.playlist_result(url_entries, channel_id)]
# YoutubeUserIE: extracts all uploads of a user via the GData uploads feed,
# requesting _GDATA_PAGE_SIZE ids per query until a short page is returned.
# NOTE(review): indentation is stripped and some lines are elided from this
# chunk (e.g. the @classmethod decorator for suitable(), `if mobj is None:`,
# list initialisations and `break`s) — confirm against the full file.
1621 class YoutubeUserIE(InfoExtractor):
1622 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
1623 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
1624 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData limits results per query, hence the page-by-page loop below.
1625 _GDATA_PAGE_SIZE = 50
1626 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
1627 IE_NAME = u'youtube:user'
1630 def suitable(cls, url):
1631 # Don't return True if the url can be extracted with another youtube
1632 # extractor; the regex is too permissive and it would match.
# Defer to every other *IE class defined in this module before claiming
# the URL for ourselves.
1633 other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
1634 if any(ie.suitable(url) for ie in other_ies): return False
1635 else: return super(YoutubeUserIE, cls).suitable(url)
1637 def _real_extract(self, url):
1639 mobj = re.match(self._VALID_URL, url)
1641 raise ExtractorError(u'Invalid URL: %s' % url)
1643 username = mobj.group(1)
1645 # Download video ids using YouTube Data API. Result size per
1646 # query is limited (currently to 50 videos) so we need to query
1647 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1652 for pagenum in itertools.count(0):
1653 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1655 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1656 page = self._download_webpage(gdata_url, username,
1657 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
# Parse the JSON feed, surfacing a clear error on malformed data.
1660 response = json.loads(page)
1661 except ValueError as err:
1662 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
# An absent 'entry' key apparently marks an empty (final) page.
1663 if 'entry' not in response['feed']:
1664 # Number of videos is a multiple of self._MAX_RESULTS
1667 # Extract video identifiers
# Each entry id ends in .../VIDEO_ID; keep the last path component.
1669 for entry in response['feed']['entry']:
1670 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1671 video_ids.extend(ids_in_page)
1673 # A little optimization - if current page is not
1674 # "full", ie. does not contain PAGE_SIZE video ids then
1675 # we can assume that this page is the last one - there
1676 # are no more ids on further pages - no need to query
1679 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1682 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1683 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1684 return [self.playlist_result(url_results, playlist_title = username)]
# YoutubeSearchIE: "ytsearchN:query" search extractor backed by the GData
# videos API (jsonc format), fetching up to 50 results per page.
# NOTE(review): indentation is stripped and some lines are elided here
# (e.g. _MAX_RESULTS, the `video_ids`/`pagenum`/`limit` initialisations,
# the `try:` paired with the except below, and the pagenum increment).
1686 class YoutubeSearchIE(SearchInfoExtractor):
1687 IE_DESC = u'YouTube.com searches'
# %s = url-quoted query, %i = 1-based start index.
1688 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1690 IE_NAME = u'youtube:search'
1691 _SEARCH_KEY = 'ytsearch'
1693 def report_download_page(self, query, pagenum):
1694 """Report attempt to download search page with given number."""
1695 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1697 def _get_n_results(self, query, n):
1698 """Get a specified number of results for a query"""
# Keep requesting 50-result pages until enough ids are collected.
1704 while (50 * pagenum) < limit:
1705 self.report_download_page(query, pagenum+1)
1706 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1707 request = compat_urllib_request.Request(result_url)
1709 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1710 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1711 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
# The jsonc payload nests everything under a top-level 'data' object.
1712 api_response = json.loads(data)['data']
1714 if not 'items' in api_response:
1715 raise ExtractorError(u'[youtube] No video results')
1717 new_ids = list(video['id'] for video in api_response['items'])
1718 video_ids += new_ids
# Never ask for more results than the API reports as available.
1720 limit = min(n, api_response['totalItems'])
# Trim any overshoot from the final page.
1723 if len(video_ids) > n:
1724 video_ids = video_ids[:n]
1725 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1726 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for multi-season YouTube shows.

    A show page links one playlist per season; each of those links is
    handed off to the playlist extractor.
    """
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        # The trailing part of the URL is the show's name.
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        return [
            self.url_result('https://www.youtube.com' + m.group(1), 'YoutubePlaylist')
            for m in season_matches
        ]
# Shared base for the authenticated feed extractors defined below
# (subscriptions, recommended, watch-later): pages through /feed_ajax and
# collects /watch?v= URLs from the rendered HTML it returns.
# NOTE(review): indentation is stripped and some lines are elided here —
# the docstring quotes, the @property decorators for _FEED_TEMPLATE and
# IE_NAME, the _real_initialize body, `feed_entries = []` and the loop's
# `break` are not visible in this chunk.
1744 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1746 Base class for extractors that fetch info from
1747 http://www.youtube.com/feed_ajax
1748 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
# Feeds are only available to a logged-in session (enforced by the base
# class login flow).
1750 _LOGIN_REQUIRED = True
1752 # use action_load_personal_feed instead of action_load_system_feed
1753 _PERSONAL_FEED = False
1756 def _FEED_TEMPLATE(self):
1757 action = 'action_load_system_feed'
1758 if self._PERSONAL_FEED:
1759 action = 'action_load_personal_feed'
# The %%s escape survives this formatting so the caller can later fill
# in the paging offset.
1760 return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
1764 return u'youtube:%s' % self._FEED_NAME
1766 def _real_initialize(self):
1769 def _real_extract(self, url):
1771 # The step argument is available only in 2.7 or higher
1772 for i in itertools.count(0):
1773 paging = i*self._PAGING_STEP
1774 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1775 u'%s feed' % self._FEED_NAME,
1776 u'Downloading page %s' % i)
# The endpoint answers JSON whose 'feed_html' field is rendered HTML.
1777 info = json.loads(info)
1778 feed_html = info['feed_html']
1779 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
# orderedSet presumably de-duplicates while preserving order (helper
# from utils) — TODO(review) confirm.
1780 ids = orderedSet(m.group(1) for m in m_ids)
1781 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
# A null 'paging' value marks the last page.
1782 if info['paging'] is None:
1784 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
# Logged-in user's subscriptions feed; a system feed, so the inherited
# _PERSONAL_FEED default (False) applies.
1786 class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
1787 IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
1788 _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1789 _FEED_NAME = 'subscriptions'
1790 _PLAYLIST_TITLE = u'Youtube Subscriptions'
# Logged-in user's recommended-videos feed; a system feed, so the inherited
# _PERSONAL_FEED default (False) applies.
1792 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
1793 IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1794 _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1795 _FEED_NAME = 'recommended'
1796 _PLAYLIST_TITLE = u'Youtube Recommended videos'
# Logged-in user's watch-later list.
1798 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1799 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1800 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1801 _FEED_NAME = 'watch_later'
1802 _PLAYLIST_TITLE = u'Youtube Watch Later'
# Watch-later is per-user, so the base class must hit the
# action_load_personal_feed ajax action instead of the system one.
1804 _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos.

    The favourites page embeds the id of its backing playlist, so the
    actual extraction is delegated to the playlist extractor.
    """
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are only visible to a logged-in session.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')