14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
38 class YoutubeBaseInfoExtractor(InfoExtractor):
39 """Provide base functions for Youtube extractors"""
40 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
41 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
42 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
43 _NETRC_MACHINE = 'youtube'
44 # If True it will raise an error if no login info is provided
45 _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce that the site language is about to be set."""
    self.to_screen(u'Setting language')
51 def _set_language(self):
52 request = compat_urllib_request.Request(self._LANG_URL)
55 compat_urllib_request.urlopen(request).read()
56 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
57 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
62 (username, password) = self._get_login_info()
63 # No authentication to be performed
65 if self._LOGIN_REQUIRED:
66 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
69 request = compat_urllib_request.Request(self._LOGIN_URL)
71 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
72 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
73 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
78 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
81 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
87 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
91 u'PersistentCookie': u'yes',
93 u'bgresponse': u'js_disabled',
94 u'checkConnection': u'',
95 u'checkedDomains': u'youtube',
101 u'signIn': u'Sign in',
103 u'service': u'youtube',
107 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
109 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
110 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
111 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
114 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
115 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
116 self._downloader.report_warning(u'unable to log in: bad username or password')
118 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
119 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
123 def _confirm_age(self):
126 'action_confirm': 'Confirm',
128 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
130 self.report_age_confirmation()
131 compat_urllib_request.urlopen(request).read().decode('utf-8')
132 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
133 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
136 def _real_initialize(self):
137 if self._downloader is None:
139 if not self._set_language():
141 if not self._login():
146 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
147 IE_DESC = u'YouTube.com'
150 (?:https?://)? # http(s):// (optional)
151 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
152 tube\.majestyc\.net/|
153 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
154 (?:.*?\#/)? # handle anchor (#/) redirect urls
155 (?: # the various things that can precede the ID:
156 (?:(?:v|embed|e)/) # v/ or embed/ or e/
157 |(?: # or the v= param in all its forms
158 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
159 (?:\?|\#!?) # the params delimiter ? or # or #!
160 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
164 |youtu\.be/ # just youtu.be/xxxx
166 )? # all until now is optional -> you can pass the naked ID
167 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
168 (?(1).+)? # if we found the ID, everything can follow
170 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
171 # Listed in order of quality
172 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
173 # Apple HTTP Live Streaming
174 '96', '95', '94', '93', '92', '132', '151',
176 '85', '84', '102', '83', '101', '82', '100',
178 '138', '137', '248', '136', '247', '135', '246',
179 '245', '244', '134', '243', '133', '242', '160',
181 '141', '172', '140', '171', '139',
183 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
184 # Apple HTTP Live Streaming
185 '96', '95', '94', '93', '92', '132', '151',
187 '85', '102', '84', '101', '83', '100', '82',
189 '138', '248', '137', '247', '136', '246', '245',
190 '244', '135', '243', '134', '242', '133', '160',
192 '172', '141', '171', '140', '139',
194 _video_formats_map = {
195 'flv': ['35', '34', '6', '5'],
196 '3gp': ['36', '17', '13'],
197 'mp4': ['38', '37', '22', '18'],
198 'webm': ['46', '45', '44', '43'],
200 _video_extensions = {
222 # Apple HTTP Live Streaming
254 _video_dimensions = {
336 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
337 u"file": u"BaW_jenozKc.mp4",
339 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
340 u"uploader": u"Philipp Hagemeister",
341 u"uploader_id": u"phihag",
342 u"upload_date": u"20121002",
343 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
347 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
348 u"file": u"1ltcDfZMA3U.flv",
349 u"note": u"Test VEVO video (#897)",
351 u"upload_date": u"20070518",
352 u"title": u"Maps - It Will Find You",
353 u"description": u"Music video by Maps performing It Will Find You.",
354 u"uploader": u"MuteUSA",
355 u"uploader_id": u"MuteUSA"
359 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
360 u"file": u"UxxajLWwzqY.mp4",
361 u"note": u"Test generic use_cipher_signature video (#897)",
363 u"upload_date": u"20120506",
364 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
365 u"description": u"md5:5b292926389560516e384ac437c0ec07",
366 u"uploader": u"Icona Pop",
367 u"uploader_id": u"IconaPop"
371 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
372 u"file": u"07FYdnEawAQ.mp4",
373 u"note": u"Test VEVO video with age protection (#956)",
375 u"upload_date": u"20130703",
376 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
377 u"description": u"md5:64249768eec3bc4276236606ea996373",
378 u"uploader": u"justintimberlakeVEVO",
379 u"uploader_id": u"justintimberlakeVEVO"
def suitable(cls, url):
    """Return True when this extractor should handle *url*.

    Playlist URLs are left for YoutubePlaylistIE to claim.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and an empty per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps player URL -> extracted signature-decryption function.
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce that the video webpage is being downloaded."""
    msg = u'%s: Downloading video webpage' % video_id
    self.to_screen(msg)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video info webpage is being downloaded."""
    msg = u'%s: Downloading video info webpage' % video_id
    self.to_screen(msg)
def report_information_extraction(self, video_id):
    """Announce that video information is being extracted."""
    msg = u'%s: Extracting video information' % video_id
    self.to_screen(msg)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available for this video."""
    msg = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(msg)
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
415 def _extract_signature_function(self, video_id, player_url, slen):
416 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
418 player_type = id_m.group('ext')
419 player_id = id_m.group('id')
421 # Read from filesystem cache
422 func_id = '%s_%s_%d' % (player_type, player_id, slen)
423 assert os.path.basename(func_id) == func_id
424 xdg_cache_home = os.environ.get('XDG_CACHE_HOME')
426 userCacheDir = os.path.join(xdg_cache_home, 'youtube-dl')
428 userCacheDir = os.path.join(os.path.expanduser('~'), '.cache', 'youtube-dl')
429 cache_dir = self._downloader.params.get('cachedir', userCacheDir)
431 cache_enabled = cache_dir is not None
433 cache_fn = os.path.join(os.path.expanduser(cache_dir),
437 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
438 cache_spec = json.load(cachef)
439 return lambda s: u''.join(s[i] for i in cache_spec)
441 pass # No cache available
443 if player_type == 'js':
444 code = self._download_webpage(
445 player_url, video_id,
446 note=u'Downloading %s player %s' % (player_type, player_id),
447 errnote=u'Download of %s failed' % player_url)
448 res = self._parse_sig_js(code)
449 elif player_type == 'swf':
450 urlh = self._request_webpage(
451 player_url, video_id,
452 note=u'Downloading %s player %s' % (player_type, player_id),
453 errnote=u'Download of %s failed' % player_url)
455 res = self._parse_sig_swf(code)
457 assert False, 'Invalid player type %r' % player_type
461 test_string = u''.join(map(compat_chr, range(slen)))
462 cache_res = res(test_string)
463 cache_spec = [ord(c) for c in cache_res]
465 os.makedirs(os.path.dirname(cache_fn))
466 except OSError as ose:
467 if ose.errno != errno.EEXIST:
469 write_json_file(cache_spec, cache_fn)
471 tb = traceback.format_exc()
472 self._downloader.report_warning(
473 u'Writing cache to %r failed: %s' % (cache_fn, tb))
477 def _print_sig_code(self, func, slen):
478 def gen_sig_code(idxs):
479 def _genslice(start, end, step):
480 starts = u'' if start == 0 else str(start)
481 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
482 steps = u'' if step == 1 else (u':%d' % step)
483 return u's[%s%s%s]' % (starts, ends, steps)
486 start = '(Never used)' # Quelch pyflakes warnings - start will be
487 # set as soon as step is set
488 for i, prev in zip(idxs[1:], idxs[:-1]):
492 yield _genslice(start, prev, step)
495 if i - prev in [-1, 1]:
500 yield u's[%d]' % prev
504 yield _genslice(start, i, step)
506 test_string = u''.join(map(compat_chr, range(slen)))
507 cache_res = func(test_string)
508 cache_spec = [ord(c) for c in cache_res]
509 expr_code = u' + '.join(gen_sig_code(cache_spec))
510 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
511 self.to_screen(u'Extracted signature function:\n' + code)
513 def _parse_sig_js(self, jscode):
514 funcname = self._search_regex(
515 r'signature=([a-zA-Z]+)', jscode,
516 u'Initial JS player signature function name')
521 return string.lowercase.index(varname)
523 def interpret_statement(stmt, local_vars, allow_recursion=20):
524 if allow_recursion < 0:
525 raise ExtractorError(u'Recursion limit reached')
527 if stmt.startswith(u'var '):
528 stmt = stmt[len(u'var '):]
529 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
530 r'=(?P<expr>.*)$', stmt)
532 if ass_m.groupdict().get('index'):
534 lvar = local_vars[ass_m.group('out')]
535 idx = interpret_expression(ass_m.group('index'),
536 local_vars, allow_recursion)
537 assert isinstance(idx, int)
540 expr = ass_m.group('expr')
543 local_vars[ass_m.group('out')] = val
545 expr = ass_m.group('expr')
546 elif stmt.startswith(u'return '):
548 expr = stmt[len(u'return '):]
550 raise ExtractorError(
551 u'Cannot determine left side of statement in %r' % stmt)
553 v = interpret_expression(expr, local_vars, allow_recursion)
556 def interpret_expression(expr, local_vars, allow_recursion):
561 return local_vars[expr]
563 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
565 member = m.group('member')
566 val = local_vars[m.group('in')]
567 if member == 'split("")':
569 if member == 'join("")':
571 if member == 'length':
573 if member == 'reverse()':
575 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
577 idx = interpret_expression(
578 slice_m.group('idx'), local_vars, allow_recursion-1)
582 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
584 val = local_vars[m.group('in')]
585 idx = interpret_expression(m.group('idx'), local_vars,
589 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
591 a = interpret_expression(m.group('a'),
592 local_vars, allow_recursion)
593 b = interpret_expression(m.group('b'),
594 local_vars, allow_recursion)
598 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
600 fname = m.group('func')
601 if fname not in functions:
602 functions[fname] = extract_function(fname)
603 argvals = [int(v) if v.isdigit() else local_vars[v]
604 for v in m.group('args').split(',')]
605 return functions[fname](argvals)
606 raise ExtractorError(u'Unsupported JS expression %r' % expr)
608 def extract_function(funcname):
610 r'function ' + re.escape(funcname) +
611 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
613 argnames = func_m.group('args').split(',')
616 local_vars = dict(zip(argnames, args))
617 for stmt in func_m.group('code').split(';'):
618 res = interpret_statement(stmt, local_vars)
622 initial_function = extract_function(funcname)
623 return lambda s: initial_function([s])
625 def _parse_sig_swf(self, file_contents):
626 if file_contents[1:3] != b'WS':
627 raise ExtractorError(
628 u'Not an SWF file; header is %r' % file_contents[:3])
629 if file_contents[:1] == b'C':
630 content = zlib.decompress(file_contents[8:])
632 raise NotImplementedError(u'Unsupported compression format %r' %
635 def extract_tags(content):
637 while pos < len(content):
638 header16 = struct.unpack('<H', content[pos:pos+2])[0]
640 tag_code = header16 >> 6
641 tag_len = header16 & 0x3f
643 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
645 assert pos+tag_len <= len(content)
646 yield (tag_code, content[pos:pos+tag_len])
650 for tag_code, tag in extract_tags(content)
652 p = code_tag.index(b'\0', 4) + 1
653 code_reader = io.BytesIO(code_tag[p:])
655 # Parse ABC (AVM2 ByteCode)
656 def read_int(reader=None):
664 b = struct.unpack('<B', buf)[0]
665 res = res | ((b & 0x7f) << shift)
671 def u30(reader=None):
672 res = read_int(reader)
673 assert res & 0xf0000000 == 0
677 def s32(reader=None):
679 if v & 0x80000000 != 0:
680 v = - ((v ^ 0xffffffff) + 1)
683 def read_string(reader=None):
687 resb = reader.read(slen)
688 assert len(resb) == slen
689 return resb.decode('utf-8')
691 def read_bytes(count, reader=None):
694 resb = reader.read(count)
695 assert len(resb) == count
698 def read_byte(reader=None):
699 resb = read_bytes(1, reader=reader)
700 res = struct.unpack('<B', resb)[0]
703 # minor_version + major_version
708 for _c in range(1, int_count):
711 for _c in range(1, uint_count):
714 read_bytes((double_count-1) * 8)
716 constant_strings = [u'']
717 for _c in range(1, string_count):
719 constant_strings.append(s)
720 namespace_count = u30()
721 for _c in range(1, namespace_count):
725 for _c in range(1, ns_set_count):
727 for _c2 in range(count):
729 multiname_count = u30()
738 0x0e: 2, # MultinameA
739 0x1b: 1, # MultinameL
740 0x1c: 1, # MultinameLA
743 for _c in range(1, multiname_count):
745 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
747 u30() # namespace_idx
749 multinames.append(constant_strings[name_idx])
751 multinames.append('[MULTINAME kind: %d]' % kind)
752 for _c2 in range(MULTINAME_SIZES[kind]):
757 MethodInfo = collections.namedtuple(
759 ['NEED_ARGUMENTS', 'NEED_REST'])
761 for method_id in range(method_count):
764 for _ in range(param_count):
766 u30() # name index (always 0 for youtube)
768 if flags & 0x08 != 0:
771 for c in range(option_count):
774 if flags & 0x80 != 0:
775 # Param names present
776 for _ in range(param_count):
778 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
779 method_infos.append(mi)
782 metadata_count = u30()
783 for _c in range(metadata_count):
786 for _c2 in range(item_count):
790 def parse_traits_info():
791 trait_name_idx = u30()
792 kind_full = read_byte()
793 kind = kind_full & 0x0f
794 attrs = kind_full >> 4
796 if kind in [0x00, 0x06]: # Slot or Const
798 u30() # type_name_idx
802 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
805 methods[multinames[trait_name_idx]] = method_idx
806 elif kind == 0x04: # Class
809 elif kind == 0x05: # Function
812 methods[function_idx] = multinames[trait_name_idx]
814 raise ExtractorError(u'Unsupported trait kind %d' % kind)
816 if attrs & 0x4 != 0: # Metadata present
817 metadata_count = u30()
818 for _c3 in range(metadata_count):
819 u30() # metadata index
824 TARGET_CLASSNAME = u'SignatureDecipher'
825 searched_idx = multinames.index(TARGET_CLASSNAME)
826 searched_class_id = None
828 for class_id in range(class_count):
830 if name_idx == searched_idx:
831 # We found the class we're looking for!
832 searched_class_id = class_id
833 u30() # super_name idx
835 if flags & 0x08 != 0: # Protected namespace is present
836 u30() # protected_ns_idx
838 for _c2 in range(intrf_count):
842 for _c2 in range(trait_count):
845 if searched_class_id is None:
846 raise ExtractorError(u'Target class %r not found' %
851 for class_id in range(class_count):
854 for _c2 in range(trait_count):
855 trait_methods = parse_traits_info()
856 if class_id == searched_class_id:
857 method_names.update(trait_methods.items())
858 method_idxs.update(dict(
860 for name, idx in trait_methods.items()))
864 for _c in range(script_count):
867 for _c2 in range(trait_count):
871 method_body_count = u30()
872 Method = collections.namedtuple('Method', ['code', 'local_count'])
874 for _c in range(method_body_count):
878 u30() # init_scope_depth
879 u30() # max_scope_depth
881 code = read_bytes(code_length)
882 if method_idx in method_idxs:
883 m = Method(code, local_count)
884 methods[method_idxs[method_idx]] = m
885 exception_count = u30()
886 for _c2 in range(exception_count):
893 for _c2 in range(trait_count):
896 assert p + code_reader.tell() == len(code_tag)
897 assert len(methods) == len(method_idxs)
899 method_pyfunctions = {}
901 def extract_function(func_name):
902 if func_name in method_pyfunctions:
903 return method_pyfunctions[func_name]
904 if func_name not in methods:
905 raise ExtractorError(u'Cannot find function %r' % func_name)
906 m = methods[func_name]
909 registers = ['(this)'] + list(args) + [None] * m.local_count
911 coder = io.BytesIO(m.code)
913 opcode = struct.unpack('!B', coder.read(1))[0]
914 if opcode == 36: # pushbyte
915 v = struct.unpack('!B', coder.read(1))[0]
917 elif opcode == 44: # pushstring
919 stack.append(constant_strings[idx])
920 elif opcode == 48: # pushscope
921 # We don't implement the scope register, so we'll just
922 # ignore the popped value
924 elif opcode == 70: # callproperty
926 mname = multinames[index]
927 arg_count = u30(coder)
928 args = list(reversed(
929 [stack.pop() for _ in range(arg_count)]))
931 if mname == u'split':
932 assert len(args) == 1
933 assert isinstance(args[0], compat_str)
934 assert isinstance(obj, compat_str)
938 res = obj.split(args[0])
940 elif mname == u'slice':
941 assert len(args) == 1
942 assert isinstance(args[0], int)
943 assert isinstance(obj, list)
946 elif mname == u'join':
947 assert len(args) == 1
948 assert isinstance(args[0], compat_str)
949 assert isinstance(obj, list)
950 res = args[0].join(obj)
952 elif mname in method_pyfunctions:
953 stack.append(method_pyfunctions[mname](args))
955 raise NotImplementedError(
956 u'Unsupported property %r on %r'
958 elif opcode == 72: # returnvalue
961 elif opcode == 79: # callpropvoid
963 mname = multinames[index]
964 arg_count = u30(coder)
965 args = list(reversed(
966 [stack.pop() for _ in range(arg_count)]))
968 if mname == u'reverse':
969 assert isinstance(obj, list)
972 raise NotImplementedError(
973 u'Unsupported (void) property %r on %r'
975 elif opcode == 93: # findpropstrict
977 mname = multinames[index]
978 res = extract_function(mname)
980 elif opcode == 97: # setproperty
985 assert isinstance(obj, list)
986 assert isinstance(idx, int)
988 elif opcode == 98: # getlocal
990 stack.append(registers[index])
991 elif opcode == 99: # setlocal
994 registers[index] = value
995 elif opcode == 102: # getproperty
997 pname = multinames[index]
998 if pname == u'length':
1000 assert isinstance(obj, list)
1001 stack.append(len(obj))
1002 else: # Assume attribute access
1004 assert isinstance(idx, int)
1006 assert isinstance(obj, list)
1007 stack.append(obj[idx])
1008 elif opcode == 128: # coerce
1010 elif opcode == 133: # coerce_s
1011 assert isinstance(stack[-1], (type(None), compat_str))
1012 elif opcode == 164: # modulo
1013 value2 = stack.pop()
1014 value1 = stack.pop()
1015 res = value1 % value2
1017 elif opcode == 208: # getlocal_0
1018 stack.append(registers[0])
1019 elif opcode == 209: # getlocal_1
1020 stack.append(registers[1])
1021 elif opcode == 210: # getlocal_2
1022 stack.append(registers[2])
1023 elif opcode == 211: # getlocal_3
1024 stack.append(registers[3])
1025 elif opcode == 214: # setlocal_2
1026 registers[2] = stack.pop()
1027 elif opcode == 215: # setlocal_3
1028 registers[3] = stack.pop()
1030 raise NotImplementedError(
1031 u'Unsupported opcode %d' % opcode)
1033 method_pyfunctions[func_name] = resfunc
1036 initial_function = extract_function(u'decipher')
1037 return lambda s: initial_function([s])
1039 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1040 """Turn the encrypted s field into a working signature"""
1042 if player_url is not None:
1044 if player_url not in self._player_cache:
1045 func = self._extract_signature_function(
1046 video_id, player_url, len(s)
1048 self._player_cache[player_url] = func
1049 func = self._player_cache[player_url]
1050 if self._downloader.params.get('youtube_print_sig_code'):
1051 self._print_sig_code(func, len(s))
1054 tb = traceback.format_exc()
1055 self._downloader.report_warning(
1056 u'Automatic signature extraction failed: ' + tb)
1058 self._downloader.report_warning(
1059 u'Warning: Falling back to static signature algorithm')
1061 return self._static_decrypt_signature(
1062 s, video_id, player_url, age_gate)
1064 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1066 # The videos with age protection use another player, so the
1067 # algorithms can be different.
1069 return s[2:63] + s[82] + s[64:82] + s[63]
1072 return s[86:29:-1] + s[88] + s[28:5:-1]
1074 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1076 return s[84:27:-1] + s[86] + s[26:5:-1]
1078 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1080 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1082 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1084 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1086 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1088 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1090 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1092 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1094 return s[12] + s[79:12:-1] + s[80] + s[11::-1]
1096 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1098 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1100 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1103 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1105 def _get_available_subtitles(self, video_id):
1107 sub_list = self._download_webpage(
1108 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1109 video_id, note=False)
1110 except ExtractorError as err:
1111 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1113 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1118 params = compat_urllib_parse.urlencode({
1121 'fmt': self._downloader.params.get('subtitlesformat'),
1123 url = u'http://www.youtube.com/api/timedtext?' + params
1124 sub_lang_list[lang] = url
1125 if not sub_lang_list:
1126 self._downloader.report_warning(u'video doesn\'t have subtitles')
1128 return sub_lang_list
1130 def _get_available_automatic_caption(self, video_id, webpage):
1131 """We need the webpage for getting the captions url, pass it as an
1132 argument to speed up the process."""
1133 sub_format = self._downloader.params.get('subtitlesformat')
1134 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1135 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1136 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1138 self._downloader.report_warning(err_msg)
1140 player_config = json.loads(mobj.group(1))
1142 args = player_config[u'args']
1143 caption_url = args[u'ttsurl']
1144 timestamp = args[u'timestamp']
1145 # We get the available subtitles
1146 list_params = compat_urllib_parse.urlencode({
1151 list_url = caption_url + '&' + list_params
1152 list_page = self._download_webpage(list_url, video_id)
1153 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1154 original_lang_node = caption_list.find('track')
1155 if original_lang_node.attrib.get('kind') != 'asr' :
1156 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1158 original_lang = original_lang_node.attrib['lang_code']
1161 for lang_node in caption_list.findall('target'):
1162 sub_lang = lang_node.attrib['lang_code']
1163 params = compat_urllib_parse.urlencode({
1164 'lang': original_lang,
1170 sub_lang_list[sub_lang] = caption_url + '&' + params
1171 return sub_lang_list
1172 # An extractor error can be raised by the download process if there are
1173 # no automatic captions but there are subtitles
1174 except (KeyError, ExtractorError):
1175 self._downloader.report_warning(err_msg)
1178 def _print_formats(self, formats):
1179 print('Available formats:')
1181 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1182 self._video_dimensions.get(x, '???'),
1183 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1185 def _extract_id(self, url):
1186 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1188 raise ExtractorError(u'Invalid URL: %s' % url)
1189 video_id = mobj.group(2)
1192 def _get_video_url_list(self, url_map):
1194 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1195 with the requested formats.
1197 req_format = self._downloader.params.get('format', None)
1198 format_limit = self._downloader.params.get('format_limit', None)
1199 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1200 if format_limit is not None and format_limit in available_formats:
1201 format_list = available_formats[available_formats.index(format_limit):]
1203 format_list = available_formats
1204 existing_formats = [x for x in format_list if x in url_map]
1205 if len(existing_formats) == 0:
1206 raise ExtractorError(u'no known formats available for video')
1207 if self._downloader.params.get('listformats', None):
1208 self._print_formats(existing_formats)
1210 if req_format is None or req_format == 'best':
1211 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1212 elif req_format == 'worst':
1213 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1214 elif req_format in ('-1', 'all'):
1215 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1217 # Specific formats. We pick the first in a slash-delimited sequence.
1218 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1219 # available in the specified format. For example,
1220 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1221 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1222 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1223 req_formats = req_format.split('/')
1224 video_url_list = None
1225 for rf in req_formats:
1227 video_url_list = [(rf, url_map[rf])]
1229 if rf in self._video_formats_map:
1230 for srf in self._video_formats_map[rf]:
1232 video_url_list = [(srf, url_map[srf])]
1237 if video_url_list is None:
1238 raise ExtractorError(u'requested format not available')
1239 return video_url_list
1241 def _extract_from_m3u8(self, manifest_url, video_id):
1243 def _get_urls(_manifest):
1244 lines = _manifest.split('\n')
1245 urls = filter(lambda l: l and not l.startswith('#'),
1248 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1249 formats_urls = _get_urls(manifest)
1250 for format_url in formats_urls:
1251 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1252 url_map[itag] = format_url
1255 def _real_extract(self, url):
1256 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1257 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1259 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1260 mobj = re.search(self._NEXT_URL_RE, url)
1262 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1263 video_id = self._extract_id(url)
1266 self.report_video_webpage_download(video_id)
1267 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1268 request = compat_urllib_request.Request(url)
1270 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1271 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1272 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1274 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1276 # Attempt to extract SWF player URL
1277 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1278 if mobj is not None:
1279 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1284 self.report_video_info_webpage_download(video_id)
1285 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1286 self.report_age_confirmation()
1288 # We simulate the access to the video from www.youtube.com/v/{video_id}
1289 # this can be viewed without login into Youtube
1290 data = compat_urllib_parse.urlencode({'video_id': video_id,
1294 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1298 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1299 video_info_webpage = self._download_webpage(video_info_url, video_id,
1301 errnote='unable to download video info webpage')
1302 video_info = compat_parse_qs(video_info_webpage)
1305 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1306 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1307 % (video_id, el_type))
1308 video_info_webpage = self._download_webpage(video_info_url, video_id,
1310 errnote='unable to download video info webpage')
1311 video_info = compat_parse_qs(video_info_webpage)
1312 if 'token' in video_info:
1314 if 'token' not in video_info:
1315 if 'reason' in video_info:
1316 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1318 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1320 # Check for "rental" videos
1321 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1322 raise ExtractorError(u'"rental" videos not supported')
1324 # Start extracting information
1325 self.report_information_extraction(video_id)
1328 if 'author' not in video_info:
1329 raise ExtractorError(u'Unable to extract uploader name')
1330 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1333 video_uploader_id = None
1334 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1335 if mobj is not None:
1336 video_uploader_id = mobj.group(1)
1338 self._downloader.report_warning(u'unable to extract uploader nickname')
1341 if 'title' in video_info:
1342 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1344 self._downloader.report_warning(u'Unable to extract video title')
1348 # We try first to get a high quality image:
1349 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1350 video_webpage, re.DOTALL)
1351 if m_thumb is not None:
1352 video_thumbnail = m_thumb.group(1)
1353 elif 'thumbnail_url' not in video_info:
1354 self._downloader.report_warning(u'unable to extract video thumbnail')
1355 video_thumbnail = None
1356 else: # don't panic if we can't find it
1357 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1361 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1362 if mobj is not None:
1363 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1364 upload_date = unified_strdate(upload_date)
1367 video_description = get_element_by_id("eow-description", video_webpage)
1368 if video_description:
1369 video_description = clean_html(video_description)
1371 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1373 video_description = unescapeHTML(fd_mobj.group(1))
1375 video_description = u''
1378 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1380 if self._downloader.params.get('listsubtitles', False):
1381 self._list_available_subtitles(video_id, video_webpage)
1384 if 'length_seconds' not in video_info:
1385 self._downloader.report_warning(u'unable to extract video duration')
1388 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1390 # Decide which formats to download
1393 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1395 raise ValueError('Could not find vevo ID')
1396 info = json.loads(mobj.group(1))
1398 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1399 # this signatures are encrypted
1400 if 'url_encoded_fmt_stream_map' not in args:
1401 raise ValueError(u'No stream_map present') # caught below
1402 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1404 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1405 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1406 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1408 if 'url_encoded_fmt_stream_map' in video_info:
1409 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1411 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1412 elif 'adaptive_fmts' in video_info:
1413 if 'url_encoded_fmt_stream_map' in video_info:
1414 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1416 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1420 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1421 self.report_rtmp_download()
1422 video_url_list = [(None, video_info['conn'][0])]
1423 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1424 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1425 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1427 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1428 url_data = compat_parse_qs(url_data_str)
1429 if 'itag' in url_data and 'url' in url_data:
1430 url = url_data['url'][0]
1431 if 'sig' in url_data:
1432 url += '&signature=' + url_data['sig'][0]
1433 elif 's' in url_data:
1434 encrypted_sig = url_data['s'][0]
1435 if self._downloader.params.get('verbose'):
1437 if player_url is None:
1438 player_version = 'unknown'
1440 player_version = self._search_regex(
1441 r'-(.+)\.swf$', player_url,
1442 u'flash player', fatal=False)
1443 player_desc = 'flash player %s' % player_version
1445 player_version = self._search_regex(
1446 r'html5player-(.+?)\.js', video_webpage,
1447 'html5 player', fatal=False)
1448 player_desc = u'html5 player %s' % player_version
1450 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1451 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1452 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1455 jsplayer_url_json = self._search_regex(
1456 r'"assets":.+?"js":\s*("[^"]+")',
1457 video_webpage, u'JS player URL')
1458 player_url = json.loads(jsplayer_url_json)
1460 signature = self._decrypt_signature(
1461 encrypted_sig, video_id, player_url, age_gate)
1462 url += '&signature=' + signature
1463 if 'ratebypass' not in url:
1464 url += '&ratebypass=yes'
1465 url_map[url_data['itag'][0]] = url
1466 video_url_list = self._get_video_url_list(url_map)
1467 if not video_url_list:
1469 elif video_info.get('hlsvp'):
1470 manifest_url = video_info['hlsvp'][0]
1471 url_map = self._extract_from_m3u8(manifest_url, video_id)
1472 video_url_list = self._get_video_url_list(url_map)
1473 if not video_url_list:
1477 raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
1480 for format_param, video_real_url in video_url_list:
1482 video_extension = self._video_extensions.get(format_param, 'flv')
1484 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1485 self._video_dimensions.get(format_param, '???'),
1486 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1490 'url': video_real_url,
1491 'uploader': video_uploader,
1492 'uploader_id': video_uploader_id,
1493 'upload_date': upload_date,
1494 'title': video_title,
1495 'ext': video_extension,
1496 'format': video_format,
1497 'thumbnail': video_thumbnail,
1498 'description': video_description,
1499 'player_url': player_url,
1500 'subtitles': video_subtitles,
1501 'duration': video_duration
# NOTE(review): this chunk appears to have source lines dropped inside this
# class (parts of the _VALID_URL pattern, _MAX_RESULTS, the @classmethod
# decorator on suitable, 'if mobj is None:' guards, a 'try:' opener and
# 'break' statements) — confirm against the full file before editing further.
class YoutubePlaylistIE(InfoExtractor):
    # Extracts every video of a YouTube playlist via the gdata v2 JSON API.
    IE_DESC = u'YouTube.com playlists'
    # Verbose-mode pattern: matches playlist-style URLs carrying a p=/a=/list=
    # query parameter, and also bare PL/EC/UU/FL playlist ids.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # gdata endpoint: %s = playlist id, %i = page size, %i = 1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose form, so re.VERBOSE is required here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)
        # The id may be captured by either alternative of _VALID_URL.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Check if it's a video-specific URL
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube')
            self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        # Download playlist videos from API
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            # presumably the gdata API refuses start indexes this large — TODO confirm
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                # yt$position is the video's 1-based position within the playlist.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Sort by playlist position, then keep only the watch URLs.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): this chunk appears to drop several source lines in this class
# ('ids_in_page = []' / 'return ids_in_page' in extract_videos_from_page, the
# 'if mobj is None:' guard, loop initialisation and 'break' statements) —
# confirm against the full file before editing further.
class YoutubeChannelIE(InfoExtractor):
    # Extracts all videos of a YouTube channel, oldest first (sort=da).
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is fetched as a plain HTML listing.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker string whose presence in a page means more pages are available.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Subsequent pages come from the JSON c4_browse_ajax endpoint.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect the distinct video ids linked from a channel page, in order
        # of first appearance.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)
                # The ajax response wraps the listing HTML in 'content_html'.
                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)
                # 'load_more_widget_html' advertises whether another page exists.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): this chunk appears to drop source lines in this class (the
# @classmethod decorator on suitable, the 'if mobj is None:' guard, the
# 'video_ids'/'ids_in_page' initialisations, a 'try:' opener and 'break'
# statements) — confirm against the full file before editing further.
class YoutubeUserIE(InfoExtractor):
    # Extracts all uploads of a user via the gdata API, one page at a time.
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum result count the gdata uploads feed returns per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractor, the regex would is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        for pagenum in itertools.count(0):
            # start-index is 1-based in the gdata API.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The entry id ends in '/<video_id>'; keep the last path piece.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): this chunk appears to drop source lines in _get_n_results
# (the video_ids/pagenum/limit initialisation, a 'try:' opener and the page
# increment) — confirm against the full file before editing further.
class YoutubeSearchIE(SearchInfoExtractor):
    # Implements the "ytsearchN:query" pseudo-URL via the gdata jsonc API.
    IE_DESC = u'YouTube.com searches'
    # %s = URL-quoted query, %i = 1-based start index; fixed page size of 50.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more results than the API reports to exist.
            limit = min(n, api_response['totalItems'])

        # The last page may overshoot the requested count; trim to exactly n.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube show pages; yields one result per season playlist."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is exposed on the page as a playlist link.
        season_paths = [m.group(1) for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
# NOTE(review): this chunk appears to drop source lines in this class (the
# property decorators and 'def IE_NAME' header belonging to the orphan
# 'return u"youtube:%s"' below, the _real_initialize body, _PAGING_STEP,
# 'feed_entries = []' and a 'break') — confirm against the full file.
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Builds the per-page feed URL; the doubled '%%s' leaves a '%s' slot
        # that _real_extract fills in with the paging offset.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet keeps first occurrence while dropping duplicates.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the last page of the feed.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's subscriptions feed."""
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    # Feed name used by the base class to build the feed_ajax URL.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's recommended-videos feed."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    # Feed name used by the base class to build the feed_ajax URL.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Extractor for the logged-in user's watch-later list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Feed name used by the base class to build the feed_ajax URL.
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'

    # Watch-later is per-user, so the base class must hit the personal feed
    # action instead of the system feed one.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor for the logged-in user's favourite videos.

    Delegates the actual extraction to the playlist extractor once the
    backing playlist id has been scraped from the favourites page.
    """
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    # Favourites are per-account, so credentials are mandatory.
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The page embeds the id of the playlist backing the favourites list;
        # hand that id over to YoutubePlaylistIE, which accepts bare ids.
        favourites_playlist = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_playlist, 'YoutubePlaylist')