14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    # Endpoints used during extractor initialization (login / language / age gate).
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Machine name looked up in the user's .netrc file for credentials.
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        # Hit the language-selection URL so later pages come back in English;
        # a network failure only produces a warning (best-effort behaviour).
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # --- login flow (credentials from CLI or .netrc, see _NETRC_MACHINE) ---
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if self._LOGIN_REQUIRED:
            raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)

        # Fetch the sign-in page to pick up the anti-forgery form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Hidden inputs GALX and dsh must be echoed back in the login POST.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Fields of the Google sign-in form POSTed to _LOGIN_URL.
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present in the response, the credentials
        # were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _confirm_age(self):
        # POST the confirmation form to the age-verification endpoint; unlike
        # language/login, a failure here raises (the video needs the cookie).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_initialize(self):
        # Standard InfoExtractor hook: bail out early when a step fails.
        # NOTE(review): presumably each failed step returns/aborts here —
        # the guarded bodies are not visible in this copy.
        if self._downloader is None:
        if not self._set_language():
        if not self._login():
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    IE_DESC = u'YouTube.com'
    # Verbose-mode URL regex; the 11-character video ID is captured by the
    # ([0-9A-Za-z_-]{11}) group below (consumed by _extract_id).
        (?:https?://)?                                       # http(s):// (optional)
        (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/|
           youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
         (?:                                                  # the various things that can precede the ID:
             (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
             |(?:                                             # or the v= param in all its forms
                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
         |youtu\.be/                                          # just youtu.be/xxxx
         )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]{11})                                   # here is it! the YouTube video ID
        (?(1).+)?                                             # if we found the ID, everything can follow
    # Matches a next_url= redirect parameter (age gate etc.), see _real_extract.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '84', '102', '83', '101', '82', '100',
                          '138', '137', '248', '136', '247', '135', '246',
                          '245', '244', '134', '243', '133', '242', '160',
                          '141', '172', '140', '171', '139',
    # Same itags as above, reordered so that free (WebM) containers rank first.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
                          # Apple HTTP Live Streaming
                          '96', '95', '94', '93', '92', '132', '151',
                          '85', '102', '84', '101', '83', '100', '82',
                          '138', '248', '137', '247', '136', '246', '245',
                          '244', '135', '243', '134', '242', '133', '160',
                          '172', '141', '171', '140', '139',
    # Container name -> itags in descending quality; lets -f accept 'mp4' etc.
    # (used by _get_video_url_list).
    _video_formats_map = {
        'flv': ['35', '34', '6', '5'],
        '3gp': ['36', '17', '13'],
        'mp4': ['38', '37', '22', '18'],
        'webm': ['46', '45', '44', '43'],
    # itag -> file extension (entries not visible in this copy).
    _video_extensions = {
        # Apple HTTP Live Streaming
    # itag -> display dimensions string, used by _print_formats.
    _video_dimensions = {
    # Test definitions consumed by the test suite.
        u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
        u"file": u"BaW_jenozKc.mp4",
        u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
        u"uploader": u"Philipp Hagemeister",
        u"uploader_id": u"phihag",
        u"upload_date": u"20121002",
        u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
        u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
        u"file": u"1ltcDfZMA3U.flv",
        u"note": u"Test VEVO video (#897)",
        u"upload_date": u"20070518",
        u"title": u"Maps - It Will Find You",
        u"description": u"Music video by Maps performing It Will Find You.",
        u"uploader": u"MuteUSA",
        u"uploader_id": u"MuteUSA"
        u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
        u"file": u"UxxajLWwzqY.mp4",
        u"note": u"Test generic use_cipher_signature video (#897)",
        u"upload_date": u"20120506",
        u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
        u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
        u"uploader": u"Icona Pop",
        u"uploader_id": u"IconaPop"
        u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
        u"file": u"07FYdnEawAQ.mp4",
        u"note": u"Test VEVO video with age protection (#956)",
        u"upload_date": u"20130703",
        u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
        u"description": u"md5:64249768eec3bc4276236606ea996373",
        u"uploader": u"justintimberlakeVEVO",
        u"uploader_id": u"justintimberlakeVEVO"
        u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
        u'file': u'TGi3HqYrWHE.mp4',
        u'note': u'm3u8 video',
        u'title': u'Triathlon - Men - London 2012 Olympic Games',
        u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
        u'uploader': u'olympic',
        u'upload_date': u'20120807',
        u'uploader_id': u'olympic',
        u'skip_download': True,
def suitable(cls, url):
    """Return True when this extractor should handle *url*.

    Playlist URLs are explicitly excluded so that YoutubePlaylistIE
    gets first pick on them.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor; _player_cache memoizes the signature
    function extracted for each player URL (see _decrypt_signature)."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    self._player_cache = dict()
def report_video_webpage_download(self, video_id):
    """Notify the user that the video webpage is being fetched."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Notify the user that the video info webpage is being fetched."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Notify the user that metadata extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Notify the user that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Tell the user that this download will go over RTMP."""
    self.to_screen(u'RTMP download detected')
def _extract_signature_function(self, video_id, player_url, slen):
    """Build a signature-decryption function for the given player.

    The player type (js/swf) and id are parsed out of player_url; the
    resulting index permutation is cached on disk keyed by
    (type, id, signature length) so the player is only downloaded once.
    Returns a callable mapping a scrambled signature string of length
    *slen* to the decrypted one.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Guard against path-traversal characters sneaking into the filename.
    assert os.path.basename(func_id) == func_id
    cache_dir = self._downloader.params.get('cachedir',
                                            u'~/.youtube-dl/cache')
    # The sentinel value u'NONE' disables the cache entirely.
    cache_enabled = cache_dir != u'NONE'
    cache_fn = os.path.join(os.path.expanduser(cache_dir),
        with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
            # The cached spec is simply the list of source indices, so the
            # function is a fixed reordering of the input characters.
            cache_spec = json.load(cachef)
            return lambda s: u''.join(s[i] for i in cache_spec)
        pass  # No cache available

    # Cache miss: download the player and parse the decipher routine out of it.
    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_swf(code)
        assert False, 'Invalid player type %r' % player_type

    # Derive the index permutation by running the function on the identity
    # string chr(0)..chr(slen-1), then persist it. Cache-write failures are
    # only warnings — the extracted function is still returned.
    cache_res = res(map(compat_chr, range(slen)))
    cache_spec = [ord(c) for c in cache_res]
        os.makedirs(os.path.dirname(cache_fn))
    except OSError as ose:
        if ose.errno != errno.EEXIST:
    write_json_file(cache_spec, cache_fn)
        tb = traceback.format_exc()
        self._downloader.report_warning(
            u'Writing cache to %r failed: %s' % (cache_fn, tb))
def _print_sig_code(self, func, slen):
    """Print *func* (a signature function for length *slen*) as Python
    source, expressing the permutation compactly as slices where runs of
    consecutive indices allow it (--youtube-print-sig-code support)."""
    def gen_sig_code(idxs):
        def _genslice(start, end, step):
            # Render s[start:end+step:step], omitting defaults (0 start, 1 step).
            starts = u'' if start == 0 else str(start)
            ends = u':%d' % (end+step)
            steps = u'' if step == 1 else (':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        start = '(Never used)'  # Quelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            # A change of stride ends the current run: emit it as a slice.
            yield _genslice(start, prev, step)
            # Stride of +/-1 starts (or continues) a sliceable run.
            if i - prev in [-1, 1]:
            # Isolated index: emit a plain single-character lookup.
            yield u's[%d]' % prev
            yield _genslice(start, i, step)

    # Recover the permutation by applying func to the identity string.
    cache_res = func(map(compat_chr, range(slen)))
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
def _parse_sig_js(self, jscode):
    """Extract the signature function from a JS player by interpreting
    the small subset of JavaScript that the decipher code uses.

    Returns a callable mapping a signature string to its decrypted form.
    """
    funcname = self._search_regex(
        r'signature=([a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

        # Single lowercase letters are treated as variable "slots".
        return string.lowercase.index(varname)

    def interpret_statement(stmt, local_vars, allow_recursion=20):
        # Execute one JS statement; allow_recursion bounds nested evaluation.
        if allow_recursion < 0:
            raise ExtractorError(u'Recursion limit reached')
        if stmt.startswith(u'var '):
            stmt = stmt[len(u'var '):]
        # Assignment: <out> = expr  or  <out>[index] = expr
        ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
                         r'=(?P<expr>.*)$', stmt)
            if ass_m.groupdict().get('index'):
                # Indexed assignment into a local list.
                lvar = local_vars[ass_m.group('out')]
                idx = interpret_expression(ass_m.group('index'),
                                           local_vars, allow_recursion)
                assert isinstance(idx, int)
                expr = ass_m.group('expr')
                # Plain assignment to a local name.
                local_vars[ass_m.group('out')] = val
            expr = ass_m.group('expr')
        elif stmt.startswith(u'return '):
            expr = stmt[len(u'return '):]
            raise ExtractorError(
                u'Cannot determine left side of statement in %r' % stmt)

        v = interpret_expression(expr, local_vars, allow_recursion)

    def interpret_expression(expr, local_vars, allow_recursion):
        # Bare local name.
            return local_vars[expr]

        # Member access: split/join/length/reverse/slice on a local value.
        m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
            member = m.group('member')
            val = local_vars[m.group('in')]
            if member == 'split("")':
            if member == 'join("")':
            if member == 'length':
            if member == 'reverse()':
            slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
                idx = interpret_expression(
                    slice_m.group('idx'), local_vars, allow_recursion-1)

        # Indexing: name[idx]
            r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
            val = local_vars[m.group('in')]
            idx = interpret_expression(m.group('idx'), local_vars,

        # Binary modulo: a % b (the only binary operator the players use here).
        m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
            a = interpret_expression(m.group('a'),
                                     local_vars, allow_recursion)
            b = interpret_expression(m.group('b'),
                                     local_vars, allow_recursion)

        # Function call: fname(args) — functions are extracted lazily.
            r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
            fname = m.group('func')
            if fname not in functions:
                functions[fname] = extract_function(fname)
            argvals = [int(v) if v.isdigit() else local_vars[v]
                       for v in m.group('args').split(',')]
            return functions[fname](argvals)
        raise ExtractorError(u'Unsupported JS expression %r' % expr)

    def extract_function(funcname):
        # Locate "function <name>(args){body}" and wrap it as a Python callable
        # that runs each ';'-separated statement via interpret_statement.
            r'function ' + re.escape(funcname) +
            r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
        argnames = func_m.group('args').split(',')

            local_vars = dict(zip(argnames, args))
            for stmt in func_m.group('code').split(';'):
                res = interpret_statement(stmt, local_vars)

    initial_function = extract_function(funcname)
    # The extracted function takes an argument list; adapt to a 1-arg callable.
    return lambda s: initial_function([s])
def _parse_sig_swf(self, file_contents):
    """Extract the 'decipher' routine from an SWF player by parsing the
    ABC (AVM2 / ActionScript 3 bytecode) block and interpreting just
    enough opcodes to run it in Python.

    Returns a callable mapping a scrambled signature string to the
    decrypted one.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    # A leading 'C' marks a zlib-compressed SWF body (after the 8-byte header).
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
        raise NotImplementedError(u'Unsupported compression format %r' %

    def extract_tags(content):
        # Walk the SWF tag stream: each tag has a 16-bit code/length header;
        # a short length of 0x3f means a 32-bit extended length follows.
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            tag_len = struct.unpack('<I', content[pos:pos+4])[0]
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])

        for tag_code, tag in extract_tags(content)
    # Skip flags + NUL-terminated name at the start of the DoABC tag body.
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, high bit = continue.
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)

    def u30(reader=None):
        # u30: same wire format, but the top nibble must be clear.
        res = read_int(reader)
        assert res & 0xf0000000 == 0

    def s32(reader=None):
        # Signed 32-bit variant: sign-extend from bit 31.
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)

    def read_string(reader=None):
        # Length-prefixed UTF-8 string.
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        resb = reader.read(count)
        assert len(resb) == count

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]

    # minor_version + major_version
    # Constant pools (note: ABC pools are 1-based, hence range(1, count)).
    for _c in range(1, int_count):
    for _c in range(1, uint_count):
    read_bytes((double_count-1) * 8)
    constant_strings = [u'']
    for _c in range(1, string_count):
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
    for _c in range(1, ns_set_count):
        for _c2 in range(count):
    multiname_count = u30()
    # Number of trailing u30 fields to skip per multiname kind.
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    for _c in range(1, multiname_count):
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
            u30()  # namespace_idx
            multinames.append(constant_strings[name_idx])
            # Kinds we don't resolve get a placeholder so indices still line up.
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):

    # method_info entries: only the NEED_ARGUMENTS / NEED_REST flags are kept.
    MethodInfo = collections.namedtuple(
        ['NEED_ARGUMENTS', 'NEED_REST'])
    for method_id in range(method_count):
        for _ in range(param_count):
        u30()  # name index (always 0 for youtube)
        if flags & 0x08 != 0:
            for c in range(option_count):
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata section (parsed only to advance the reader).
    metadata_count = u30()
    for _c in range(metadata_count):
        for _c2 in range(item_count):

    def parse_traits_info():
        # Parse a single trait, recording method traits as name -> method idx.
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # type_name_idx
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
        elif kind == 0x05:  # Function
            methods[function_idx] = multinames[trait_name_idx]
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

    # Locate the class that carries the signature-decipher code.
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    for class_id in range(class_count):
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        for _c2 in range(intrf_count):
        for _c2 in range(trait_count):
    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %

    # Collect the target class's method names and their method-info indices.
    for class_id in range(class_count):
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    for name, idx in trait_methods.items()))

    # Scripts section (parsed only to advance the reader).
    for _c in range(script_count):
        for _c2 in range(trait_count):

    # Method bodies: keep only the bytecode of the methods we care about.
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    for _c in range(method_body_count):
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code = read_bytes(code_length)
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
        for _c2 in range(trait_count):

    # Sanity checks: whole DoABC tag consumed, every wanted body found.
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    method_pyfunctions = {}

    def extract_function(func_name):
        # Lazily (and memoized) turn one AVM2 method into a Python callable
        # by interpreting its bytecode with an explicit operand stack.
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

            # Register 0 is 'this', then the call arguments, then locals.
            registers = ['(this)'] + list(args) + [None] * m.local_count
            coder = io.BytesIO(m.code)
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                elif opcode == 44:  # pushstring
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                elif opcode == 70:  # callproperty
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments are pushed left-to-right, so popping reverses them.
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        res = obj.split(args[0])
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                    elif mname in method_pyfunctions:
                        # Call into another method of the same class.
                        stack.append(method_pyfunctions[mname](args))
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                elif opcode == 72:  # returnvalue
                elif opcode == 79:  # callpropvoid
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    if mname == u'reverse':
                        # In-place list reversal; no value is pushed back.
                        assert isinstance(obj, list)
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                elif opcode == 93:  # findpropstrict
                    mname = multinames[index]
                    res = extract_function(mname)
                elif opcode == 97:  # setproperty
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                elif opcode == 98:  # getlocal
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    pname = multinames[index]
                    if pname == u'length':
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        assert isinstance(idx, int)
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc

    initial_function = extract_function(u'decipher')
    # The compiled function takes an argument list; adapt to a 1-arg callable.
    return lambda s: initial_function([s])
def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
    """Turn the encrypted s field into a working signature"""
    # Prefer automatic extraction from the actual player (memoized per
    # player URL in self._player_cache); on any failure fall back to the
    # hard-coded static algorithms below.
    if player_url is not None:
        if player_url not in self._player_cache:
            func = self._extract_signature_function(
                video_id, player_url, len(s)
            self._player_cache[player_url] = func
        func = self._player_cache[player_url]
        if self._downloader.params.get('youtube_print_sig_code'):
            self._print_sig_code(func, len(s))
        # Automatic extraction failed: warn (with traceback) and fall back.
        tb = traceback.format_exc()
        self._downloader.report_warning(
            u'Automatic signature extraction failed: ' + tb)
        self._downloader.report_warning(
            u'Warning: Falling back to static signature algorithm')
    return self._static_decrypt_signature(
        s, video_id, player_url, age_gate)
def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
    """Decrypt *s* with a hard-coded permutation chosen by len(s).

    NOTE(review): the selecting conditions are not visible in this copy;
    presumably each return below is guarded by a len(s) comparison —
    confirm against the full file.
    """
    # The videos with age protection use another player, so the
    # algorithms can be different.
        return s[2:63] + s[82] + s[64:82] + s[63]
        return s[86:29:-1] + s[88] + s[28:5:-1]
        return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
        return s[84:27:-1] + s[86] + s[26:5:-1]
        return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
        return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
        return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
        return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
        return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
        return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
        return s[81:36:-1] + s[0] + s[35:2:-1]
        return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
        return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
        return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
        return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
        return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]

    # No permutation known for this signature length.
    raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
def _decrypt_signature_age_gate(self, s):
    # The videos with age protection use another player, so the algorithms
    return s[2:63] + s[82] + s[64:82] + s[63]
    # Fallback to the other algorithms
    # NOTE(review): _decrypt_signature is defined above as
    # (self, s, video_id, player_url, age_gate=False) — this call passes
    # only `s`, which would raise TypeError if ever reached; confirm this
    # path is dead or fix the call site.
    return self._decrypt_signature(s)
def _get_available_subtitles(self, video_id):
    """Return a dict mapping subtitle language codes to timedtext URLs.

    A download failure or an empty list only produces a warning (the
    video itself can still be downloaded without subtitles).
    """
    sub_list = self._download_webpage(
        'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
        video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
    # Each track is advertised as name="..." ... lang_code="..." in the XML.
    lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        params = compat_urllib_parse.urlencode({
            # Requested format comes from the --sub-format option.
            'fmt': self._downloader.params.get('subtitlesformat'),
        url = u'http://www.youtube.com/api/timedtext?' + params
        sub_lang_list[lang] = url
    if not sub_lang_list:
        self._downloader.report_warning(u'video doesn\'t have subtitles')
    return sub_lang_list
def _get_available_automatic_caption(self, video_id, webpage):
    """We need the webpage for getting the captions url, pass it as an
    argument to speed up the process."""
    sub_format = self._downloader.params.get('subtitlesformat')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    # The caption endpoint and timestamp live in the embedded player config.
    mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
        self._downloader.report_warning(err_msg)
    player_config = json.loads(mobj.group(1))
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
        list_url = caption_url + '&' + list_params
        list_page = self._download_webpage(list_url, video_id)
        caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
        # Automatic captions are only available when the original track is
        # itself ASR (automatic speech recognition).
        original_lang_node = caption_list.find('track')
        if original_lang_node.attrib.get('kind') != 'asr' :
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
        original_lang = original_lang_node.attrib['lang_code']

        # Build one translated-caption URL per available target language.
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raise by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
def _print_formats(self, formats):
    """Print each itag with its extension, dimensions and any special tag
    (--list-formats output)."""
    print('Available formats:')
        print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                    self._video_dimensions.get(x, '???'),
                                    ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
    """Return the 11-character video ID parsed out of *url* using
    _VALID_URL (group 2 of the pattern); raises on non-matching URLs."""
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)
    video_id = mobj.group(2)
def _get_video_url_list(self, url_map):
    """
    Transform a dictionary in the format {itag:url} to a list of (itag, url)
    with the requested formats.
    """
    req_format = self._downloader.params.get('format', None)
    format_limit = self._downloader.params.get('format_limit', None)
    # Quality ordering depends on --prefer-free-formats.
    available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
    if format_limit is not None and format_limit in available_formats:
        # --max-quality: drop everything above the requested limit.
        format_list = available_formats[available_formats.index(format_limit):]
        format_list = available_formats
    existing_formats = [x for x in format_list if x in url_map]
    if len(existing_formats) == 0:
        raise ExtractorError(u'no known formats available for video')
    if self._downloader.params.get('listformats', None):
        self._print_formats(existing_formats)
    if req_format is None or req_format == 'best':
        video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
    elif req_format == 'worst':
        video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
    elif req_format in ('-1', 'all'):
        video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        # Specific formats. We pick the first in a slash-delimeted sequence.
        # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
        # available in the specified format. For example,
        # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
        # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
        # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
        req_formats = req_format.split('/')
        video_url_list = None
        for rf in req_formats:
                video_url_list = [(rf, url_map[rf])]
            # Container alias ('mp4', 'flv', ...): try its itags best-first.
            if rf in self._video_formats_map:
                for srf in self._video_formats_map[rf]:
                        video_url_list = [(srf, url_map[srf])]
        if video_url_list is None:
            raise ExtractorError(u'requested format not available')
    return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
    """Build an {itag: url} map from an HLS (m3u8) master manifest; the
    itag is parsed out of each variant URL's itag/<n>/ path segment."""
    def _get_urls(_manifest):
        # Keep only non-empty, non-comment (#...) lines: the variant URLs.
        lines = _manifest.split('\n')
        urls = filter(lambda l: l and not l.startswith('#'),
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    formats_urls = _get_urls(manifest)
    for format_url in formats_urls:
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
1271 def _real_extract(self, url):
1272 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1273 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1275 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1276 mobj = re.search(self._NEXT_URL_RE, url)
1278 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1279 video_id = self._extract_id(url)
1282 self.report_video_webpage_download(video_id)
1283 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1284 request = compat_urllib_request.Request(url)
1286 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1287 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1288 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1290 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1292 # Attempt to extract SWF player URL
1293 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1294 if mobj is not None:
1295 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1300 self.report_video_info_webpage_download(video_id)
1301 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1302 self.report_age_confirmation()
1304 # We simulate the access to the video from www.youtube.com/v/{video_id}
1305 # this can be viewed without login into Youtube
1306 data = compat_urllib_parse.urlencode({'video_id': video_id,
1310 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1314 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1315 video_info_webpage = self._download_webpage(video_info_url, video_id,
1317 errnote='unable to download video info webpage')
1318 video_info = compat_parse_qs(video_info_webpage)
1321 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1322 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1323 % (video_id, el_type))
1324 video_info_webpage = self._download_webpage(video_info_url, video_id,
1326 errnote='unable to download video info webpage')
1327 video_info = compat_parse_qs(video_info_webpage)
1328 if 'token' in video_info:
1330 if 'token' not in video_info:
1331 if 'reason' in video_info:
1332 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1334 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1336 # Check for "rental" videos
1337 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1338 raise ExtractorError(u'"rental" videos not supported')
1340 # Start extracting information
1341 self.report_information_extraction(video_id)
1344 if 'author' not in video_info:
1345 raise ExtractorError(u'Unable to extract uploader name')
1346 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1349 video_uploader_id = None
1350 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1351 if mobj is not None:
1352 video_uploader_id = mobj.group(1)
1354 self._downloader.report_warning(u'unable to extract uploader nickname')
1357 if 'title' not in video_info:
1358 raise ExtractorError(u'Unable to extract video title')
1359 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1362 # We try first to get a high quality image:
1363 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1364 video_webpage, re.DOTALL)
1365 if m_thumb is not None:
1366 video_thumbnail = m_thumb.group(1)
1367 elif 'thumbnail_url' not in video_info:
1368 self._downloader.report_warning(u'unable to extract video thumbnail')
1369 video_thumbnail = ''
1370 else: # don't panic if we can't find it
1371 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1375 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1376 if mobj is not None:
1377 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1378 upload_date = unified_strdate(upload_date)
1381 video_description = get_element_by_id("eow-description", video_webpage)
1382 if video_description:
1383 video_description = clean_html(video_description)
1385 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1387 video_description = unescapeHTML(fd_mobj.group(1))
1389 video_description = u''
1392 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1394 if self._downloader.params.get('listsubtitles', False):
1395 self._list_available_subtitles(video_id, video_webpage)
1398 if 'length_seconds' not in video_info:
1399 self._downloader.report_warning(u'unable to extract video duration')
1402 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1404 # Decide which formats to download
1407 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1409 raise ValueError('Could not find vevo ID')
1410 info = json.loads(mobj.group(1))
1412 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1413 # this signatures are encrypted
1414 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1416 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1417 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1418 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1420 if 'url_encoded_fmt_stream_map' in video_info:
1421 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1423 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1424 elif 'adaptive_fmts' in video_info:
1425 if 'url_encoded_fmt_stream_map' in video_info:
1426 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1428 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1432 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1433 self.report_rtmp_download()
1434 video_url_list = [(None, video_info['conn'][0])]
1435 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1436 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1437 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1439 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1440 url_data = compat_parse_qs(url_data_str)
1441 if 'itag' in url_data and 'url' in url_data:
1442 url = url_data['url'][0]
1443 if 'sig' in url_data:
1444 url += '&signature=' + url_data['sig'][0]
1445 elif 's' in url_data:
1446 encrypted_sig = url_data['s'][0]
1447 if self._downloader.params.get('verbose'):
1449 player_version = self._search_regex(
1451 player_url if player_url else None,
1452 'flash player', fatal=False)
1453 player_desc = 'flash player %s' % player_version
1455 player_version = self._search_regex(
1456 r'html5player-(.+?)\.js', video_webpage,
1457 'html5 player', fatal=False)
1458 player_desc = u'html5 player %s' % player_version
1460 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1461 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1462 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1465 jsplayer_url_json = self._search_regex(
1466 r'"assets":.+?"js":\s*("[^"]+")',
1467 video_webpage, u'JS player URL')
1468 player_url = json.loads(jsplayer_url_json)
1470 signature = self._decrypt_signature(
1471 encrypted_sig, video_id, player_url, age_gate)
1472 url += '&signature=' + signature
1473 if 'ratebypass' not in url:
1474 url += '&ratebypass=yes'
1475 url_map[url_data['itag'][0]] = url
1476 video_url_list = self._get_video_url_list(url_map)
1477 if not video_url_list:
1479 elif video_info.get('hlsvp'):
1480 manifest_url = video_info['hlsvp'][0]
1481 url_map = self._extract_from_m3u8(manifest_url, video_id)
1482 video_url_list = self._get_video_url_list(url_map)
1483 if not video_url_list:
1487 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1490 for format_param, video_real_url in video_url_list:
1492 video_extension = self._video_extensions.get(format_param, 'flv')
1494 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1495 self._video_dimensions.get(format_param, '???'),
1496 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1500 'url': video_real_url,
1501 'uploader': video_uploader,
1502 'uploader_id': video_uploader_id,
1503 'upload_date': upload_date,
1504 'title': video_title,
1505 'ext': video_extension,
1506 'format': video_format,
1507 'thumbnail': video_thumbnail,
1508 'description': video_description,
1509 'player_url': player_url,
1510 'subtitles': video_subtitles,
1511 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extracts all videos of a YouTube playlist via the GData API."""
    IE_DESC = u'YouTube.com playlists'
    # Verbose-mode regex (matched with re.VERBOSE below); the groups capture
    # the playlist id either from a full URL or as a bare (PL|EC|UU|FL)... id.
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData playlist feed: %s=playlist id, %i=page size, %i=1-based start index.
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, hence the re.VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): an 'if mobj is None:' guard looks intended before this
        # raise — confirm against the complete file.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        # Group 1 is set for URL-style matches, group 2 for bare playlist ids.
        playlist_id = mobj.group(1) or mobj.group(2)

        for page_num in itertools.count(1):
            # GData start-index is 1-based.
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The API rejects start indices of 1000 and above.
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                # yt$position is the 1-based position inside the playlist; the
                # collected (index, url) pairs are sorted on it further down.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Restore playlist order, then drop the position index.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extracts all videos listed on a YouTube channel page."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # HTML page of the channel's video list: %s=channel id, %s=page number.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present in a page when further pages can still be loaded.
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # JSON ajax endpoint for subsequent pages: %s=page number, %s=channel id.
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Collect the video ids linked from a channel page."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            # Deduplicate while preserving order of first appearance.
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an 'if mobj is None:' guard looks intended before this
        # raise — confirm against the complete file.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                # Ajax pages are JSON with the HTML embedded under 'content_html'.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the load-more widget offers no further pages.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extracts the uploads of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Maximum number of entries the GData uploads feed returns per request.
    _GDATA_PAGE_SIZE = 50
    # Uploads feed: %s=username, %d=page size, %d=1-based start index.
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; this regex is too permissive and would match them too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): an 'if mobj is None:' guard looks intended before this
        # raise — confirm against the complete file.
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        for pagenum in itertools.count(0):
            # GData start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The feed entry id ends with the video id after the last '/'.
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the "ytsearch" keyword, backed by the GData API."""
    IE_DESC = u'YouTube.com searches'
    # GData search feed: %s=quoted query, %i=1-based start index, 50 per page.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        # Fetch 50-result pages until the accumulated ids cover the limit.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            # jsonc responses wrap the payload in a top-level 'data' object.
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more results than the API reports to exist.
            limit = min(n, api_response['totalItems'])

        # Trim any overshoot from the last full page.
        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Turns a YouTube show page into the playlists of its seasons."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is published as a separate playlist.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            playlist_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(playlist_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feed pages are only available to a logged-in session.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Builds the ajax URL, leaving a single %s placeholder for paging.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        # NOTE(review): this return appears to belong to an IE_NAME property
        # whose 'def' line is not visible here — confirm against the full file.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        # NOTE(review): body not visible here (presumably performs login,
        # given _LOGIN_REQUIRED above) — confirm against the full file.

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            # Pull video ids out of the rendered feed HTML, deduplicated
            # while keeping their original order.
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the final page of the feed.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's subscriptions feed."""
    # Fixed the missing space before '(requires authentication)' so the
    # user-visible description matches the sibling feed extractors
    # (YoutubeRecommendedIE, YoutubeWatchLaterIE).
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in user's recommended videos."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the user's "Watch Later" list."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch Later is account-specific, so the personal-feed ajax action is used.
    _PERSONAL_FEED = True
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolves the logged-in user's favourites to their backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds the id of the underlying playlist;
        # delegate the actual extraction to the playlist extractor.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')