14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
37 class YoutubeBaseInfoExtractor(InfoExtractor):
38 """Provide base functions for Youtube extractors"""
39 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
40 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
41 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
42 _NETRC_MACHINE = 'youtube'
43 # If True it will raise an error if no login info is provided
44 _LOGIN_REQUIRED = False
46 def report_lang(self):
47 """Report attempt to set language."""
48 self.to_screen(u'Setting language')
50 def _set_language(self):
51 request = compat_urllib_request.Request(self._LANG_URL)
54 compat_urllib_request.urlopen(request).read()
55 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
61 (username, password) = self._get_login_info()
62 # No authentication to be performed
64 if self._LOGIN_REQUIRED:
65 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68 request = compat_urllib_request.Request(self._LOGIN_URL)
70 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
77 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
80 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
86 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
90 u'PersistentCookie': u'yes',
92 u'bgresponse': u'js_disabled',
93 u'checkConnection': u'',
94 u'checkedDomains': u'youtube',
100 u'signIn': u'Sign in',
102 u'service': u'youtube',
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
113 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
114 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115 self._downloader.report_warning(u'unable to log in: bad username or password')
117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
122 def _confirm_age(self):
125 'action_confirm': 'Confirm',
127 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
129 self.report_age_confirmation()
130 compat_urllib_request.urlopen(request).read().decode('utf-8')
131 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
135 def _real_initialize(self):
136 if self._downloader is None:
138 if not self._set_language():
140 if not self._login():
145 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
146 IE_DESC = u'YouTube.com'
149 (?:https?://)? # http(s):// (optional)
150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
163 |youtu\.be/ # just youtu.be/xxxx
165 )? # all until now is optional -> you can pass the naked ID
166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
167 (?(1).+)? # if we found the ID, everything can follow
169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
170 # Listed in order of quality
171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
172 # Apple HTTP Live Streaming
173 '96', '95', '94', '93', '92', '132', '151',
175 '85', '84', '102', '83', '101', '82', '100',
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
180 '141', '172', '140', '171', '139',
182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
183 # Apple HTTP Live Streaming
184 '96', '95', '94', '93', '92', '132', '151',
186 '85', '102', '84', '101', '83', '100', '82',
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
191 '172', '141', '171', '140', '139',
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
199 _video_extensions = {
221 # Apple HTTP Live Streaming
253 _video_dimensions = {
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
364 u"description": u"md5:bdac09887d209a4ed54b8f76b2bdaa8b",
365 u"uploader": u"Icona Pop",
366 u"uploader_id": u"IconaPop"
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
393 u'skip_download': True,
400 def suitable(cls, url):
401 """Receives a URL and returns True if suitable for this IE."""
402 if YoutubePlaylistIE.suitable(url): return False
403 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
405 def __init__(self, *args, **kwargs):
406 super(YoutubeIE, self).__init__(*args, **kwargs)
407 self._player_cache = {}
409 def report_video_webpage_download(self, video_id):
410 """Report attempt to download video webpage."""
411 self.to_screen(u'%s: Downloading video webpage' % video_id)
413 def report_video_info_webpage_download(self, video_id):
414 """Report attempt to download video info webpage."""
415 self.to_screen(u'%s: Downloading video info webpage' % video_id)
417 def report_information_extraction(self, video_id):
418 """Report attempt to extract video information."""
419 self.to_screen(u'%s: Extracting video information' % video_id)
421 def report_unavailable_format(self, video_id, format):
422 """Report extracted video URL."""
423 self.to_screen(u'%s: Format %s not available' % (video_id, format))
425 def report_rtmp_download(self):
426 """Indicate the download will use the RTMP protocol."""
427 self.to_screen(u'RTMP download detected')
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
441 cache_enabled = cache_dir is not None
443 cache_fn = os.path.join(os.path.expanduser(cache_dir),
447 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
448 cache_spec = json.load(cachef)
449 return lambda s: u''.join(s[i] for i in cache_spec)
451 pass # No cache available
453 if player_type == 'js':
454 code = self._download_webpage(
455 player_url, video_id,
456 note=u'Downloading %s player %s' % (player_type, player_id),
457 errnote=u'Download of %s failed' % player_url)
458 res = self._parse_sig_js(code)
459 elif player_type == 'swf':
460 urlh = self._request_webpage(
461 player_url, video_id,
462 note=u'Downloading %s player %s' % (player_type, player_id),
463 errnote=u'Download of %s failed' % player_url)
465 res = self._parse_sig_swf(code)
467 assert False, 'Invalid player type %r' % player_type
471 test_string = u''.join(map(compat_chr, range(slen)))
472 cache_res = res(test_string)
473 cache_spec = [ord(c) for c in cache_res]
475 os.makedirs(os.path.dirname(cache_fn))
476 except OSError as ose:
477 if ose.errno != errno.EEXIST:
479 write_json_file(cache_spec, cache_fn)
481 tb = traceback.format_exc()
482 self._downloader.report_warning(
483 u'Writing cache to %r failed: %s' % (cache_fn, tb))
487 def _print_sig_code(self, func, slen):
488 def gen_sig_code(idxs):
489 def _genslice(start, end, step):
490 starts = u'' if start == 0 else str(start)
491 ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
492 steps = u'' if step == 1 else (u':%d' % step)
493 return u's[%s%s%s]' % (starts, ends, steps)
496 start = '(Never used)' # Quelch pyflakes warnings - start will be
497 # set as soon as step is set
498 for i, prev in zip(idxs[1:], idxs[:-1]):
502 yield _genslice(start, prev, step)
505 if i - prev in [-1, 1]:
510 yield u's[%d]' % prev
514 yield _genslice(start, i, step)
516 test_string = u''.join(map(compat_chr, range(slen)))
517 cache_res = func(test_string)
518 cache_spec = [ord(c) for c in cache_res]
519 expr_code = u' + '.join(gen_sig_code(cache_spec))
520 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
521 self.to_screen(u'Extracted signature function:\n' + code)
523 def _parse_sig_js(self, jscode):
524 funcname = self._search_regex(
525 r'signature=([a-zA-Z]+)', jscode,
526 u'Initial JS player signature function name')
531 return string.lowercase.index(varname)
533 def interpret_statement(stmt, local_vars, allow_recursion=20):
534 if allow_recursion < 0:
535 raise ExtractorError(u'Recursion limit reached')
537 if stmt.startswith(u'var '):
538 stmt = stmt[len(u'var '):]
539 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
540 r'=(?P<expr>.*)$', stmt)
542 if ass_m.groupdict().get('index'):
544 lvar = local_vars[ass_m.group('out')]
545 idx = interpret_expression(ass_m.group('index'),
546 local_vars, allow_recursion)
547 assert isinstance(idx, int)
550 expr = ass_m.group('expr')
553 local_vars[ass_m.group('out')] = val
555 expr = ass_m.group('expr')
556 elif stmt.startswith(u'return '):
558 expr = stmt[len(u'return '):]
560 raise ExtractorError(
561 u'Cannot determine left side of statement in %r' % stmt)
563 v = interpret_expression(expr, local_vars, allow_recursion)
566 def interpret_expression(expr, local_vars, allow_recursion):
571 return local_vars[expr]
573 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
575 member = m.group('member')
576 val = local_vars[m.group('in')]
577 if member == 'split("")':
579 if member == 'join("")':
581 if member == 'length':
583 if member == 'reverse()':
585 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
587 idx = interpret_expression(
588 slice_m.group('idx'), local_vars, allow_recursion-1)
592 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
594 val = local_vars[m.group('in')]
595 idx = interpret_expression(m.group('idx'), local_vars,
599 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
601 a = interpret_expression(m.group('a'),
602 local_vars, allow_recursion)
603 b = interpret_expression(m.group('b'),
604 local_vars, allow_recursion)
608 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
610 fname = m.group('func')
611 if fname not in functions:
612 functions[fname] = extract_function(fname)
613 argvals = [int(v) if v.isdigit() else local_vars[v]
614 for v in m.group('args').split(',')]
615 return functions[fname](argvals)
616 raise ExtractorError(u'Unsupported JS expression %r' % expr)
618 def extract_function(funcname):
620 r'function ' + re.escape(funcname) +
621 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
623 argnames = func_m.group('args').split(',')
626 local_vars = dict(zip(argnames, args))
627 for stmt in func_m.group('code').split(';'):
628 res = interpret_statement(stmt, local_vars)
632 initial_function = extract_function(funcname)
633 return lambda s: initial_function([s])
635 def _parse_sig_swf(self, file_contents):
636 if file_contents[1:3] != b'WS':
637 raise ExtractorError(
638 u'Not an SWF file; header is %r' % file_contents[:3])
639 if file_contents[:1] == b'C':
640 content = zlib.decompress(file_contents[8:])
642 raise NotImplementedError(u'Unsupported compression format %r' %
645 def extract_tags(content):
647 while pos < len(content):
648 header16 = struct.unpack('<H', content[pos:pos+2])[0]
650 tag_code = header16 >> 6
651 tag_len = header16 & 0x3f
653 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
655 assert pos+tag_len <= len(content)
656 yield (tag_code, content[pos:pos+tag_len])
660 for tag_code, tag in extract_tags(content)
662 p = code_tag.index(b'\0', 4) + 1
663 code_reader = io.BytesIO(code_tag[p:])
665 # Parse ABC (AVM2 ByteCode)
666 def read_int(reader=None):
674 b = struct.unpack('<B', buf)[0]
675 res = res | ((b & 0x7f) << shift)
681 def u30(reader=None):
682 res = read_int(reader)
683 assert res & 0xf0000000 == 0
687 def s32(reader=None):
689 if v & 0x80000000 != 0:
690 v = - ((v ^ 0xffffffff) + 1)
693 def read_string(reader=None):
697 resb = reader.read(slen)
698 assert len(resb) == slen
699 return resb.decode('utf-8')
701 def read_bytes(count, reader=None):
704 resb = reader.read(count)
705 assert len(resb) == count
708 def read_byte(reader=None):
709 resb = read_bytes(1, reader=reader)
710 res = struct.unpack('<B', resb)[0]
713 # minor_version + major_version
718 for _c in range(1, int_count):
721 for _c in range(1, uint_count):
724 read_bytes((double_count-1) * 8)
726 constant_strings = [u'']
727 for _c in range(1, string_count):
729 constant_strings.append(s)
730 namespace_count = u30()
731 for _c in range(1, namespace_count):
735 for _c in range(1, ns_set_count):
737 for _c2 in range(count):
739 multiname_count = u30()
748 0x0e: 2, # MultinameA
749 0x1b: 1, # MultinameL
750 0x1c: 1, # MultinameLA
753 for _c in range(1, multiname_count):
755 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
757 u30() # namespace_idx
759 multinames.append(constant_strings[name_idx])
761 multinames.append('[MULTINAME kind: %d]' % kind)
762 for _c2 in range(MULTINAME_SIZES[kind]):
767 MethodInfo = collections.namedtuple(
769 ['NEED_ARGUMENTS', 'NEED_REST'])
771 for method_id in range(method_count):
774 for _ in range(param_count):
776 u30() # name index (always 0 for youtube)
778 if flags & 0x08 != 0:
781 for c in range(option_count):
784 if flags & 0x80 != 0:
785 # Param names present
786 for _ in range(param_count):
788 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
789 method_infos.append(mi)
792 metadata_count = u30()
793 for _c in range(metadata_count):
796 for _c2 in range(item_count):
800 def parse_traits_info():
801 trait_name_idx = u30()
802 kind_full = read_byte()
803 kind = kind_full & 0x0f
804 attrs = kind_full >> 4
806 if kind in [0x00, 0x06]: # Slot or Const
808 u30() # type_name_idx
812 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
815 methods[multinames[trait_name_idx]] = method_idx
816 elif kind == 0x04: # Class
819 elif kind == 0x05: # Function
822 methods[function_idx] = multinames[trait_name_idx]
824 raise ExtractorError(u'Unsupported trait kind %d' % kind)
826 if attrs & 0x4 != 0: # Metadata present
827 metadata_count = u30()
828 for _c3 in range(metadata_count):
829 u30() # metadata index
834 TARGET_CLASSNAME = u'SignatureDecipher'
835 searched_idx = multinames.index(TARGET_CLASSNAME)
836 searched_class_id = None
838 for class_id in range(class_count):
840 if name_idx == searched_idx:
841 # We found the class we're looking for!
842 searched_class_id = class_id
843 u30() # super_name idx
845 if flags & 0x08 != 0: # Protected namespace is present
846 u30() # protected_ns_idx
848 for _c2 in range(intrf_count):
852 for _c2 in range(trait_count):
855 if searched_class_id is None:
856 raise ExtractorError(u'Target class %r not found' %
861 for class_id in range(class_count):
864 for _c2 in range(trait_count):
865 trait_methods = parse_traits_info()
866 if class_id == searched_class_id:
867 method_names.update(trait_methods.items())
868 method_idxs.update(dict(
870 for name, idx in trait_methods.items()))
874 for _c in range(script_count):
877 for _c2 in range(trait_count):
881 method_body_count = u30()
882 Method = collections.namedtuple('Method', ['code', 'local_count'])
884 for _c in range(method_body_count):
888 u30() # init_scope_depth
889 u30() # max_scope_depth
891 code = read_bytes(code_length)
892 if method_idx in method_idxs:
893 m = Method(code, local_count)
894 methods[method_idxs[method_idx]] = m
895 exception_count = u30()
896 for _c2 in range(exception_count):
903 for _c2 in range(trait_count):
906 assert p + code_reader.tell() == len(code_tag)
907 assert len(methods) == len(method_idxs)
909 method_pyfunctions = {}
911 def extract_function(func_name):
912 if func_name in method_pyfunctions:
913 return method_pyfunctions[func_name]
914 if func_name not in methods:
915 raise ExtractorError(u'Cannot find function %r' % func_name)
916 m = methods[func_name]
919 registers = ['(this)'] + list(args) + [None] * m.local_count
921 coder = io.BytesIO(m.code)
923 opcode = struct.unpack('!B', coder.read(1))[0]
924 if opcode == 36: # pushbyte
925 v = struct.unpack('!B', coder.read(1))[0]
927 elif opcode == 44: # pushstring
929 stack.append(constant_strings[idx])
930 elif opcode == 48: # pushscope
931 # We don't implement the scope register, so we'll just
932 # ignore the popped value
934 elif opcode == 70: # callproperty
936 mname = multinames[index]
937 arg_count = u30(coder)
938 args = list(reversed(
939 [stack.pop() for _ in range(arg_count)]))
941 if mname == u'split':
942 assert len(args) == 1
943 assert isinstance(args[0], compat_str)
944 assert isinstance(obj, compat_str)
948 res = obj.split(args[0])
950 elif mname == u'slice':
951 assert len(args) == 1
952 assert isinstance(args[0], int)
953 assert isinstance(obj, list)
956 elif mname == u'join':
957 assert len(args) == 1
958 assert isinstance(args[0], compat_str)
959 assert isinstance(obj, list)
960 res = args[0].join(obj)
962 elif mname in method_pyfunctions:
963 stack.append(method_pyfunctions[mname](args))
965 raise NotImplementedError(
966 u'Unsupported property %r on %r'
968 elif opcode == 72: # returnvalue
971 elif opcode == 79: # callpropvoid
973 mname = multinames[index]
974 arg_count = u30(coder)
975 args = list(reversed(
976 [stack.pop() for _ in range(arg_count)]))
978 if mname == u'reverse':
979 assert isinstance(obj, list)
982 raise NotImplementedError(
983 u'Unsupported (void) property %r on %r'
985 elif opcode == 93: # findpropstrict
987 mname = multinames[index]
988 res = extract_function(mname)
990 elif opcode == 97: # setproperty
995 assert isinstance(obj, list)
996 assert isinstance(idx, int)
998 elif opcode == 98: # getlocal
1000 stack.append(registers[index])
1001 elif opcode == 99: # setlocal
1004 registers[index] = value
1005 elif opcode == 102: # getproperty
1007 pname = multinames[index]
1008 if pname == u'length':
1010 assert isinstance(obj, list)
1011 stack.append(len(obj))
1012 else: # Assume attribute access
1014 assert isinstance(idx, int)
1016 assert isinstance(obj, list)
1017 stack.append(obj[idx])
1018 elif opcode == 128: # coerce
1020 elif opcode == 133: # coerce_s
1021 assert isinstance(stack[-1], (type(None), compat_str))
1022 elif opcode == 164: # modulo
1023 value2 = stack.pop()
1024 value1 = stack.pop()
1025 res = value1 % value2
1027 elif opcode == 208: # getlocal_0
1028 stack.append(registers[0])
1029 elif opcode == 209: # getlocal_1
1030 stack.append(registers[1])
1031 elif opcode == 210: # getlocal_2
1032 stack.append(registers[2])
1033 elif opcode == 211: # getlocal_3
1034 stack.append(registers[3])
1035 elif opcode == 214: # setlocal_2
1036 registers[2] = stack.pop()
1037 elif opcode == 215: # setlocal_3
1038 registers[3] = stack.pop()
1040 raise NotImplementedError(
1041 u'Unsupported opcode %d' % opcode)
1043 method_pyfunctions[func_name] = resfunc
1046 initial_function = extract_function(u'decipher')
1047 return lambda s: initial_function([s])
1049 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1050 """Turn the encrypted s field into a working signature"""
1052 if player_url is not None:
1054 if player_url not in self._player_cache:
1055 func = self._extract_signature_function(
1056 video_id, player_url, len(s)
1058 self._player_cache[player_url] = func
1059 func = self._player_cache[player_url]
1060 if self._downloader.params.get('youtube_print_sig_code'):
1061 self._print_sig_code(func, len(s))
1064 tb = traceback.format_exc()
1065 self._downloader.report_warning(
1066 u'Automatic signature extraction failed: ' + tb)
1068 self._downloader.report_warning(
1069 u'Warning: Falling back to static signature algorithm')
1071 return self._static_decrypt_signature(
1072 s, video_id, player_url, age_gate)
1074 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1076 # The videos with age protection use another player, so the
1077 # algorithms can be different.
1079 return s[2:63] + s[82] + s[64:82] + s[63]
1082 return s[86:29:-1] + s[88] + s[28:5:-1]
1084 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1086 return s[84:27:-1] + s[86] + s[26:5:-1]
1088 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1090 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1092 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1094 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1096 return s[80:72:-1] + s[16] + s[71:39:-1] + s[72] + s[38:16:-1] + s[82] + s[15::-1]
1098 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1100 return s[78:70:-1] + s[14] + s[69:37:-1] + s[70] + s[36:14:-1] + s[80] + s[:14][::-1]
1102 return s[80:63:-1] + s[0] + s[62:0:-1] + s[63]
1104 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1106 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1108 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1110 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1113 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1115 def _get_available_subtitles(self, video_id):
1117 sub_list = self._download_webpage(
1118 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1119 video_id, note=False)
1120 except ExtractorError as err:
1121 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1123 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1128 params = compat_urllib_parse.urlencode({
1131 'fmt': self._downloader.params.get('subtitlesformat'),
1133 url = u'http://www.youtube.com/api/timedtext?' + params
1134 sub_lang_list[lang] = url
1135 if not sub_lang_list:
1136 self._downloader.report_warning(u'video doesn\'t have subtitles')
1138 return sub_lang_list
1140 def _get_available_automatic_caption(self, video_id, webpage):
1141 """We need the webpage for getting the captions url, pass it as an
1142 argument to speed up the process."""
1143 sub_format = self._downloader.params.get('subtitlesformat')
1144 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1145 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1146 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1148 self._downloader.report_warning(err_msg)
1150 player_config = json.loads(mobj.group(1))
1152 args = player_config[u'args']
1153 caption_url = args[u'ttsurl']
1154 timestamp = args[u'timestamp']
1155 # We get the available subtitles
1156 list_params = compat_urllib_parse.urlencode({
1161 list_url = caption_url + '&' + list_params
1162 list_page = self._download_webpage(list_url, video_id)
1163 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1164 original_lang_node = caption_list.find('track')
1165 if original_lang_node.attrib.get('kind') != 'asr' :
1166 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1168 original_lang = original_lang_node.attrib['lang_code']
1171 for lang_node in caption_list.findall('target'):
1172 sub_lang = lang_node.attrib['lang_code']
1173 params = compat_urllib_parse.urlencode({
1174 'lang': original_lang,
1180 sub_lang_list[sub_lang] = caption_url + '&' + params
1181 return sub_lang_list
1182 # An extractor error can be raise by the download process if there are
1183 # no automatic captions but there are subtitles
1184 except (KeyError, ExtractorError):
1185 self._downloader.report_warning(err_msg)
1188 def _print_formats(self, formats):
1189 print('Available formats:')
1191 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1192 self._video_dimensions.get(x, '???'),
1193 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1195 def _extract_id(self, url):
1196 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1198 raise ExtractorError(u'Invalid URL: %s' % url)
1199 video_id = mobj.group(2)
1202 def _get_video_url_list(self, url_map):
1204 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1205 with the requested formats.
1207 req_format = self._downloader.params.get('format', None)
1208 format_limit = self._downloader.params.get('format_limit', None)
1209 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1210 if format_limit is not None and format_limit in available_formats:
1211 format_list = available_formats[available_formats.index(format_limit):]
1213 format_list = available_formats
1214 existing_formats = [x for x in format_list if x in url_map]
1215 if len(existing_formats) == 0:
1216 raise ExtractorError(u'no known formats available for video')
1217 if self._downloader.params.get('listformats', None):
1218 self._print_formats(existing_formats)
1220 if req_format is None or req_format == 'best':
1221 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1222 elif req_format == 'worst':
1223 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1224 elif req_format in ('-1', 'all'):
1225 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1227 # Specific formats. We pick the first in a slash-delimeted sequence.
1228 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1229 # available in the specified format. For example,
1230 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1231 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1232 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1233 req_formats = req_format.split('/')
1234 video_url_list = None
1235 for rf in req_formats:
1237 video_url_list = [(rf, url_map[rf])]
1239 if rf in self._video_formats_map:
1240 for srf in self._video_formats_map[rf]:
1242 video_url_list = [(srf, url_map[srf])]
1247 if video_url_list is None:
1248 raise ExtractorError(u'requested format not available')
1249 return video_url_list
1251 def _extract_from_m3u8(self, manifest_url, video_id):
1253 def _get_urls(_manifest):
1254 lines = _manifest.split('\n')
1255 urls = filter(lambda l: l and not l.startswith('#'),
1258 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1259 formats_urls = _get_urls(manifest)
1260 for format_url in formats_urls:
1261 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1262 url_map[itag] = format_url
1265 def _real_extract(self, url):
1266 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1267 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1269 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1270 mobj = re.search(self._NEXT_URL_RE, url)
1272 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1273 video_id = self._extract_id(url)
1276 self.report_video_webpage_download(video_id)
1277 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1278 request = compat_urllib_request.Request(url)
1280 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1281 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1282 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1284 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1286 # Attempt to extract SWF player URL
1287 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1288 if mobj is not None:
1289 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
1294 self.report_video_info_webpage_download(video_id)
1295 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1296 self.report_age_confirmation()
1298 # We simulate the access to the video from www.youtube.com/v/{video_id}
1299 # this can be viewed without login into Youtube
1300 data = compat_urllib_parse.urlencode({'video_id': video_id,
1304 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1308 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1309 video_info_webpage = self._download_webpage(video_info_url, video_id,
1311 errnote='unable to download video info webpage')
1312 video_info = compat_parse_qs(video_info_webpage)
1315 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1316 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1317 % (video_id, el_type))
1318 video_info_webpage = self._download_webpage(video_info_url, video_id,
1320 errnote='unable to download video info webpage')
1321 video_info = compat_parse_qs(video_info_webpage)
1322 if 'token' in video_info:
1324 if 'token' not in video_info:
1325 if 'reason' in video_info:
1326 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1328 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1330 # Check for "rental" videos
1331 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1332 raise ExtractorError(u'"rental" videos not supported')
1334 # Start extracting information
1335 self.report_information_extraction(video_id)
1338 if 'author' not in video_info:
1339 raise ExtractorError(u'Unable to extract uploader name')
1340 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1343 video_uploader_id = None
1344 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1345 if mobj is not None:
1346 video_uploader_id = mobj.group(1)
1348 self._downloader.report_warning(u'unable to extract uploader nickname')
1351 if 'title' not in video_info:
1352 raise ExtractorError(u'Unable to extract video title')
1353 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
1356 # We try first to get a high quality image:
1357 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1358 video_webpage, re.DOTALL)
1359 if m_thumb is not None:
1360 video_thumbnail = m_thumb.group(1)
1361 elif 'thumbnail_url' not in video_info:
1362 self._downloader.report_warning(u'unable to extract video thumbnail')
1363 video_thumbnail = ''
1364 else: # don't panic if we can't find it
1365 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
1369 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1370 if mobj is not None:
1371 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1372 upload_date = unified_strdate(upload_date)
1375 video_description = get_element_by_id("eow-description", video_webpage)
1376 if video_description:
1377 video_description = clean_html(video_description)
1379 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1381 video_description = unescapeHTML(fd_mobj.group(1))
1383 video_description = u''
1386 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1388 if self._downloader.params.get('listsubtitles', False):
1389 self._list_available_subtitles(video_id, video_webpage)
1392 if 'length_seconds' not in video_info:
1393 self._downloader.report_warning(u'unable to extract video duration')
1396 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1398 # Decide which formats to download
1401 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1403 raise ValueError('Could not find vevo ID')
1404 info = json.loads(mobj.group(1))
1406 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1407 # this signatures are encrypted
1408 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1410 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1411 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1412 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1414 if 'url_encoded_fmt_stream_map' in video_info:
1415 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1417 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1418 elif 'adaptive_fmts' in video_info:
1419 if 'url_encoded_fmt_stream_map' in video_info:
1420 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1422 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1426 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1427 self.report_rtmp_download()
1428 video_url_list = [(None, video_info['conn'][0])]
1429 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1430 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1431 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1433 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1434 url_data = compat_parse_qs(url_data_str)
1435 if 'itag' in url_data and 'url' in url_data:
1436 url = url_data['url'][0]
1437 if 'sig' in url_data:
1438 url += '&signature=' + url_data['sig'][0]
1439 elif 's' in url_data:
1440 encrypted_sig = url_data['s'][0]
1441 if self._downloader.params.get('verbose'):
1443 if player_url is None:
1444 player_version = 'unknown'
1446 player_version = self._search_regex(
1447 r'-(.+)\.swf$', player_url,
1448 u'flash player', fatal=False)
1449 player_desc = 'flash player %s' % player_version
1451 player_version = self._search_regex(
1452 r'html5player-(.+?)\.js', video_webpage,
1453 'html5 player', fatal=False)
1454 player_desc = u'html5 player %s' % player_version
1456 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1457 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1458 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1461 jsplayer_url_json = self._search_regex(
1462 r'"assets":.+?"js":\s*("[^"]+")',
1463 video_webpage, u'JS player URL')
1464 player_url = json.loads(jsplayer_url_json)
1466 signature = self._decrypt_signature(
1467 encrypted_sig, video_id, player_url, age_gate)
1468 url += '&signature=' + signature
1469 if 'ratebypass' not in url:
1470 url += '&ratebypass=yes'
1471 url_map[url_data['itag'][0]] = url
1472 video_url_list = self._get_video_url_list(url_map)
1473 if not video_url_list:
1475 elif video_info.get('hlsvp'):
1476 manifest_url = video_info['hlsvp'][0]
1477 url_map = self._extract_from_m3u8(manifest_url, video_id)
1478 video_url_list = self._get_video_url_list(url_map)
1479 if not video_url_list:
1483 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1486 for format_param, video_real_url in video_url_list:
1488 video_extension = self._video_extensions.get(format_param, 'flv')
1490 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1491 self._video_dimensions.get(format_param, '???'),
1492 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1496 'url': video_real_url,
1497 'uploader': video_uploader,
1498 'uploader_id': video_uploader_id,
1499 'upload_date': upload_date,
1500 'title': video_title,
1501 'ext': video_extension,
1502 'format': video_format,
1503 'thumbnail': video_thumbnail,
1504 'description': video_description,
1505 'player_url': player_url,
1506 'subtitles': video_subtitles,
1507 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all videos of a YouTube playlist via the gdata v2 API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |  p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so the default suitable()
        # (which matches without re.VERBOSE) cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # The URL has already been vetted by suitable(); extract the id.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # A bare playlist id matches the second group instead of the first.
        playlist_id = mobj.group(1) or mobj.group(2)

        # Collect (position, watch-url) pairs page by page from the API.
        collected = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The gdata API refuses start indices beyond this point.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                position = entry['yt$position']['$t']
                media = entry.get('media$group')
                if media is None or 'yt$videoid' not in media:
                    continue
                collected.append(
                    (position,
                     'https://www.youtube.com/watch?v=' + media['yt$videoid']['$t']))

        # Order by playlist position, then keep only the URLs.
        ordered_urls = [pair[1] for pair in sorted(collected)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in ordered_urls]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel.

    The first page is scraped from the regular channel HTML; subsequent
    pages come from the JSON c4_browse_ajax endpoint.
    """
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the video ids found in *page*, deduplicated, in order."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # The widget HTML tells us whether a further page exists.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Renamed loop variable: the original shadowed the builtin `id`.
        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract all uploads of a YouTube user via the gdata API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; the regex is too permissive and it would match.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        else:
            return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = []
            for entry in response['feed']['entry']:
                # The id field looks like ".../videos/<video_id>".
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title=username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search YouTube through the gdata API ("ytsearch" keyword)."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # Pages hold 50 results each; keep going until the requested count
        # or the API-reported total is exhausted.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # Idiom fix: was `if not 'items' in api_response`.
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # Idiom fix: list comprehension instead of list(genexpr).
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # Renamed loop variable: the original shadowed the builtin `id`.
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Resolve a YouTube show page into one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Each season of the show is exposed as its own playlist link.
        season_matches = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_matches)))
        results = []
        for season in season_matches:
            season_url = 'https://www.youtube.com' + season.group(1)
            results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return results
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # Number of feed entries fetched per ajax request.
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template with a single %s placeholder for the paging offset."""
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in m_ids)
            # Renamed loop variable: the original shadowed the builtin `id`.
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            # A null 'paging' field marks the last page of the feed.
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommendations."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's watch-later list."""
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    # Watch-later is tied to the account, so the personal feed action is used.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extract the authenticated user's favourites as a playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites feed is an ordinary playlist whose id is embedded
        # in the logged-in user's "my_favorites" page; delegate to the
        # playlist extractor once we have it.
        favourites_page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        playlist_id = self._search_regex(r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(playlist_id, 'YoutubePlaylist')