14 import xml.etree.ElementTree
17 from .common import InfoExtractor, SearchInfoExtractor
18 from .subtitles import SubtitlesInfoExtractor
25 compat_urllib_request,
37 class YoutubeBaseInfoExtractor(InfoExtractor):
38 """Provide base functions for Youtube extractors"""
39 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
40 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
41 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
42 _NETRC_MACHINE = 'youtube'
43 # If True it will raise an error if no login info is provided
44 _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce on screen that the extractor is about to set the language."""
    message = u'Setting language'
    self.to_screen(message)
50 def _set_language(self):
51 request = compat_urllib_request.Request(self._LANG_URL)
54 compat_urllib_request.urlopen(request).read()
55 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
56 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
61 (username, password) = self._get_login_info()
62 # No authentication to be performed
64 if self._LOGIN_REQUIRED:
65 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
68 request = compat_urllib_request.Request(self._LOGIN_URL)
70 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
71 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
72 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
77 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
80 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
86 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
90 u'PersistentCookie': u'yes',
92 u'bgresponse': u'js_disabled',
93 u'checkConnection': u'',
94 u'checkedDomains': u'youtube',
100 u'signIn': u'Sign in',
102 u'service': u'youtube',
106 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
108 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
109 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
110 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
113 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
114 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
115 self._downloader.report_warning(u'unable to log in: bad username or password')
117 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
118 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
122 def _confirm_age(self):
125 'action_confirm': 'Confirm',
127 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
129 self.report_age_confirmation()
130 compat_urllib_request.urlopen(request).read().decode('utf-8')
131 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
132 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
135 def _real_initialize(self):
136 if self._downloader is None:
138 if not self._set_language():
140 if not self._login():
145 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
146 IE_DESC = u'YouTube.com'
149 (?:https?://)? # http(s):// (optional)
150 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
151 tube\.majestyc\.net/|
152 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
153 (?:.*?\#/)? # handle anchor (#/) redirect urls
154 (?: # the various things that can precede the ID:
155 (?:(?:v|embed|e)/) # v/ or embed/ or e/
156 |(?: # or the v= param in all its forms
157 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
158 (?:\?|\#!?) # the params delimiter ? or # or #!
159 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
163 |youtu\.be/ # just youtu.be/xxxx
165 )? # all until now is optional -> you can pass the naked ID
166 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
167 (?(1).+)? # if we found the ID, everything can follow
169 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
170 # Listed in order of quality
171 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
172 # Apple HTTP Live Streaming
173 '96', '95', '94', '93', '92', '132', '151',
175 '85', '84', '102', '83', '101', '82', '100',
177 '138', '137', '248', '136', '247', '135', '246',
178 '245', '244', '134', '243', '133', '242', '160',
180 '141', '172', '140', '171', '139',
182 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
183 # Apple HTTP Live Streaming
184 '96', '95', '94', '93', '92', '132', '151',
186 '85', '102', '84', '101', '83', '100', '82',
188 '138', '248', '137', '247', '136', '246', '245',
189 '244', '135', '243', '134', '242', '133', '160',
191 '172', '141', '171', '140', '139',
193 _video_formats_map = {
194 'flv': ['35', '34', '6', '5'],
195 '3gp': ['36', '17', '13'],
196 'mp4': ['38', '37', '22', '18'],
197 'webm': ['46', '45', '44', '43'],
199 _video_extensions = {
221 # Apple HTTP Live Streaming
253 _video_dimensions = {
335 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
336 u"file": u"BaW_jenozKc.mp4",
338 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
339 u"uploader": u"Philipp Hagemeister",
340 u"uploader_id": u"phihag",
341 u"upload_date": u"20121002",
342 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
346 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
347 u"file": u"1ltcDfZMA3U.flv",
348 u"note": u"Test VEVO video (#897)",
350 u"upload_date": u"20070518",
351 u"title": u"Maps - It Will Find You",
352 u"description": u"Music video by Maps performing It Will Find You.",
353 u"uploader": u"MuteUSA",
354 u"uploader_id": u"MuteUSA"
358 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
359 u"file": u"UxxajLWwzqY.mp4",
360 u"note": u"Test generic use_cipher_signature video (#897)",
362 u"upload_date": u"20120506",
363 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
364 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
365 u"uploader": u"Icona Pop",
366 u"uploader_id": u"IconaPop"
370 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
371 u"file": u"07FYdnEawAQ.mp4",
372 u"note": u"Test VEVO video with age protection (#956)",
374 u"upload_date": u"20130703",
375 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
376 u"description": u"md5:64249768eec3bc4276236606ea996373",
377 u"uploader": u"justintimberlakeVEVO",
378 u"uploader_id": u"justintimberlakeVEVO"
382 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
383 u'file': u'TGi3HqYrWHE.mp4',
384 u'note': u'm3u8 video',
386 u'title': u'Triathlon - Men - London 2012 Olympic Games',
387 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
388 u'uploader': u'olympic',
389 u'upload_date': u'20120807',
390 u'uploader_id': u'olympic',
393 u'skip_download': True,
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs are handled by YoutubePlaylistIE, so defer to it first.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def __init__(self, *args, **kwargs):
    """Initialize the extractor and set up the per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps player_url -> deciphering function, so each player's signature
    # algorithm is only extracted once per run.
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce that the watch page for *video_id* is being fetched."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the video-info page for *video_id* is being fetched."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has begun."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested *format* is not available for *video_id*."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will go over the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
429 def _extract_signature_function(self, video_id, player_url, slen):
430 id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
432 player_type = id_m.group('ext')
433 player_id = id_m.group('id')
435 # Read from filesystem cache
436 func_id = '%s_%s_%d' % (player_type, player_id, slen)
437 assert os.path.basename(func_id) == func_id
438 cache_dir = self._downloader.params.get('cachedir',
439 u'~/.youtube-dl/cache')
441 cache_enabled = cache_dir != u'NONE'
443 cache_fn = os.path.join(os.path.expanduser(cache_dir),
447 with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
448 cache_spec = json.load(cachef)
449 return lambda s: u''.join(s[i] for i in cache_spec)
451 pass # No cache available
453 if player_type == 'js':
454 code = self._download_webpage(
455 player_url, video_id,
456 note=u'Downloading %s player %s' % (player_type, player_id),
457 errnote=u'Download of %s failed' % player_url)
458 res = self._parse_sig_js(code)
459 elif player_type == 'swf':
460 urlh = self._request_webpage(
461 player_url, video_id,
462 note=u'Downloading %s player %s' % (player_type, player_id),
463 errnote=u'Download of %s failed' % player_url)
465 res = self._parse_sig_swf(code)
467 assert False, 'Invalid player type %r' % player_type
471 cache_res = res(map(compat_chr, range(slen)))
472 cache_spec = [ord(c) for c in cache_res]
474 os.makedirs(os.path.dirname(cache_fn))
475 except OSError as ose:
476 if ose.errno != errno.EEXIST:
478 write_json_file(cache_spec, cache_fn)
480 tb = traceback.format_exc()
481 self._downloader.report_warning(
482 u'Writing cache to %r failed: %s' % (cache_fn, tb))
486 def _print_sig_code(self, func, slen):
487 def gen_sig_code(idxs):
488 def _genslice(start, end, step):
489 starts = u'' if start == 0 else str(start)
490 ends = u':%d' % (end+step)
491 steps = u'' if step == 1 else (':%d' % step)
492 return u's[%s%s%s]' % (starts, ends, steps)
495 start = '(Never used)' # Quelch pyflakes warnings - start will be
496 # set as soon as step is set
497 for i, prev in zip(idxs[1:], idxs[:-1]):
501 yield _genslice(start, prev, step)
504 if i - prev in [-1, 1]:
509 yield u's[%d]' % prev
513 yield _genslice(start, i, step)
515 cache_res = func(map(compat_chr, range(slen)))
516 cache_spec = [ord(c) for c in cache_res]
517 expr_code = u' + '.join(gen_sig_code(cache_spec))
518 code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
519 self.to_screen(u'Extracted signature function:\n' + code)
521 def _parse_sig_js(self, jscode):
522 funcname = self._search_regex(
523 r'signature=([a-zA-Z]+)', jscode,
524 u'Initial JS player signature function name')
529 return string.lowercase.index(varname)
531 def interpret_statement(stmt, local_vars, allow_recursion=20):
532 if allow_recursion < 0:
533 raise ExtractorError(u'Recursion limit reached')
535 if stmt.startswith(u'var '):
536 stmt = stmt[len(u'var '):]
537 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
538 r'=(?P<expr>.*)$', stmt)
540 if ass_m.groupdict().get('index'):
542 lvar = local_vars[ass_m.group('out')]
543 idx = interpret_expression(ass_m.group('index'),
544 local_vars, allow_recursion)
545 assert isinstance(idx, int)
548 expr = ass_m.group('expr')
551 local_vars[ass_m.group('out')] = val
553 expr = ass_m.group('expr')
554 elif stmt.startswith(u'return '):
556 expr = stmt[len(u'return '):]
558 raise ExtractorError(
559 u'Cannot determine left side of statement in %r' % stmt)
561 v = interpret_expression(expr, local_vars, allow_recursion)
564 def interpret_expression(expr, local_vars, allow_recursion):
569 return local_vars[expr]
571 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
573 member = m.group('member')
574 val = local_vars[m.group('in')]
575 if member == 'split("")':
577 if member == 'join("")':
579 if member == 'length':
581 if member == 'reverse()':
583 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
585 idx = interpret_expression(
586 slice_m.group('idx'), local_vars, allow_recursion-1)
590 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
592 val = local_vars[m.group('in')]
593 idx = interpret_expression(m.group('idx'), local_vars,
597 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
599 a = interpret_expression(m.group('a'),
600 local_vars, allow_recursion)
601 b = interpret_expression(m.group('b'),
602 local_vars, allow_recursion)
606 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
608 fname = m.group('func')
609 if fname not in functions:
610 functions[fname] = extract_function(fname)
611 argvals = [int(v) if v.isdigit() else local_vars[v]
612 for v in m.group('args').split(',')]
613 return functions[fname](argvals)
614 raise ExtractorError(u'Unsupported JS expression %r' % expr)
616 def extract_function(funcname):
618 r'function ' + re.escape(funcname) +
619 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
621 argnames = func_m.group('args').split(',')
624 local_vars = dict(zip(argnames, args))
625 for stmt in func_m.group('code').split(';'):
626 res = interpret_statement(stmt, local_vars)
630 initial_function = extract_function(funcname)
631 return lambda s: initial_function([s])
633 def _parse_sig_swf(self, file_contents):
634 if file_contents[1:3] != b'WS':
635 raise ExtractorError(
636 u'Not an SWF file; header is %r' % file_contents[:3])
637 if file_contents[:1] == b'C':
638 content = zlib.decompress(file_contents[8:])
640 raise NotImplementedError(u'Unsupported compression format %r' %
643 def extract_tags(content):
645 while pos < len(content):
646 header16 = struct.unpack('<H', content[pos:pos+2])[0]
648 tag_code = header16 >> 6
649 tag_len = header16 & 0x3f
651 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
653 assert pos+tag_len <= len(content)
654 yield (tag_code, content[pos:pos+tag_len])
658 for tag_code, tag in extract_tags(content)
660 p = code_tag.index(b'\0', 4) + 1
661 code_reader = io.BytesIO(code_tag[p:])
663 # Parse ABC (AVM2 ByteCode)
664 def read_int(reader=None):
672 b = struct.unpack('<B', buf)[0]
673 res = res | ((b & 0x7f) << shift)
679 def u30(reader=None):
680 res = read_int(reader)
681 assert res & 0xf0000000 == 0
685 def s32(reader=None):
687 if v & 0x80000000 != 0:
688 v = - ((v ^ 0xffffffff) + 1)
691 def read_string(reader=None):
695 resb = reader.read(slen)
696 assert len(resb) == slen
697 return resb.decode('utf-8')
699 def read_bytes(count, reader=None):
702 resb = reader.read(count)
703 assert len(resb) == count
706 def read_byte(reader=None):
707 resb = read_bytes(1, reader=reader)
708 res = struct.unpack('<B', resb)[0]
711 # minor_version + major_version
716 for _c in range(1, int_count):
719 for _c in range(1, uint_count):
722 read_bytes((double_count-1) * 8)
724 constant_strings = [u'']
725 for _c in range(1, string_count):
727 constant_strings.append(s)
728 namespace_count = u30()
729 for _c in range(1, namespace_count):
733 for _c in range(1, ns_set_count):
735 for _c2 in range(count):
737 multiname_count = u30()
746 0x0e: 2, # MultinameA
747 0x1b: 1, # MultinameL
748 0x1c: 1, # MultinameLA
751 for _c in range(1, multiname_count):
753 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
755 u30() # namespace_idx
757 multinames.append(constant_strings[name_idx])
759 multinames.append('[MULTINAME kind: %d]' % kind)
760 for _c2 in range(MULTINAME_SIZES[kind]):
765 MethodInfo = collections.namedtuple(
767 ['NEED_ARGUMENTS', 'NEED_REST'])
769 for method_id in range(method_count):
772 for _ in range(param_count):
774 u30() # name index (always 0 for youtube)
776 if flags & 0x08 != 0:
779 for c in range(option_count):
782 if flags & 0x80 != 0:
783 # Param names present
784 for _ in range(param_count):
786 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
787 method_infos.append(mi)
790 metadata_count = u30()
791 for _c in range(metadata_count):
794 for _c2 in range(item_count):
798 def parse_traits_info():
799 trait_name_idx = u30()
800 kind_full = read_byte()
801 kind = kind_full & 0x0f
802 attrs = kind_full >> 4
804 if kind in [0x00, 0x06]: # Slot or Const
806 u30() # type_name_idx
810 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
813 methods[multinames[trait_name_idx]] = method_idx
814 elif kind == 0x04: # Class
817 elif kind == 0x05: # Function
820 methods[function_idx] = multinames[trait_name_idx]
822 raise ExtractorError(u'Unsupported trait kind %d' % kind)
824 if attrs & 0x4 != 0: # Metadata present
825 metadata_count = u30()
826 for _c3 in range(metadata_count):
827 u30() # metadata index
832 TARGET_CLASSNAME = u'SignatureDecipher'
833 searched_idx = multinames.index(TARGET_CLASSNAME)
834 searched_class_id = None
836 for class_id in range(class_count):
838 if name_idx == searched_idx:
839 # We found the class we're looking for!
840 searched_class_id = class_id
841 u30() # super_name idx
843 if flags & 0x08 != 0: # Protected namespace is present
844 u30() # protected_ns_idx
846 for _c2 in range(intrf_count):
850 for _c2 in range(trait_count):
853 if searched_class_id is None:
854 raise ExtractorError(u'Target class %r not found' %
859 for class_id in range(class_count):
862 for _c2 in range(trait_count):
863 trait_methods = parse_traits_info()
864 if class_id == searched_class_id:
865 method_names.update(trait_methods.items())
866 method_idxs.update(dict(
868 for name, idx in trait_methods.items()))
872 for _c in range(script_count):
875 for _c2 in range(trait_count):
879 method_body_count = u30()
880 Method = collections.namedtuple('Method', ['code', 'local_count'])
882 for _c in range(method_body_count):
886 u30() # init_scope_depth
887 u30() # max_scope_depth
889 code = read_bytes(code_length)
890 if method_idx in method_idxs:
891 m = Method(code, local_count)
892 methods[method_idxs[method_idx]] = m
893 exception_count = u30()
894 for _c2 in range(exception_count):
901 for _c2 in range(trait_count):
904 assert p + code_reader.tell() == len(code_tag)
905 assert len(methods) == len(method_idxs)
907 method_pyfunctions = {}
909 def extract_function(func_name):
910 if func_name in method_pyfunctions:
911 return method_pyfunctions[func_name]
912 if func_name not in methods:
913 raise ExtractorError(u'Cannot find function %r' % func_name)
914 m = methods[func_name]
917 registers = ['(this)'] + list(args) + [None] * m.local_count
919 coder = io.BytesIO(m.code)
921 opcode = struct.unpack('!B', coder.read(1))[0]
922 if opcode == 36: # pushbyte
923 v = struct.unpack('!B', coder.read(1))[0]
925 elif opcode == 44: # pushstring
927 stack.append(constant_strings[idx])
928 elif opcode == 48: # pushscope
929 # We don't implement the scope register, so we'll just
930 # ignore the popped value
932 elif opcode == 70: # callproperty
934 mname = multinames[index]
935 arg_count = u30(coder)
936 args = list(reversed(
937 [stack.pop() for _ in range(arg_count)]))
939 if mname == u'split':
940 assert len(args) == 1
941 assert isinstance(args[0], compat_str)
942 assert isinstance(obj, compat_str)
946 res = obj.split(args[0])
948 elif mname == u'slice':
949 assert len(args) == 1
950 assert isinstance(args[0], int)
951 assert isinstance(obj, list)
954 elif mname == u'join':
955 assert len(args) == 1
956 assert isinstance(args[0], compat_str)
957 assert isinstance(obj, list)
958 res = args[0].join(obj)
960 elif mname in method_pyfunctions:
961 stack.append(method_pyfunctions[mname](args))
963 raise NotImplementedError(
964 u'Unsupported property %r on %r'
966 elif opcode == 72: # returnvalue
969 elif opcode == 79: # callpropvoid
971 mname = multinames[index]
972 arg_count = u30(coder)
973 args = list(reversed(
974 [stack.pop() for _ in range(arg_count)]))
976 if mname == u'reverse':
977 assert isinstance(obj, list)
980 raise NotImplementedError(
981 u'Unsupported (void) property %r on %r'
983 elif opcode == 93: # findpropstrict
985 mname = multinames[index]
986 res = extract_function(mname)
988 elif opcode == 97: # setproperty
993 assert isinstance(obj, list)
994 assert isinstance(idx, int)
996 elif opcode == 98: # getlocal
998 stack.append(registers[index])
999 elif opcode == 99: # setlocal
1002 registers[index] = value
1003 elif opcode == 102: # getproperty
1005 pname = multinames[index]
1006 if pname == u'length':
1008 assert isinstance(obj, list)
1009 stack.append(len(obj))
1010 else: # Assume attribute access
1012 assert isinstance(idx, int)
1014 assert isinstance(obj, list)
1015 stack.append(obj[idx])
1016 elif opcode == 128: # coerce
1018 elif opcode == 133: # coerce_s
1019 assert isinstance(stack[-1], (type(None), compat_str))
1020 elif opcode == 164: # modulo
1021 value2 = stack.pop()
1022 value1 = stack.pop()
1023 res = value1 % value2
1025 elif opcode == 208: # getlocal_0
1026 stack.append(registers[0])
1027 elif opcode == 209: # getlocal_1
1028 stack.append(registers[1])
1029 elif opcode == 210: # getlocal_2
1030 stack.append(registers[2])
1031 elif opcode == 211: # getlocal_3
1032 stack.append(registers[3])
1033 elif opcode == 214: # setlocal_2
1034 registers[2] = stack.pop()
1035 elif opcode == 215: # setlocal_3
1036 registers[3] = stack.pop()
1038 raise NotImplementedError(
1039 u'Unsupported opcode %d' % opcode)
1041 method_pyfunctions[func_name] = resfunc
1044 initial_function = extract_function(u'decipher')
1045 return lambda s: initial_function([s])
1047 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
1048 """Turn the encrypted s field into a working signature"""
1050 if player_url is not None:
1052 if player_url not in self._player_cache:
1053 func = self._extract_signature_function(
1054 video_id, player_url, len(s)
1056 self._player_cache[player_url] = func
1057 func = self._player_cache[player_url]
1058 if self._downloader.params.get('youtube_print_sig_code'):
1059 self._print_sig_code(func, len(s))
1062 tb = traceback.format_exc()
1063 self._downloader.report_warning(
1064 u'Automatic signature extraction failed: ' + tb)
1066 self._downloader.report_warning(
1067 u'Warning: Falling back to static signature algorithm')
1068 return self._static_decrypt_signature(
1069 s, video_id, player_url, age_gate)
1071 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1073 # The videos with age protection use another player, so the
1074 # algorithms can be different.
1076 return s[2:63] + s[82] + s[64:82] + s[63]
1079 return s[86:29:-1] + s[88] + s[28:5:-1]
1081 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1083 return s[84:27:-1] + s[86] + s[26:5:-1]
1085 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1087 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1089 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1091 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1093 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1095 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1097 return s[81:36:-1] + s[0] + s[35:2:-1]
1099 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1101 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1103 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1105 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1107 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1110 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1112 def _get_available_subtitles(self, video_id):
1114 sub_list = self._download_webpage(
1115 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1116 video_id, note=False)
1117 except ExtractorError as err:
1118 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
1120 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1125 params = compat_urllib_parse.urlencode({
1128 'fmt': self._downloader.params.get('subtitlesformat'),
1130 url = u'http://www.youtube.com/api/timedtext?' + params
1131 sub_lang_list[lang] = url
1132 if not sub_lang_list:
1133 self._downloader.report_warning(u'video doesn\'t have subtitles')
1135 return sub_lang_list
1137 def _get_available_automatic_caption(self, video_id, webpage):
1138 """We need the webpage for getting the captions url, pass it as an
1139 argument to speed up the process."""
1140 sub_format = self._downloader.params.get('subtitlesformat')
1141 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1142 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1143 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1145 self._downloader.report_warning(err_msg)
1147 player_config = json.loads(mobj.group(1))
1149 args = player_config[u'args']
1150 caption_url = args[u'ttsurl']
1151 timestamp = args[u'timestamp']
1152 # We get the available subtitles
1153 list_params = compat_urllib_parse.urlencode({
1158 list_url = caption_url + '&' + list_params
1159 list_page = self._download_webpage(list_url, video_id)
1160 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
1161 original_lang_node = caption_list.find('track')
1162 if original_lang_node.attrib.get('kind') != 'asr' :
1163 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1165 original_lang = original_lang_node.attrib['lang_code']
1168 for lang_node in caption_list.findall('target'):
1169 sub_lang = lang_node.attrib['lang_code']
1170 params = compat_urllib_parse.urlencode({
1171 'lang': original_lang,
1177 sub_lang_list[sub_lang] = caption_url + '&' + params
1178 return sub_lang_list
1179 # An extractor error can be raise by the download process if there are
1180 # no automatic captions but there are subtitles
1181 except (KeyError, ExtractorError):
1182 self._downloader.report_warning(err_msg)
1185 def _print_formats(self, formats):
1186 print('Available formats:')
1188 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
1189 self._video_dimensions.get(x, '???'),
1190 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
1192 def _extract_id(self, url):
1193 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1195 raise ExtractorError(u'Invalid URL: %s' % url)
1196 video_id = mobj.group(2)
1199 def _get_video_url_list(self, url_map):
1201 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1202 with the requested formats.
1204 req_format = self._downloader.params.get('format', None)
1205 format_limit = self._downloader.params.get('format_limit', None)
1206 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
1207 if format_limit is not None and format_limit in available_formats:
1208 format_list = available_formats[available_formats.index(format_limit):]
1210 format_list = available_formats
1211 existing_formats = [x for x in format_list if x in url_map]
1212 if len(existing_formats) == 0:
1213 raise ExtractorError(u'no known formats available for video')
1214 if self._downloader.params.get('listformats', None):
1215 self._print_formats(existing_formats)
1217 if req_format is None or req_format == 'best':
1218 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1219 elif req_format == 'worst':
1220 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1221 elif req_format in ('-1', 'all'):
1222 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1224 # Specific formats. We pick the first in a slash-delimeted sequence.
1225 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1226 # available in the specified format. For example,
1227 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1228 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1229 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1230 req_formats = req_format.split('/')
1231 video_url_list = None
1232 for rf in req_formats:
1234 video_url_list = [(rf, url_map[rf])]
1236 if rf in self._video_formats_map:
1237 for srf in self._video_formats_map[rf]:
1239 video_url_list = [(srf, url_map[srf])]
1244 if video_url_list is None:
1245 raise ExtractorError(u'requested format not available')
1246 return video_url_list
1248 def _extract_from_m3u8(self, manifest_url, video_id):
1250 def _get_urls(_manifest):
1251 lines = _manifest.split('\n')
1252 urls = filter(lambda l: l and not l.startswith('#'),
1255 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
1256 formats_urls = _get_urls(manifest)
1257 for format_url in formats_urls:
1258 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
1259 url_map[itag] = format_url
1262 def _real_extract(self, url):
1263 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1264 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1266 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
# NOTE(review): interior of YoutubeIE._real_extract — the enclosing `def` sits
# above this excerpt and several original lines (guards, try:/else:/break
# lines) are elided here, so some branches appear without their openers.

# Resolve redirect-style URLs (e.g. the age-verification redirect) back to a
# plain /watch URL via the next_url query parameter.
mobj = re.search(self._NEXT_URL_RE, url)
url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
video_id = self._extract_id(url)

# Download the watch page; gl/hl pin region and language, and has_verified=1
# skips the content-warning interstitial.
self.report_video_webpage_download(video_id)
url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
request = compat_urllib_request.Request(url)
video_webpage_bytes = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
    raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))

# Decode leniently so stray non-UTF-8 bytes do not abort extraction.
video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

# Attempt to extract SWF player URL
mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
if mobj is not None:
    # The URL is JS-escaped in the page source; strip the backslashes.
    player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

# Download the video info page.
self.report_video_info_webpage_download(video_id)
if re.search(r'player-age-gate-content">', video_webpage) is not None:
    self.report_age_confirmation()

    # We simulate the access to the video from www.youtube.com/v/{video_id}
    # this can be viewed without login into Youtube
    data = compat_urllib_parse.urlencode({'video_id': video_id,
                                          'eurl': 'https://youtube.googleapis.com/v/' + video_id,
    video_info_url = 'https://www.youtube.com/get_video_info?' + data
    video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                errnote='unable to download video info webpage')
    video_info = compat_parse_qs(video_info_webpage)

# Non-age-gated path: retry get_video_info with several 'el' values until one
# response carries a 'token'.
for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
    video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                      % (video_id, el_type))
    video_info_webpage = self._download_webpage(video_info_url, video_id,
                                                errnote='unable to download video info webpage')
    video_info = compat_parse_qs(video_info_webpage)
    if 'token' in video_info:

if 'token' not in video_info:
    if 'reason' in video_info:
        # YouTube supplied an explicit, user-facing failure reason.
        raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
    raise ExtractorError(u'"token" parameter not in video info for unknown reason')

# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
    raise ExtractorError(u'"rental" videos not supported')

# Start extracting information
self.report_information_extraction(video_id)

# Uploader name (mandatory).
if 'author' not in video_info:
    raise ExtractorError(u'Unable to extract uploader name')
video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

# Uploader id (optional, best-effort scrape of the itemprop link).
video_uploader_id = None
mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
if mobj is not None:
    video_uploader_id = mobj.group(1)
    # NOTE(review): the warning below is the else-branch in the full source.
    self._downloader.report_warning(u'unable to extract uploader nickname')

# Title (mandatory).
if 'title' not in video_info:
    raise ExtractorError(u'Unable to extract video title')
video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

# Thumbnail.
# We try first to get a high quality image:
m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                    video_webpage, re.DOTALL)
if m_thumb is not None:
    video_thumbnail = m_thumb.group(1)
elif 'thumbnail_url' not in video_info:
    self._downloader.report_warning(u'unable to extract video thumbnail')
    video_thumbnail = ''
else: # don't panic if we can't find it
    video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

# Upload date (optional; normalized by unified_strdate).
mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
if mobj is not None:
    upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
    upload_date = unified_strdate(upload_date)

# Description: prefer the full eow-description element; the two assignments
# below belong to the elided fallback branches (<meta description>, then '').
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
    video_description = clean_html(video_description)
    fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
    video_description = unescapeHTML(fd_mobj.group(1))
    video_description = u''

# Subtitles.
video_subtitles = self.extract_subtitles(video_id, video_webpage)

if self._downloader.params.get('listsubtitles', False):
    self._list_available_subtitles(video_id, video_webpage)

# Duration (optional).
if 'length_seconds' not in video_info:
    self._downloader.report_warning(u'unable to extract video duration')
video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

# Decide which formats to download
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
raise ValueError('Could not find vevo ID')
info = json.loads(mobj.group(1))

# Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# these signatures are encrypted
m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
if 'url_encoded_fmt_stream_map' in video_info:
    # Merge adaptive formats into the existing stream map.
    video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
    video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
elif 'adaptive_fmts' in video_info:
    if 'url_encoded_fmt_stream_map' in video_info:
        video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
        video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']

# Work out the actual video URLs: rtmp 'conn', the url_encoded_fmt_stream_map,
# or an HLS manifest ('hlsvp'), in that order.
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
    self.report_rtmp_download()
    video_url_list = [(None, video_info['conn'][0])]
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
    if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
        raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
    for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
        url_data = compat_parse_qs(url_data_str)
        if 'itag' in url_data and 'url' in url_data:
            url = url_data['url'][0]
            if 'sig' in url_data:
                # Plain (unencrypted) signature: append directly.
                url += '&signature=' + url_data['sig'][0]
            elif 's' in url_data:
                encrypted_sig = url_data['s'][0]
                if self._downloader.params.get('verbose'):
                    # Verbose diagnostics: identify the player that produced
                    # the encrypted signature (flash vs html5 branches; the
                    # branch openers are elided here).
                    player_version = self._search_regex(
                        player_url if player_url else None,
                        'flash player', fatal=False)
                    player_desc = 'flash player %s' % player_version
                    player_version = self._search_regex(
                        r'html5player-(.+?)\.js', video_webpage,
                        'html5 player', fatal=False)
                    player_desc = u'html5 player %s' % player_version
                parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                               (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
                # Locate the JS player to drive signature decryption.
                jsplayer_url_json = self._search_regex(
                    r'"assets":.+?"js":\s*("[^"]+")',
                    video_webpage, u'JS player URL')
                player_url = json.loads(jsplayer_url_json)
                signature = self._decrypt_signature(
                    encrypted_sig, video_id, player_url, age_gate)
                url += '&signature=' + signature
            if 'ratebypass' not in url:
                url += '&ratebypass=yes'
            url_map[url_data['itag'][0]] = url
    video_url_list = self._get_video_url_list(url_map)
    if not video_url_list:
elif video_info.get('hlsvp'):
    manifest_url = video_info['hlsvp'][0]
    url_map = self._extract_from_m3u8(manifest_url, video_id)
    video_url_list = self._get_video_url_list(url_map)
    if not video_url_list:
    raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

# Build one result dict per selected format.
for format_param, video_real_url in video_url_list:
    # Extension
    video_extension = self._video_extensions.get(format_param, 'flv')

    # Human-readable format label: "itag - WxH (note)".
    video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
                                         self._video_dimensions.get(format_param, '???'),
                                         ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')

    # Result dict entries (the surrounding append call is elided here).
    'url': video_real_url,
    'uploader': video_uploader,
    'uploader_id': video_uploader_id,
    'upload_date': upload_date,
    'title': video_title,
    'ext': video_extension,
    'format': video_format,
    'thumbnail': video_thumbnail,
    'description': video_description,
    'player_url': player_url,
    'subtitles': video_subtitles,
    'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract all entries of a YouTube playlist via the GData API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:course|view_play_list|my_playlists|artist|playlist|watch)
                        \? (?:.*?&)*? (?:p|a|list)=
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
    # GData playlist feed; takes (playlist_id, max_results, start_index).
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose-mode pattern, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            # The GData API refuses start indices of 1000 and beyond.
            if start_index >= 1000:
                self._downloader.report_warning(u'Max number of results reached')
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            for entry in response['feed']['entry']:
                # Position index keeps playlist order across pages.
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']

        # Sort by position index, then drop the index.
        videos = [v[1] for v in sorted(videos)]
        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract all videos of a YouTube channel (HTML page + ajax paging)."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # First page is plain HTML; later pages come from the JSON ajax endpoint.
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        # Collect video ids from /watch?v=... hrefs, de-duplicated while
        # preserving first-seen order.
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                # The ajax endpoint wraps the HTML fragments in JSON.
                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # Stop once the "load more" widget offers no further page.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # Hand each id to the Youtube IE via url_result entries.
        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract a user's uploaded videos via the paginated GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # The GData API caps each response at 50 entries.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    def suitable(cls, url):
        # Don't return True if the url can be extracted with other youtube
        # extractors; this regex is too permissive and it would match them.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies): return False
        else: return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username from the URL (or the ytuser: keyword).
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The GData entry id ends with ".../<video id>".
                ids_in_page.append(entry['id']['$t'].split('/')[-1])
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the "ytsearch" keyword, backed by the GData API."""
    IE_DESC = u'YouTube.com searches'
    # The API serves 50 results per request; start-index drives pagination.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # NOTE(review): `if 'items' not in api_response` is the idiomatic form.
            if not 'items' in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more results than the caller wanted or than the
            # API reports to exist.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube show pages: emits one playlist per season."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        # The show name is everything after /show/ in the URL.
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is linked as its own playlist.
        season_links = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_links)))
        return [
            self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
            for season in season_links
        ]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    # Feeds are per-account, so logging in is mandatory.
    _LOGIN_REQUIRED = True

    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    def _FEED_TEMPLATE(self):
        # Build the feed URL template; one %s remains for the paging offset.
        action = 'action_load_system_feed'
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

        # NOTE(review): presumably the IE_NAME property body — its decorated
        # `def` line is not visible in this excerpt.
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):

    def _real_extract(self, url):
        # The step argument is available only in 2.7 or higher
        for i in itertools.count(0):
            paging = i*self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % i)
            info = json.loads(info)
            feed_html = info['feed_html']
            m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            # orderedSet de-duplicates ids while keeping first-seen order.
            ids = orderedSet(m.group(1) for m in m_ids)
            feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
            # A null 'paging' value marks the last page.
            if info['paging'] is None:
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in account's subscriptions feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in account's recommendations feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the logged-in account's watch-later list."""
    # The watch-later list is account-private, so the personal-feed
    # ajax action must be used instead of the system one.
    _PERSONAL_FEED = True
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the user's favourites page to its backing playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds its playlist id in a "list=" parameter;
        # delegate the actual extraction to the playlist extractor.
        page = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', page, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')