15 from .common import InfoExtractor, SearchInfoExtractor
16 from .subtitles import SubtitlesInfoExtractor
22 compat_urllib_request,
# NOTE(review): this is a line-sampled extraction; each line keeps its original
# line number and several interior lines (try:/else:/return) are missing.
# Code is kept byte-identical below; comments only.
33 class YoutubeBaseInfoExtractor(InfoExtractor):
34 """Provide base functions for Youtube extractors"""
35 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
36 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
37 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
38 _NETRC_MACHINE = 'youtube'
39 # If True it will raise an error if no login info is provided
40 _LOGIN_REQUIRED = False
# Announce the language-setting step on the UI.
42 def report_lang(self):
43 """Report attempt to set language."""
44 self.to_screen(u'Setting language')
# Force the site language to English so scraped pages have a predictable
# layout; warns (instead of failing) on network errors. The try:/return
# lines are missing from this extraction — TODO confirm against upstream.
46 def _set_language(self):
47 request = compat_urllib_request.Request(self._LANG_URL)
50 compat_urllib_request.urlopen(request).read()
51 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
52 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
# _login(): authenticate against accounts.google.com with the configured
# credentials. (The def line and several statements are missing from this
# extraction.)
57 (username, password) = self._get_login_info()
58 # No authentication to be performed
60 if self._LOGIN_REQUIRED:
61 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
64 request = compat_urllib_request.Request(self._LOGIN_URL)
66 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
67 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
68 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
# Scrape the hidden GALX and dsh anti-forgery tokens out of the login page.
73 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
76 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
# Hidden-form fields posted back to the Google sign-in endpoint
# (only part of the dict literal survives in this extraction).
82 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
86 u'PersistentCookie': u'yes',
88 u'bgresponse': u'js_disabled',
89 u'checkConnection': u'',
90 u'checkedDomains': u'youtube',
96 u'signIn': u'Sign in',
98 u'service': u'youtube',
102 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
104 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
105 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
106 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
109 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the sign-in form is still present in the response, the login failed.
110 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
111 self._downloader.report_warning(u'unable to log in: bad username or password')
113 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
114 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm the "over 18" interstitial by POSTing the age-confirmation form;
# raises ExtractorError (rather than warning) on failure.
118 def _confirm_age(self):
121 'action_confirm': 'Confirm',
123 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
125 self.report_age_confirmation()
126 compat_urllib_request.urlopen(request).read().decode('utf-8')
127 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
128 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
# One-time initialisation before extraction: set language, log in, confirm age.
131 def _real_initialize(self):
132 if self._downloader is None:
134 if not self._set_language():
136 if not self._login():
# Main YouTube extractor; mixes in subtitle support from SubtitlesInfoExtractor.
141 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
142 IE_DESC = u'YouTube.com'
# Body of the verbose _VALID_URL regex (the `_VALID_URL = r"""...` opener is
# missing from this extraction). Group 1 is the host/path prefix; the second
# capture group is the 11-character video ID.
145 (?:https?://)? # http(s):// (optional)
146 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
147 tube\.majestyc\.net/|
148 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
149 (?:.*?\#/)? # handle anchor (#/) redirect urls
150 (?: # the various things that can precede the ID:
151 (?:(?:v|embed|e)/) # v/ or embed/ or e/
152 |(?: # or the v= param in all its forms
153 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
154 (?:\?|\#!?) # the params delimiter ? or # or #!
155 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
159 |youtu\.be/ # just youtu.be/xxxx
161 )? # all until now is optional -> you can pass the naked ID
162 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
163 (?(1).+)? # if we found the ID, everything can follow
165 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
166 # Listed in order of quality
167 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
168 # Apple HTTP Live Streaming
169 '96', '95', '94', '93', '92', '132', '151',
171 '85', '84', '102', '83', '101', '82', '100',
173 '138', '137', '248', '136', '247', '135', '246',
174 '245', '244', '134', '243', '133', '242', '160',
176 '141', '172', '140', '171', '139',
# Same itags, but with free (WebM) formats ranked ahead of proprietary ones.
178 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
179 # Apple HTTP Live Streaming
180 '96', '95', '94', '93', '92', '132', '151',
182 '85', '102', '84', '101', '83', '100', '82',
184 '138', '248', '137', '247', '136', '246', '245',
185 '244', '135', '243', '134', '242', '133', '160',
187 '172', '141', '171', '140', '139',
# Container -> itags, best-first; used to resolve 'mp4'/'flv' style -f requests.
189 _video_formats_map = {
190 'flv': ['35', '34', '6', '5'],
191 '3gp': ['36', '17', '13'],
192 'mp4': ['38', '37', '22', '18'],
193 'webm': ['46', '45', '44', '43'],
# itag -> file extension / display dimensions tables (entries elided here).
195 _video_extensions = {
217 # Apple HTTP Live Streaming
249 _video_dimensions = {
# _TESTS entries (dict delimiters lost in extraction).
331 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
332 u"file": u"BaW_jenozKc.mp4",
334 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
335 u"uploader": u"Philipp Hagemeister",
336 u"uploader_id": u"phihag",
337 u"upload_date": u"20121002",
338 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
342 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
343 u"file": u"1ltcDfZMA3U.flv",
344 u"note": u"Test VEVO video (#897)",
346 u"upload_date": u"20070518",
347 u"title": u"Maps - It Will Find You",
348 u"description": u"Music video by Maps performing It Will Find You.",
349 u"uploader": u"MuteUSA",
350 u"uploader_id": u"MuteUSA"
354 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
355 u"file": u"UxxajLWwzqY.mp4",
356 u"note": u"Test generic use_cipher_signature video (#897)",
358 u"upload_date": u"20120506",
359 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
360 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
361 u"uploader": u"Icona Pop",
362 u"uploader_id": u"IconaPop"
366 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
367 u"file": u"07FYdnEawAQ.mp4",
368 u"note": u"Test VEVO video with age protection (#956)",
370 u"upload_date": u"20130703",
371 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
372 u"description": u"md5:64249768eec3bc4276236606ea996373",
373 u"uploader": u"justintimberlakeVEVO",
374 u"uploader_id": u"justintimberlakeVEVO"
378 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
379 u'file': u'TGi3HqYrWHE.mp4',
380 u'note': u'm3u8 video',
382 u'title': u'Triathlon - Men - London 2012 Olympic Games',
383 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
384 u'uploader': u'olympic',
385 u'upload_date': u'20120807',
386 u'uploader_id': u'olympic',
389 u'skip_download': True,
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist URLs are handled by YoutubePlaylistIE, so refuse them here.
    if YoutubePlaylistIE.suitable(url):
        return False
    return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def __init__(self, *args, **kwargs):
    """Initialise the extractor plus its per-player signature cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Maps player URL -> deciphering callable so each player binary is
    # downloaded and parsed at most once per run.
    self._player_cache = {}
def report_video_webpage_download(self, video_id):
    """Announce that the watch page for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce that the get_video_info page is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce that metadata extraction for *video_id* has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested itag is not offered for this video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will go over the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
def _extract_signature_function(self, video_id, player_url):
    """Build a signature-deciphering callable for the given player.

    Downloads the player (JavaScript or SWF, decided by the file
    extension in *player_url*) and parses the deciphering routine out
    of it. Returns a callable mapping an encrypted signature string to
    the decrypted one.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # TODO read from filesystem cache

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    # BUGFIX: the condition read `player_tpye`, an undefined name, so every
    # SWF player raised NameError instead of being parsed.
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        assert False, 'Invalid player type %r' % player_type

    return res
# Mini JavaScript interpreter: locates the player's signature function and
# interprets just enough JS (assignments, member access, slices, modulo,
# nested calls) to replay it in Python. Many interior lines are missing from
# this extraction; code kept byte-identical.
453 def _parse_sig_js(self, jscode):
454 funcname = self._search_regex(
455 r'signature=([a-zA-Z]+)', jscode,
456 u'Initial JS player signature function name')
461 return string.lowercase.index(varname)
# Execute one JS statement (var decl / assignment / return) against local_vars.
463 def interpret_statement(stmt, local_vars, allow_recursion=20):
464 if allow_recursion < 0:
# NOTE(review): 'ExctractorError' is a typo for ExtractorError — as written
# this line would raise NameError if the recursion limit were ever hit.
465 raise ExctractorError(u'Recursion limit reached')
467 if stmt.startswith(u'var '):
468 stmt = stmt[len(u'var '):]
469 ass_m = re.match(r'^(?P<out>[a-z]+)(?:\[(?P<index>[^\]]+)\])?' +
470 r'=(?P<expr>.*)$', stmt)
472 if ass_m.groupdict().get('index'):
474 lvar = local_vars[ass_m.group('out')]
475 idx = interpret_expression(ass_m.group('index'),
476 local_vars, allow_recursion)
477 assert isinstance(idx, int)
480 expr = ass_m.group('expr')
483 local_vars[ass_m.group('out')] = val
485 expr = ass_m.group('expr')
486 elif stmt.startswith(u'return '):
488 expr = stmt[len(u'return '):]
490 raise ExtractorError(
491 u'Cannot determine left side of statement in %r' % stmt)
493 v = interpret_expression(expr, local_vars, allow_recursion)
# Evaluate a JS expression: variable lookup, member access (split/join/
# length/reverse/slice), indexing, modulo, or a call to another function.
496 def interpret_expression(expr, local_vars, allow_recursion):
501 return local_vars[expr]
503 m = re.match(r'^(?P<in>[a-z]+)\.(?P<member>.*)$', expr)
505 member = m.group('member')
506 val = local_vars[m.group('in')]
507 if member == 'split("")':
509 if member == 'join("")':
511 if member == 'length':
513 if member == 'reverse()':
515 slice_m = re.match(r'slice\((?P<idx>.*)\)', member)
517 idx = interpret_expression(
518 slice_m.group('idx'), local_vars, allow_recursion-1)
522 r'^(?P<in>[a-z]+)\[(?P<idx>.+)\]$', expr)
524 val = local_vars[m.group('in')]
525 idx = interpret_expression(m.group('idx'), local_vars,
529 m = re.match(r'^(?P<a>.+?)(?P<op>[%])(?P<b>.+?)$', expr)
531 a = interpret_expression(m.group('a'),
532 local_vars, allow_recursion)
533 b = interpret_expression(m.group('b'),
534 local_vars, allow_recursion)
538 r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
540 fname = m.group('func')
541 if fname not in functions:
542 functions[fname] = extract_function(fname)
543 argvals = [int(v) if v.isdigit() else local_vars[v]
544 for v in m.group('args').split(',')]
545 return functions[fname](argvals)
546 raise ExtractorError(u'Unsupported JS expression %r' % expr)
# Compile a named JS function in the player source into a Python callable.
548 def extract_function(funcname):
550 r'function ' + re.escape(funcname) +
551 r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
553 argnames = func_m.group('args').split(',')
556 local_vars = dict(zip(argnames, args))
557 for stmt in func_m.group('code').split(';'):
558 res = interpret_statement(stmt, local_vars)
562 initial_function = extract_function(funcname)
563 return lambda s: initial_function([s])
# Parses a compiled SWF player, finds the ABC (AVM2 bytecode) block, locates
# the 'SignatureDecipher' class and interprets its 'decipher' method with a
# minimal stack-machine. Heavily sampled extraction: many interior lines are
# missing; code kept byte-identical.
565 def _parse_sig_swf(self, file_contents):
# SWF magic is 'FWS'/'CWS'; byte 0 == b'C' means zlib-compressed body.
566 if file_contents[1:3] != b'WS':
567 raise ExtractorError(
568 u'Not an SWF file; header is %r' % file_contents[:3])
569 if file_contents[:1] == b'C':
570 content = zlib.decompress(file_contents[8:])
572 raise NotImplementedError(u'Unsupported compression format %r' %
# Generator over SWF tags: 16-bit header packs code (high 10 bits) and
# length (low 6 bits); length 0x3f means a 32-bit extended length follows.
575 def extract_tags(content):
577 while pos < len(content):
578 header16 = struct.unpack('<H', content[pos:pos+2])[0]
580 tag_code = header16 >> 6
581 tag_len = header16 & 0x3f
583 tag_len = struct.unpack('<I', content[pos:pos+4])[0]
585 assert pos+tag_len <= len(content)
586 yield (tag_code, content[pos:pos+tag_len])
590 for tag_code, tag in extract_tags(content)
# Skip the DoABC tag's flags + NUL-terminated name; the ABC data follows.
592 p = code_tag.index(b'\0', 4) + 1
593 code_reader = io.BytesIO(code_tag[p:])
595 # Parse ABC (AVM2 ByteCode)
# Variable-length integer (LEB128-style, 7 bits per byte).
596 def read_int(reader=None):
604 b = struct.unpack('<B', buf)[0]
605 res = res | ((b & 0x7f) << shift)
611 def u30(reader=None):
612 res = read_int(reader)
613 assert res & 0xf0000000 == 0
617 def s32(reader=None):
619 if v & 0x80000000 != 0:
620 v = - ((v ^ 0xffffffff) + 1)
623 def string(reader=None):
627 resb = reader.read(slen)
628 assert len(resb) == slen
629 return resb.decode('utf-8')
631 def read_bytes(count, reader=None):
634 resb = reader.read(count)
635 assert len(resb) == count
638 def read_byte(reader=None):
639 resb = read_bytes(1, reader=reader)
640 res = struct.unpack('<B', resb)[0]
# --- ABC constant pools (ints, uints, doubles, strings, namespaces) ---
643 # minor_version + major_version
644 _ = read_bytes(2 + 2)
648 for _c in range(1, int_count):
651 for _c in range(1, uint_count):
654 _ = read_bytes((double_count-1) * 8)
656 constant_strings = [u'']
657 for _c in range(1, string_count):
659 constant_strings.append(s)
660 namespace_count = u30()
661 for _c in range(1, namespace_count):
662 _ = read_bytes(1) # kind
665 for _c in range(1, ns_set_count):
667 for _c2 in range(count):
669 multiname_count = u30()
678 0x0e: 2, # MultinameA
679 0x1b: 1, # MultinameL
680 0x1c: 1, # MultinameLA
683 for _c in range(1, multiname_count):
685 assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
687 namespace_idx = u30()
689 multinames.append(constant_strings[name_idx])
691 multinames.append('[MULTINAME kind: %d]' % kind)
692 for _c2 in range(MULTINAME_SIZES[kind]):
# --- method_info entries ---
697 MethodInfo = collections.namedtuple(
699 ['NEED_ARGUMENTS', 'NEED_REST'])
701 for method_id in range(method_count):
703 _ = u30() # return type
704 for _ in range(param_count):
705 _ = u30() # param type
706 _ = u30() # name index (always 0 for youtube)
708 if flags & 0x08 != 0:
711 for c in range(option_count):
713 _ = read_bytes(1) # kind
714 if flags & 0x80 != 0:
715 # Param names present
716 for _ in range(param_count):
717 _ = u30() # param name
718 mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
719 method_infos.append(mi)
# --- metadata (skipped) ---
722 metadata_count = u30()
723 for _c in range(metadata_count):
726 for _c2 in range(item_count):
# Parse one trait record; records method-name -> method-index mappings.
730 def parse_traits_info():
731 trait_name_idx = u30()
732 kind_full = read_byte()
733 kind = kind_full & 0x0f
734 attrs = kind_full >> 4
736 if kind in [0x00, 0x06]: # Slot or Const
738 type_name_idx = u30()
741 _ = read_byte() # vkind
742 elif kind in [0x01, 0x02, 0x03]: # Method / Getter / Setter
745 methods[multinames[trait_name_idx]] = method_idx
746 elif kind == 0x04: # Class
749 elif kind == 0x05: # Function
752 methods[function_idx] = multinames[trait_name_idx]
754 raise ExtractorError(u'Unsupported trait kind %d' % kind)
756 if attrs & 0x4 != 0: # Metadata present
757 metadata_count = u30()
758 for _c3 in range(metadata_count):
# --- locate the SignatureDecipher class among the instance definitions ---
764 TARGET_CLASSNAME = u'SignatureDecipher'
765 searched_idx = multinames.index(TARGET_CLASSNAME)
766 searched_class_id = None
768 for class_id in range(class_count):
770 if name_idx == searched_idx:
771 # We found the class we're looking for!
772 searched_class_id = class_id
773 _ = u30() # super_name idx
775 if flags & 0x08 != 0: # Protected namespace is present
776 protected_ns_idx = u30()
778 for _c2 in range(intrf_count):
782 for _c2 in range(trait_count):
783 _ = parse_traits_info()
785 if searched_class_id is None:
786 raise ExtractorError(u'Target class %r not found' %
# Collect the target class's method traits (static halves of the classes).
791 for class_id in range(class_count):
794 for _c2 in range(trait_count):
795 trait_methods = parse_traits_info()
796 if class_id == searched_class_id:
797 method_names.update(trait_methods.items())
798 method_idxs.update(dict(
800 for name, idx in trait_methods.items()))
# --- scripts (skipped) ---
804 for _c in range(script_count):
807 for _c2 in range(trait_count):
808 _ = parse_traits_info()
# --- method bodies: keep the bytecode of the methods we care about ---
811 method_body_count = u30()
812 Method = collections.namedtuple('Method', ['code', 'local_count'])
814 for _c in range(method_body_count):
818 init_scope_depth = u30()
819 max_scope_depth = u30()
821 code = read_bytes(code_length)
822 if method_idx in method_idxs:
823 m = Method(code, local_count)
824 methods[method_idxs[method_idx]] = m
825 exception_count = u30()
826 for _c2 in range(exception_count):
833 for _c2 in range(trait_count):
834 _ = parse_traits_info()
836 assert p + code_reader.tell() == len(code_tag)
837 assert len(methods) == len(method_idxs)
839 method_pyfunctions = {}
# Compile one AVM2 method into a Python callable by interpreting its
# bytecode with an operand stack and local registers.
841 def extract_function(func_name):
842 if func_name in method_pyfunctions:
843 return method_pyfunctions[func_name]
844 if func_name not in methods:
845 raise ExtractorError(u'Cannot find function %r' % func_name)
846 m = methods[func_name]
849 registers = ['(this)'] + list(args) + [None] * m.local_count
851 coder = io.BytesIO(m.code)
853 opcode = struct.unpack('!B', coder.read(1))[0]
854 if opcode == 36: # pushbyte
855 v = struct.unpack('!B', coder.read(1))[0]
857 elif opcode == 44: # pushstring
859 stack.append(constant_strings[idx])
860 elif opcode == 48: # pushscope
861 # We don't implement the scope register, so we'll just
862 # ignore the popped value
864 elif opcode == 70: # callproperty
866 mname = multinames[index]
867 arg_count = u30(coder)
868 args = list(reversed(
869 [stack.pop() for _ in range(arg_count)]))
# Only the handful of String/Array methods the decipherer uses are emulated.
871 if mname == u'split':
872 assert len(args) == 1
873 assert isinstance(args[0], compat_str)
874 assert isinstance(obj, compat_str)
878 res = obj.split(args[0])
880 elif mname == u'slice':
881 assert len(args) == 1
882 assert isinstance(args[0], int)
883 assert isinstance(obj, list)
886 elif mname == u'join':
887 assert len(args) == 1
888 assert isinstance(args[0], compat_str)
889 assert isinstance(obj, list)
890 res = args[0].join(obj)
892 elif mname in method_pyfunctions:
893 stack.append(method_pyfunctions[mname](args))
895 raise NotImplementedError(
896 u'Unsupported property %r on %r'
898 elif opcode == 72: # returnvalue
901 elif opcode == 79: # callpropvoid
903 mname = multinames[index]
904 arg_count = u30(coder)
905 args = list(reversed(
906 [stack.pop() for _ in range(arg_count)]))
908 if mname == u'reverse':
909 assert isinstance(obj, list)
912 raise NotImplementedError(
913 u'Unsupported (void) property %r on %r'
915 elif opcode == 93: # findpropstrict
917 mname = multinames[index]
918 res = extract_function(mname)
920 elif opcode == 97: # setproperty
925 assert isinstance(obj, list)
926 assert isinstance(idx, int)
928 elif opcode == 98: # getlocal
930 stack.append(registers[index])
931 elif opcode == 99: # setlocal
934 registers[index] = value
935 elif opcode == 102: # getproperty
937 pname = multinames[index]
938 if pname == u'length':
940 assert isinstance(obj, list)
941 stack.append(len(obj))
942 else: # Assume attribute access
944 assert isinstance(idx, int)
946 assert isinstance(obj, list)
947 stack.append(obj[idx])
948 elif opcode == 128: # coerce
950 elif opcode == 133: # coerce_s
951 assert isinstance(stack[-1], (type(None), compat_str))
952 elif opcode == 164: # modulo
955 res = value1 % value2
957 elif opcode == 208: # getlocal_0
958 stack.append(registers[0])
959 elif opcode == 209: # getlocal_1
960 stack.append(registers[1])
961 elif opcode == 210: # getlocal_2
962 stack.append(registers[2])
963 elif opcode == 211: # getlocal_3
964 stack.append(registers[3])
965 elif opcode == 214: # setlocal_2
966 registers[2] = stack.pop()
967 elif opcode == 215: # setlocal_3
968 registers[3] = stack.pop()
970 raise NotImplementedError(
971 u'Unsupported opcode %d' % opcode)
973 method_pyfunctions[func_name] = resfunc
# Entry point of the SWF player's cipher is always named 'decipher'.
976 initial_function = extract_function(u'decipher')
977 return lambda s: initial_function([s])
# Tries the automatic (player-parsing) deciphering first, caching the parsed
# function per player URL; on any failure it warns and falls back to the
# hard-coded static algorithms. Some interior lines (try:, the no-player
# branch) are missing from this extraction; code kept byte-identical.
979 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
980 """Turn the encrypted s field into a working signature"""
982 if player_url is not None:
984 if player_url not in self._player_cache:
985 func = self._extract_signature_function(
988 self._player_cache[player_url] = func
989 return self._player_cache[player_url](s)
990 except Exception as e:
991 tb = traceback.format_exc()
992 self._downloader.report_warning(
993 u'Automatic signature extraction failed: ' + tb)
995 self._downloader.report_warning(
996 u'Warning: Falling back to static signature algorithm')
997 return self._static_decrypt_signature(
998 s, video_id, player_url, age_gate)
# Hard-coded signature permutations, one per signature length. NOTE(review):
# every `if len(s) == NN:` / `if age_gate:` guard line is missing from this
# extraction — only the return expressions survive, so the length each
# branch belongs to cannot be read off here; confirm against upstream.
1000 def _static_decrypt_signature(self, s, video_id, player_url, age_gate):
1002 # The videos with age protection use another player, so the
1003 # algorithms can be different.
1005 return s[2:63] + s[82] + s[64:82] + s[63]
1008 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
1010 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
1012 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
1014 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
1016 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
1018 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
1020 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
1022 return s[81:36:-1] + s[0] + s[35:2:-1]
1024 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
1026 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
1028 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1030 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
1032 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
1035 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
1037 def _decrypt_signature_age_gate(self, s):
1038 # The videos with age protection use another player, so the algorithms
1041 return s[2:63] + s[82] + s[64:82] + s[63]
1043 # Fallback to the other algortihms
1044 return self._decrypt_signature(s)
# Queries Google's timedtext listing endpoint and returns a dict mapping
# language code -> subtitle download URL; warns and (presumably) returns an
# empty mapping on failure. The try:/return-on-error lines and the loop over
# lang_list are missing from this extraction; code kept byte-identical.
1046 def _get_available_subtitles(self, video_id):
1048 sub_list = self._download_webpage(
1049 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
1050 video_id, note=False)
1051 except ExtractorError as err:
1052 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
# Each <track> element carries a display name and a lang_code attribute.
1054 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
1059 params = compat_urllib_parse.urlencode({
1062 'fmt': self._downloader.params.get('subtitlesformat'),
1064 url = u'http://www.youtube.com/api/timedtext?' + params
1065 sub_lang_list[lang] = url
1066 if not sub_lang_list:
1067 self._downloader.report_warning(u'video doesn\'t have subtitles')
1069 return sub_lang_list
# Discovers auto-generated (ASR) captions via the ttsurl found in the page's
# ytplayer.config and builds translation URLs for every target language.
# Several lines (try:, return {} on the error paths, parts of the urlencode
# dicts) are missing from this extraction; code kept byte-identical.
1071 def _get_available_automatic_caption(self, video_id, webpage):
1072 """We need the webpage for getting the captions url, pass it as an
1073 argument to speed up the process."""
1074 sub_format = self._downloader.params.get('subtitlesformat')
1075 self.to_screen(u'%s: Looking for automatic captions' % video_id)
1076 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
1077 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
1079 self._downloader.report_warning(err_msg)
1081 player_config = json.loads(mobj.group(1))
1083 args = player_config[u'args']
1084 caption_url = args[u'ttsurl']
1085 timestamp = args[u'timestamp']
1086 # We get the available subtitles
1087 list_params = compat_urllib_parse.urlencode({
1092 list_url = caption_url + '&' + list_params
1093 list_page = self._download_webpage(list_url, video_id)
1094 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
# Only a <track kind="asr"> counts as automatic captions.
1095 original_lang_node = caption_list.find('track')
1096 if original_lang_node.attrib.get('kind') != 'asr' :
1097 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
1099 original_lang = original_lang_node.attrib['lang_code']
# One translated-caption URL per <target> language.
1102 for lang_node in caption_list.findall('target'):
1103 sub_lang = lang_node.attrib['lang_code']
1104 params = compat_urllib_parse.urlencode({
1105 'lang': original_lang,
1111 sub_lang_list[sub_lang] = caption_url + '&' + params
1112 return sub_lang_list
1113 # An extractor error can be raise by the download process if there are
1114 # no automatic captions but there are subtitles
1115 except (KeyError, ExtractorError):
1116 self._downloader.report_warning(err_msg)
def _print_formats(self, formats):
    """Print each available itag with its extension, dimensions and any
    special note (the extraction had lost the `for` line restored here)."""
    print('Available formats:')
    for x in formats:
        print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
                                    self._video_dimensions.get(x, '???'),
                                    ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
def _extract_id(self, url):
    """Return the 11-character video ID extracted from *url*.

    Raises ExtractorError when the URL does not match _VALID_URL.
    (The `if mobj is None:` guard and the `return` were lost in the
    extraction and are restored here.)
    """
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    if mobj is None:
        raise ExtractorError(u'Invalid URL: %s' % url)
    # Group 2 of _VALID_URL is the ([0-9A-Za-z_-]{11}) video-ID capture.
    video_id = mobj.group(2)
    return video_id
# Resolves the user's -f/--format request against the URL map scraped from
# the page. Some control-flow lines (else:, break, the `if rf in url_map:`
# tests) are missing from this extraction; code kept byte-identical.
1133 def _get_video_url_list(self, url_map):
1135 Transform a dictionary in the format {itag:url} to a list of (itag, url)
1136 with the requested formats.
1138 req_format = self._downloader.params.get('format', None)
1139 format_limit = self._downloader.params.get('format_limit', None)
1140 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
# --format-limit caps quality at the given itag (lists are best-first).
1141 if format_limit is not None and format_limit in available_formats:
1142 format_list = available_formats[available_formats.index(format_limit):]
1144 format_list = available_formats
1145 existing_formats = [x for x in format_list if x in url_map]
1146 if len(existing_formats) == 0:
1147 raise ExtractorError(u'no known formats available for video')
1148 if self._downloader.params.get('listformats', None):
1149 self._print_formats(existing_formats)
1151 if req_format is None or req_format == 'best':
1152 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
1153 elif req_format == 'worst':
1154 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
1155 elif req_format in ('-1', 'all'):
1156 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
1158 # Specific formats. We pick the first in a slash-delimeted sequence.
1159 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
1160 # available in the specified format. For example,
1161 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
1162 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
1163 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
1164 req_formats = req_format.split('/')
1165 video_url_list = None
1166 for rf in req_formats:
1168 video_url_list = [(rf, url_map[rf])]
# Container names ('mp4', 'flv', ...) expand to their itag list, best first.
1170 if rf in self._video_formats_map:
1171 for srf in self._video_formats_map[rf]:
1173 video_url_list = [(srf, url_map[srf])]
1178 if video_url_list is None:
1179 raise ExtractorError(u'requested format not available')
1180 return video_url_list
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag -> stream URL.

    (The `url_map = {}` initialiser and both `return` statements were lost
    in the extraction and are restored here.)
    """
    url_map = {}

    def _get_urls(_manifest):
        # Non-empty lines that are not '#'-comments are the stream URLs.
        lines = _manifest.split('\n')
        urls = filter(lambda l: l and not l.startswith('#'),
                      lines)
        return urls

    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    formats_urls = _get_urls(manifest)
    for format_url in formats_urls:
        # Each variant URL embeds its itag as a path component.
        itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
        url_map[itag] = format_url
    return url_map
# Main extraction routine: downloads the watch page, the get_video_info
# responses, and assembles metadata + format URLs. This method is truncated
# at the end of the visible chunk and is missing many interior lines; code
# kept byte-identical, comments only.
1196 def _real_extract(self, url):
# A bare '?feature=...' URL usually means the shell ate everything after '&'.
1197 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
1198 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
1200 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
1201 mobj = re.search(self._NEXT_URL_RE, url)
1203 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
1204 video_id = self._extract_id(url)
# --- watch-page download ---
1207 self.report_video_webpage_download(video_id)
1208 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
1209 request = compat_urllib_request.Request(url)
1211 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
1212 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1213 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
1215 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
1217 # Attempt to extract SWF player URL
1218 mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
1219 if mobj is not None:
1220 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# --- get_video_info (age-gated videos take the embedded-player route) ---
1225 self.report_video_info_webpage_download(video_id)
1226 if re.search(r'player-age-gate-content">', video_webpage) is not None:
1227 self.report_age_confirmation()
1229 # We simulate the access to the video from www.youtube.com/v/{video_id}
1230 # this can be viewed without login into Youtube
1231 data = compat_urllib_parse.urlencode({'video_id': video_id,
1235 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
1239 video_info_url = 'https://www.youtube.com/get_video_info?' + data
1240 video_info_webpage = self._download_webpage(video_info_url, video_id,
1242 errnote='unable to download video info webpage')
1243 video_info = compat_parse_qs(video_info_webpage)
# Non-age-gated: try the el= variants until one yields a token.
1246 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
1247 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
1248 % (video_id, el_type))
1249 video_info_webpage = self._download_webpage(video_info_url, video_id,
1251 errnote='unable to download video info webpage')
1252 video_info = compat_parse_qs(video_info_webpage)
1253 if 'token' in video_info:
1255 if 'token' not in video_info:
1256 if 'reason' in video_info:
1257 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
1259 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
1261 # Check for "rental" videos
1262 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
1263 raise ExtractorError(u'"rental" videos not supported')
1265 # Start extracting information
1266 self.report_information_extraction(video_id)
# --- uploader ---
1269 if 'author' not in video_info:
1270 raise ExtractorError(u'Unable to extract uploader name')
1271 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
1274 video_uploader_id = None
1275 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
1276 if mobj is not None:
1277 video_uploader_id = mobj.group(1)
1279 self._downloader.report_warning(u'unable to extract uploader nickname')
# --- title ---
1282 if 'title' not in video_info:
1283 raise ExtractorError(u'Unable to extract video title')
1284 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# --- thumbnail ---
1287 # We try first to get a high quality image:
1288 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
1289 video_webpage, re.DOTALL)
1290 if m_thumb is not None:
1291 video_thumbnail = m_thumb.group(1)
1292 elif 'thumbnail_url' not in video_info:
1293 self._downloader.report_warning(u'unable to extract video thumbnail')
1294 video_thumbnail = ''
1295 else: # don't panic if we can't find it
1296 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# --- upload date / description ---
1300 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
1301 if mobj is not None:
1302 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
1303 upload_date = unified_strdate(upload_date)
1306 video_description = get_element_by_id("eow-description", video_webpage)
1307 if video_description:
1308 video_description = clean_html(video_description)
1310 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
1312 video_description = unescapeHTML(fd_mobj.group(1))
1314 video_description = u''
# --- subtitles / duration ---
1317 video_subtitles = self.extract_subtitles(video_id, video_webpage)
1319 if self._downloader.params.get('listsubtitles', False):
1320 self._list_available_subtitles(video_id, video_webpage)
1323 if 'length_seconds' not in video_info:
1324 self._downloader.report_warning(u'unable to extract video duration')
1327 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
1329 # Decide which formats to download
# Encrypted-signature detection via ytplayer.config (method continues past
# the end of this chunk).
1332 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
1334 raise ValueError('Could not find vevo ID')
1335 info = json.loads(mobj.group(1))
1337 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
1338 # this signatures are encrypted
1339 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
1341 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
1342 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
1343 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
1345 if 'url_encoded_fmt_stream_map' in video_info:
1346 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
1348 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
1349 elif 'adaptive_fmts' in video_info:
1350 if 'url_encoded_fmt_stream_map' in video_info:
1351 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
1353 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
1357 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
1358 self.report_rtmp_download()
1359 video_url_list = [(None, video_info['conn'][0])]
1360 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
1361 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
1362 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
1364 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
1365 url_data = compat_parse_qs(url_data_str)
1366 if 'itag' in url_data and 'url' in url_data:
1367 url = url_data['url'][0]
1368 if 'sig' in url_data:
1369 url += '&signature=' + url_data['sig'][0]
1370 elif 's' in url_data:
1371 encrypted_sig = url_data['s'][0]
1372 if self._downloader.params.get('verbose'):
1374 player_version = self._search_regex(
1376 player_url if player_url else None,
1377 'flash player', fatal=False)
1378 player_desc = 'flash player %s' % player_version
1380 player_version = self._search_regex(
1381 r'html5player-(.+?)\.js', video_webpage,
1382 'html5 player', fatal=False)
1383 player_desc = u'html5 player %s' % player_version
1385 parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
1386 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
1387 (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
1390 jsplayer_url_json = self._search_regex(
1391 r'"assets":.+?"js":\s*("[^"]+")',
1392 video_webpage, u'JS player URL')
1393 player_url = json.loads(jsplayer_url_json)
1395 signature = self._decrypt_signature(
1396 encrypted_sig, video_id, player_url, age_gate)
1397 url += '&signature=' + signature
1398 if 'ratebypass' not in url:
1399 url += '&ratebypass=yes'
1400 url_map[url_data['itag'][0]] = url
1401 video_url_list = self._get_video_url_list(url_map)
1402 if not video_url_list:
1404 elif video_info.get('hlsvp'):
1405 manifest_url = video_info['hlsvp'][0]
1406 url_map = self._extract_from_m3u8(manifest_url, video_id)
1407 video_url_list = self._get_video_url_list(url_map)
1408 if not video_url_list:
1412 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
1415 for format_param, video_real_url in video_url_list:
1417 video_extension = self._video_extensions.get(format_param, 'flv')
1419 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
1420 self._video_dimensions.get(format_param, '???'),
1421 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
1425 'url': video_real_url,
1426 'uploader': video_uploader,
1427 'uploader_id': video_uploader_id,
1428 'upload_date': upload_date,
1429 'title': video_title,
1430 'ext': video_extension,
1431 'format': video_format,
1432 'thumbnail': video_thumbnail,
1433 'description': video_description,
1434 'player_url': player_url,
1435 'subtitles': video_subtitles,
1436 'duration': video_duration
class YoutubePlaylistIE(InfoExtractor):
    """Extract every video of a YouTube playlist via the GData API."""
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                           |    p/
                        )
                        ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # (which compiles the pattern without flags) cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Extract playlist id
        match = re.match(self._VALID_URL, url, re.VERBOSE)
        if match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Either alternative of the pattern may have captured the id.
        playlist_id = match.group(1) or match.group(2)

        # Download playlist videos from API, collecting (position, url)
        # pairs so the playlist order can be restored afterwards.
        videos = []
        for page_num in itertools.count(1):
            start_index = self._MAX_RESULTS * (page_num - 1) + 1
            if start_index >= 1000:
                # The GData API refuses start indexes beyond 1000.
                self._downloader.report_warning(u'Max number of results reached')
                break
            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
            page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                raise ExtractorError(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            for entry in response['feed']['entry']:
                index = entry['yt$position']['$t']
                if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
                    videos.append((
                        index,
                        'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
                    ))

        # Restore original playlist ordering, then drop the position keys.
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Extract every video uploaded to a YouTube channel."""
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from *page*, first-seen order."""
        found = []
        for link in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = link.group(1)
            if video_id not in found:
                found.append(video_id)
        return found

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        url = self._TEMPLATE_URL % (channel_id, pagenum)
        page = self._download_webpage(url, channel_id,
                                      u'Downloading page #%s' % pagenum)

        # Extract video identifiers
        video_ids.extend(self.extract_videos_from_page(page))

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            for pagenum in itertools.count(1):
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                page = self._download_webpage(url, channel_id,
                                              u'Downloading page #%s' % pagenum)
                page = json.loads(page)

                video_ids.extend(self.extract_videos_from_page(page['content_html']))

                # The widget html tells us whether another ajax page exists.
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        watch_urls = ('http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids)
        url_entries = [self.url_result(eurl, 'Youtube') for eurl in watch_urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Extract every upload of a YouTube user via the GData API."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is very permissive and would match those too.
        other_ies = (klass for (name, klass) in globals().items()
                     if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []

        for pagenum in itertools.count(0):
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(gdata_url, username,
                                          u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Extract video identifiers
            ids_in_page = [entry['id']['$t'].split('/')[-1]
                           for entry in response['feed']['entry']]
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData JSON-C video search API."""
    IE_DESC = u'YouTube.com searches'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query.

        Pages through the API (50 results per page) until either *n*
        videos are collected or the API reports no more matches, and
        returns a playlist result of watch-page URLs.
        """
        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # idiom fix: use "not in" rather than "not 'items' in ..."
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            # list comprehension instead of list(generator); don't shadow
            # the builtin `id`.
            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API may report fewer total matches than requested.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    """Extract a YouTube show as one playlist per season."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(seasons)))
        return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist')
                for season in seasons]
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    _PAGING_STEP = 30
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """Ajax URL template with a %s slot left for the paging offset."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        # The step argument is available only in 2.7 or higher
        for page_index in itertools.count(0):
            paging = page_index * self._PAGING_STEP
            info = self._download_webpage(self._FEED_TEMPLATE % paging,
                                          u'%s feed' % self._FEED_NAME,
                                          u'Downloading page %s' % page_index)
            info = json.loads(info)
            feed_html = info['feed_html']
            matches = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
            ids = orderedSet(m.group(1) for m in matches)
            feed_entries.extend(self.url_result(video_id, 'Youtube') for video_id in ids)
            if info['paging'] is None:
                break
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's subscriptions."""
    # Fixed missing space before "(requires ...)" for consistency with the
    # sibling feed extractors' IE_DESC strings.
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's recommended videos."""
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the authenticated user's "Watch Later" list."""
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Watch Later is a per-user feed, so the ajax call must use
    # action_load_personal_feed instead of action_load_system_feed.
    _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the logged-in user's favourites page to its playlist."""
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        # The favourites page embeds an ordinary playlist; pull out its id
        # and delegate the actual extraction to YoutubePlaylistIE.
        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        favourites_list_id = self._search_regex(r'list=(.+?)["&]', webpage, u'favourites playlist id')
        return self.url_result(favourites_list_id, 'YoutubePlaylist')