8 import xml.etree.ElementTree
10 from .common import InfoExtractor, SearchInfoExtractor
11 from .subtitles import SubtitlesInfoExtractor
17 compat_urllib_request,
28 class YoutubeBaseInfoExtractor(InfoExtractor):
29 """Provide base functions for Youtube extractors"""
30 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
31 _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
32 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
33 _NETRC_MACHINE = 'youtube'
34 # If True it will raise an error if no login info is provided
35 _LOGIN_REQUIRED = False
def report_lang(self):
    """Announce on screen that the language is about to be set."""
    message = u'Setting language'
    self.to_screen(message)
41 def _set_language(self):
42 request = compat_urllib_request.Request(self._LANG_URL)
45 compat_urllib_request.urlopen(request).read()
46 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
47 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
52 (username, password) = self._get_login_info()
53 # No authentication to be performed
55 if self._LOGIN_REQUIRED:
56 raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
59 request = compat_urllib_request.Request(self._LOGIN_URL)
61 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
62 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
63 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
68 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
71 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
77 u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
81 u'PersistentCookie': u'yes',
83 u'bgresponse': u'js_disabled',
84 u'checkConnection': u'',
85 u'checkedDomains': u'youtube',
91 u'signIn': u'Sign in',
93 u'service': u'youtube',
97 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
99 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
100 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
101 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
104 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
105 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
106 self._downloader.report_warning(u'unable to log in: bad username or password')
108 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
109 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
113 def _confirm_age(self):
116 'action_confirm': 'Confirm',
118 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
120 self.report_age_confirmation()
121 compat_urllib_request.urlopen(request).read().decode('utf-8')
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
123 raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
126 def _real_initialize(self):
127 if self._downloader is None:
129 if not self._set_language():
131 if not self._login():
136 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
137 IE_DESC = u'YouTube.com'
140 (?:https?://)? # http(s):// (optional)
141 (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
142 tube\.majestyc\.net/|
143 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
144 (?:.*?\#/)? # handle anchor (#/) redirect urls
145 (?: # the various things that can precede the ID:
146 (?:(?:v|embed|e)/) # v/ or embed/ or e/
147 |(?: # or the v= param in all its forms
148 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
149 (?:\?|\#!?) # the params delimiter ? or # or #!
150 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
154 |youtu\.be/ # just youtu.be/xxxx
156 )? # all until now is optional -> you can pass the naked ID
157 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
158 (?(1).+)? # if we found the ID, everything can follow
160 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
161 # Listed in order of quality
162 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '36', '17', '13',
163 # Apple HTTP Live Streaming
164 '96', '95', '94', '93', '92', '132', '151',
166 '85', '84', '102', '83', '101', '82', '100',
168 '138', '137', '248', '136', '247', '135', '246',
169 '245', '244', '134', '243', '133', '242', '160',
171 '141', '172', '140', '171', '139',
173 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '36', '17', '13',
174 # Apple HTTP Live Streaming
175 '96', '95', '94', '93', '92', '132', '151',
177 '85', '102', '84', '101', '83', '100', '82',
179 '138', '248', '137', '247', '136', '246', '245',
180 '244', '135', '243', '134', '242', '133', '160',
182 '172', '141', '171', '140', '139',
184 _video_formats_map = {
185 'flv': ['35', '34', '6', '5'],
186 '3gp': ['36', '17', '13'],
187 'mp4': ['38', '37', '22', '18'],
188 'webm': ['46', '45', '44', '43'],
190 _video_extensions = {
212 # Apple HTTP Live Streaming
244 _video_dimensions = {
326 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
327 u"file": u"BaW_jenozKc.mp4",
329 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
330 u"uploader": u"Philipp Hagemeister",
331 u"uploader_id": u"phihag",
332 u"upload_date": u"20121002",
333 u"description": u"test chars: \"'/\\ä↭𝕐\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ."
337 u"url": u"http://www.youtube.com/watch?v=1ltcDfZMA3U",
338 u"file": u"1ltcDfZMA3U.flv",
339 u"note": u"Test VEVO video (#897)",
341 u"upload_date": u"20070518",
342 u"title": u"Maps - It Will Find You",
343 u"description": u"Music video by Maps performing It Will Find You.",
344 u"uploader": u"MuteUSA",
345 u"uploader_id": u"MuteUSA"
349 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
350 u"file": u"UxxajLWwzqY.mp4",
351 u"note": u"Test generic use_cipher_signature video (#897)",
353 u"upload_date": u"20120506",
354 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
355 u"description": u"md5:3e2666e0a55044490499ea45fe9037b7",
356 u"uploader": u"Icona Pop",
357 u"uploader_id": u"IconaPop"
361 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
362 u"file": u"07FYdnEawAQ.mp4",
363 u"note": u"Test VEVO video with age protection (#956)",
365 u"upload_date": u"20130703",
366 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
367 u"description": u"md5:64249768eec3bc4276236606ea996373",
368 u"uploader": u"justintimberlakeVEVO",
369 u"uploader_id": u"justintimberlakeVEVO"
373 u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
374 u'file': u'TGi3HqYrWHE.mp4',
375 u'note': u'm3u8 video',
377 u'title': u'Triathlon - Men - London 2012 Olympic Games',
378 u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
379 u'uploader': u'olympic',
380 u'upload_date': u'20120807',
381 u'uploader_id': u'olympic',
384 u'skip_download': True,
def suitable(cls, url):
    """Tell whether this extractor should handle *url*.

    URLs accepted by YoutubePlaylistIE are always rejected here, even when
    they would also match this extractor's own _VALID_URL pattern.
    """
    claimed_by_playlist = YoutubePlaylistIE.suitable(url)
    if claimed_by_playlist:
        return False
    # _VALID_URL is written in verbose form, hence the re.VERBOSE flag.
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
def report_video_webpage_download(self, video_id):
    """Announce on screen that the video webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce on screen that the video info webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_information_extraction(self, video_id):
    """Announce on screen that information for *video_id* is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce on screen that *format* is not available for *video_id*."""
    details = (video_id, format)
    self.to_screen(u'%s: Format %s not available' % details)
def report_rtmp_download(self):
    """Announce on screen that an RTMP download was detected."""
    message = u'RTMP download detected'
    self.to_screen(message)
416 def _decrypt_signature(self, s):
417 """Turn the encrypted s field into a working signature"""
420 return s[86:29:-1] + s[88] + s[28:5:-1]
422 return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
424 return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
426 return s[84:78:-1] + s[87] + s[77:60:-1] + s[0] + s[59:3:-1]
428 return s[7:28] + s[87] + s[29:45] + s[55] + s[46:55] + s[2] + s[56:87] + s[28]
430 return s[6:27] + s[4] + s[28:39] + s[27] + s[40:59] + s[2] + s[60:]
432 return s[5:34] + s[0] + s[35:38] + s[3] + s[39:45] + s[38] + s[46:53] + s[73] + s[54:73] + s[85] + s[74:85] + s[53]
434 return s[3:11] + s[0] + s[12:55] + s[84] + s[56:84]
436 return s[81:36:-1] + s[0] + s[35:2:-1]
438 return s[81:64:-1] + s[82] + s[63:52:-1] + s[45] + s[51:45:-1] + s[1] + s[44:1:-1] + s[0]
440 return s[80:73:-1] + s[81] + s[72:54:-1] + s[2] + s[53:43:-1] + s[0] + s[42:2:-1] + s[43] + s[1] + s[54]
442 return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
444 return s[1:19] + s[0] + s[20:68] + s[19] + s[69:80]
446 return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
449 raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
451 def _decrypt_signature_age_gate(self, s):
452 # The videos with age protection use another player, so the algorithms
455 return s[2:63] + s[82] + s[64:82] + s[63]
457 # Fall back to the other algorithms
458 return self._decrypt_signature(s)
460 def _get_available_subtitles(self, video_id):
462 sub_list = self._download_webpage(
463 'http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
464 video_id, note=False)
465 except ExtractorError as err:
466 self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
468 lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
473 params = compat_urllib_parse.urlencode({
476 'fmt': self._downloader.params.get('subtitlesformat'),
478 url = u'http://www.youtube.com/api/timedtext?' + params
479 sub_lang_list[lang] = url
480 if not sub_lang_list:
481 self._downloader.report_warning(u'video doesn\'t have subtitles')
485 def _get_available_automatic_caption(self, video_id, webpage):
486 """We need the webpage for getting the captions url, pass it as an
487 argument to speed up the process."""
488 sub_format = self._downloader.params.get('subtitlesformat')
489 self.to_screen(u'%s: Looking for automatic captions' % video_id)
490 mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
491 err_msg = u'Couldn\'t find automatic captions for %s' % video_id
493 self._downloader.report_warning(err_msg)
495 player_config = json.loads(mobj.group(1))
497 args = player_config[u'args']
498 caption_url = args[u'ttsurl']
499 timestamp = args[u'timestamp']
500 # We get the available subtitles
501 list_params = compat_urllib_parse.urlencode({
506 list_url = caption_url + '&' + list_params
507 list_page = self._download_webpage(list_url, video_id)
508 caption_list = xml.etree.ElementTree.fromstring(list_page.encode('utf-8'))
509 original_lang_node = caption_list.find('track')
510 if original_lang_node.attrib.get('kind') != 'asr' :
511 self._downloader.report_warning(u'Video doesn\'t have automatic captions')
513 original_lang = original_lang_node.attrib['lang_code']
516 for lang_node in caption_list.findall('target'):
517 sub_lang = lang_node.attrib['lang_code']
518 params = compat_urllib_parse.urlencode({
519 'lang': original_lang,
525 sub_lang_list[sub_lang] = caption_url + '&' + params
527 # An extractor error can be raised by the download process if there are
528 # no automatic captions but there are subtitles
529 except (KeyError, ExtractorError):
530 self._downloader.report_warning(err_msg)
533 def _print_formats(self, formats):
534 print('Available formats:')
536 print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
537 self._video_dimensions.get(x, '???'),
538 ' ('+self._special_itags[x]+')' if x in self._special_itags else ''))
540 def _extract_id(self, url):
541 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
543 raise ExtractorError(u'Invalid URL: %s' % url)
544 video_id = mobj.group(2)
547 def _get_video_url_list(self, url_map):
549 Transform a dictionary in the format {itag:url} to a list of (itag, url)
550 with the requested formats.
552 req_format = self._downloader.params.get('format', None)
553 format_limit = self._downloader.params.get('format_limit', None)
554 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
555 if format_limit is not None and format_limit in available_formats:
556 format_list = available_formats[available_formats.index(format_limit):]
558 format_list = available_formats
559 existing_formats = [x for x in format_list if x in url_map]
560 if len(existing_formats) == 0:
561 raise ExtractorError(u'no known formats available for video')
562 if self._downloader.params.get('listformats', None):
563 self._print_formats(existing_formats)
565 if req_format is None or req_format == 'best':
566 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
567 elif req_format == 'worst':
568 video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
569 elif req_format in ('-1', 'all'):
570 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
572 # Specific formats. We pick the first in a slash-delimited sequence.
573 # Format can be specified as itag or 'mp4' or 'flv' etc. We pick the highest quality
574 # available in the specified format. For example,
575 # if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
576 # if '1/mp4/3/4' is requested and '1' and '5' (is a mp4) are available, we pick '1'.
577 # if '1/mp4/3/4' is requested and '4' and '5' (is a mp4) are available, we pick '5'.
578 req_formats = req_format.split('/')
579 video_url_list = None
580 for rf in req_formats:
582 video_url_list = [(rf, url_map[rf])]
584 if rf in self._video_formats_map:
585 for srf in self._video_formats_map[rf]:
587 video_url_list = [(srf, url_map[srf])]
592 if video_url_list is None:
593 raise ExtractorError(u'requested format not available')
594 return video_url_list
596 def _extract_from_m3u8(self, manifest_url, video_id):
598 def _get_urls(_manifest):
599 lines = _manifest.split('\n')
600 urls = filter(lambda l: l and not l.startswith('#'),
603 manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
604 formats_urls = _get_urls(manifest)
605 for format_url in formats_urls:
606 itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
607 url_map[itag] = format_url
610 def _real_extract(self, url):
611 if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
612 self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
614 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
615 mobj = re.search(self._NEXT_URL_RE, url)
617 url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
618 video_id = self._extract_id(url)
621 self.report_video_webpage_download(video_id)
622 url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
623 request = compat_urllib_request.Request(url)
625 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
626 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
627 raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err))
629 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
631 # Attempt to extract SWF player URL
632 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
634 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
639 self.report_video_info_webpage_download(video_id)
640 if re.search(r'player-age-gate-content">', video_webpage) is not None:
641 self.report_age_confirmation()
643 # We simulate the access to the video from www.youtube.com/v/{video_id}
644 # this can be viewed without logging into YouTube
645 data = compat_urllib_parse.urlencode({'video_id': video_id,
649 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
653 video_info_url = 'https://www.youtube.com/get_video_info?' + data
654 video_info_webpage = self._download_webpage(video_info_url, video_id,
656 errnote='unable to download video info webpage')
657 video_info = compat_parse_qs(video_info_webpage)
660 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
661 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
662 % (video_id, el_type))
663 video_info_webpage = self._download_webpage(video_info_url, video_id,
665 errnote='unable to download video info webpage')
666 video_info = compat_parse_qs(video_info_webpage)
667 if 'token' in video_info:
669 if 'token' not in video_info:
670 if 'reason' in video_info:
671 raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
673 raise ExtractorError(u'"token" parameter not in video info for unknown reason')
675 # Check for "rental" videos
676 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
677 raise ExtractorError(u'"rental" videos not supported')
679 # Start extracting information
680 self.report_information_extraction(video_id)
683 if 'author' not in video_info:
684 raise ExtractorError(u'Unable to extract uploader name')
685 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
688 video_uploader_id = None
689 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
691 video_uploader_id = mobj.group(1)
693 self._downloader.report_warning(u'unable to extract uploader nickname')
696 if 'title' not in video_info:
697 raise ExtractorError(u'Unable to extract video title')
698 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
701 # We try first to get a high quality image:
702 m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
703 video_webpage, re.DOTALL)
704 if m_thumb is not None:
705 video_thumbnail = m_thumb.group(1)
706 elif 'thumbnail_url' not in video_info:
707 self._downloader.report_warning(u'unable to extract video thumbnail')
709 else: # don't panic if we can't find it
710 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
714 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
716 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
717 upload_date = unified_strdate(upload_date)
720 video_description = get_element_by_id("eow-description", video_webpage)
721 if video_description:
722 video_description = clean_html(video_description)
724 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
726 video_description = unescapeHTML(fd_mobj.group(1))
728 video_description = u''
731 video_subtitles = self.extract_subtitles(video_id, video_webpage)
733 if self._downloader.params.get('listsubtitles', False):
734 self._list_available_subtitles(video_id, video_webpage)
737 if 'length_seconds' not in video_info:
738 self._downloader.report_warning(u'unable to extract video duration')
741 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
743 # Decide which formats to download
746 mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
748 raise ValueError('Could not find vevo ID')
749 info = json.loads(mobj.group(1))
751 # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
752 # these signatures are encrypted
753 m_s = re.search(r'[&,]s=', args['url_encoded_fmt_stream_map'])
755 self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
756 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
757 m_s = re.search(r'[&,]s=', args.get('adaptive_fmts', u''))
759 if 'url_encoded_fmt_stream_map' in video_info:
760 video_info['url_encoded_fmt_stream_map'][0] += ',' + args['adaptive_fmts']
762 video_info['url_encoded_fmt_stream_map'] = [args['adaptive_fmts']]
763 elif 'adaptive_fmts' in video_info:
764 if 'url_encoded_fmt_stream_map' in video_info:
765 video_info['url_encoded_fmt_stream_map'][0] += ',' + video_info['adaptive_fmts'][0]
767 video_info['url_encoded_fmt_stream_map'] = video_info['adaptive_fmts']
771 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
772 self.report_rtmp_download()
773 video_url_list = [(None, video_info['conn'][0])]
774 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
775 if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
776 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
778 for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
779 url_data = compat_parse_qs(url_data_str)
780 if 'itag' in url_data and 'url' in url_data:
781 url = url_data['url'][0]
782 if 'sig' in url_data:
783 url += '&signature=' + url_data['sig'][0]
784 elif 's' in url_data:
785 if self._downloader.params.get('verbose'):
788 player = 'flash player'
790 player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
791 'html5 player', fatal=False)
792 parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
793 self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
794 (len(s), parts_sizes, url_data['itag'][0], player))
795 encrypted_sig = url_data['s'][0]
797 signature = self._decrypt_signature_age_gate(encrypted_sig)
799 signature = self._decrypt_signature(encrypted_sig)
800 url += '&signature=' + signature
801 if 'ratebypass' not in url:
802 url += '&ratebypass=yes'
803 url_map[url_data['itag'][0]] = url
804 video_url_list = self._get_video_url_list(url_map)
805 if not video_url_list:
807 elif video_info.get('hlsvp'):
808 manifest_url = video_info['hlsvp'][0]
809 url_map = self._extract_from_m3u8(manifest_url, video_id)
810 video_url_list = self._get_video_url_list(url_map)
811 if not video_url_list:
815 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
818 for format_param, video_real_url in video_url_list:
820 video_extension = self._video_extensions.get(format_param, 'flv')
822 video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
823 self._video_dimensions.get(format_param, '???'),
824 ' ('+self._special_itags[format_param]+')' if format_param in self._special_itags else '')
828 'url': video_real_url,
829 'uploader': video_uploader,
830 'uploader_id': video_uploader_id,
831 'upload_date': upload_date,
832 'title': video_title,
833 'ext': video_extension,
834 'format': video_format,
835 'thumbnail': video_thumbnail,
836 'description': video_description,
837 'player_url': player_url,
838 'subtitles': video_subtitles,
839 'duration': video_duration
843 class YoutubePlaylistIE(InfoExtractor):
844 IE_DESC = u'YouTube.com playlists'
850 (?:course|view_play_list|my_playlists|artist|playlist|watch)
851 \? (?:.*?&)*? (?:p|a|list)=
854 ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
857 ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
859 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
861 IE_NAME = u'youtube:playlist'
def suitable(cls, url):
    """Tell whether *url* matches this extractor's verbose _VALID_URL pattern."""
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
868 def _real_extract(self, url):
869 # Extract playlist id
870 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
872 raise ExtractorError(u'Invalid URL: %s' % url)
874 # Download playlist videos from API
875 playlist_id = mobj.group(1) or mobj.group(2)
878 for page_num in itertools.count(1):
879 start_index = self._MAX_RESULTS * (page_num - 1) + 1
880 if start_index >= 1000:
881 self._downloader.report_warning(u'Max number of results reached')
883 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
884 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
887 response = json.loads(page)
888 except ValueError as err:
889 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
891 if 'feed' not in response:
892 raise ExtractorError(u'Got a malformed response from YouTube API')
893 playlist_title = response['feed']['title']['$t']
894 if 'entry' not in response['feed']:
895 # Number of videos is a multiple of self._MAX_RESULTS
898 for entry in response['feed']['entry']:
899 index = entry['yt$position']['$t']
900 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
903 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
906 videos = [v[1] for v in sorted(videos)]
908 url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
909 return [self.playlist_result(url_results, playlist_id, playlist_title)]
912 class YoutubeChannelIE(InfoExtractor):
913 IE_DESC = u'YouTube.com channels'
914 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
915 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
916 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
917 _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
918 IE_NAME = u'youtube:channel'
920 def extract_videos_from_page(self, page):
922 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
923 if mobj.group(1) not in ids_in_page:
924 ids_in_page.append(mobj.group(1))
927 def _real_extract(self, url):
929 mobj = re.match(self._VALID_URL, url)
931 raise ExtractorError(u'Invalid URL: %s' % url)
933 # Download channel page
934 channel_id = mobj.group(1)
938 url = self._TEMPLATE_URL % (channel_id, pagenum)
939 page = self._download_webpage(url, channel_id,
940 u'Downloading page #%s' % pagenum)
942 # Extract video identifiers
943 ids_in_page = self.extract_videos_from_page(page)
944 video_ids.extend(ids_in_page)
946 # Download any subsequent channel pages using the json-based channel_ajax query
947 if self._MORE_PAGES_INDICATOR in page:
948 for pagenum in itertools.count(1):
949 url = self._MORE_PAGES_URL % (pagenum, channel_id)
950 page = self._download_webpage(url, channel_id,
951 u'Downloading page #%s' % pagenum)
953 page = json.loads(page)
955 ids_in_page = self.extract_videos_from_page(page['content_html'])
956 video_ids.extend(ids_in_page)
958 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
961 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
963 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
964 url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
965 return [self.playlist_result(url_entries, channel_id)]
968 class YoutubeUserIE(InfoExtractor):
969 IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
970 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?)|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
971 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
972 _GDATA_PAGE_SIZE = 50
973 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
974 IE_NAME = u'youtube:user'
def suitable(cls, url):
    """Defer to every other extractor in this module before claiming *url*.

    This extractor's own pattern is too permissive and would match URLs
    that other YouTube extractors handle, so each module-level name ending
    in 'IE' (other than this class) is asked first.  The inherited
    suitable() check runs only when none of them accepts the URL.
    """
    for name, klass in globals().items():
        if not name.endswith('IE') or klass is cls:
            continue
        if klass.suitable(url):
            return False
    return super(YoutubeUserIE, cls).suitable(url)
984 def _real_extract(self, url):
986 mobj = re.match(self._VALID_URL, url)
988 raise ExtractorError(u'Invalid URL: %s' % url)
990 username = mobj.group(1)
992 # Download video ids using YouTube Data API. Result size per
993 # query is limited (currently to 50 videos) so we need to query
994 # page by page until there are no video ids - it means we got
999 for pagenum in itertools.count(0):
1000 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1002 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1003 page = self._download_webpage(gdata_url, username,
1004 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1007 response = json.loads(page)
1008 except ValueError as err:
1009 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
1010 if 'entry' not in response['feed']:
1011 # Number of videos is a multiple of self._GDATA_PAGE_SIZE
1014 # Extract video identifiers
1016 for entry in response['feed']['entry']:
1017 ids_in_page.append(entry['id']['$t'].split('/')[-1])
1018 video_ids.extend(ids_in_page)
1020 # A little optimization - if current page is not
1021 # "full", ie. does not contain PAGE_SIZE video ids then
1022 # we can assume that this page is the last one - there
1023 # are no more ids on further pages - no need to query
1026 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1029 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1030 url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
1031 return [self.playlist_result(url_results, playlist_title = username)]
1033 class YoutubeSearchIE(SearchInfoExtractor):
1034 IE_DESC = u'YouTube.com searches'
1035 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1037 IE_NAME = u'youtube:search'
1038 _SEARCH_KEY = 'ytsearch'
def report_download_page(self, query, pagenum):
    """Announce on screen that page *pagenum* of the results for *query* is being downloaded."""
    notice = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
    self._downloader.to_screen(notice)
1044 def _get_n_results(self, query, n):
1045 """Get a specified number of results for a query"""
1051 while (50 * pagenum) < limit:
1052 self.report_download_page(query, pagenum+1)
1053 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1054 request = compat_urllib_request.Request(result_url)
1056 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1057 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1058 raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
1059 api_response = json.loads(data)['data']
1061 if not 'items' in api_response:
1062 raise ExtractorError(u'[youtube] No video results')
1064 new_ids = list(video['id'] for video in api_response['items'])
1065 video_ids += new_ids
1067 limit = min(n, api_response['totalItems'])
1070 if len(video_ids) > n:
1071 video_ids = video_ids[:n]
1072 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1073 return self.playlist_result(videos, query)
class YoutubeShowIE(InfoExtractor):
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    # The single capture group is the show name taken from the URL path.
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        """Return one YoutubePlaylist url_result per season linked from the show page.

        The show page is downloaded and scanned for playlist links; there is
        one playlist for each season of the show.
        """
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
1091 class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
1093 Base class for extractors that fetch info from
1094 http://www.youtube.com/feed_ajax
1095 Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
1097 _LOGIN_REQUIRED = True
1099 # use action_load_personal_feed instead of action_load_system_feed
1100 _PERSONAL_FEED = False
def _FEED_TEMPLATE(self):
    """Build the feed_ajax URL template for this feed.

    The returned string still contains one '%s' placeholder, to be filled
    with the paging offset.  The action parameter depends on whether the
    feed is flagged as personal via _PERSONAL_FEED.
    """
    action = 'action_load_personal_feed' if self._PERSONAL_FEED else 'action_load_system_feed'
    return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
1111 return u'youtube:%s' % self._FEED_NAME
1113 def _real_initialize(self):
1116 def _real_extract(self, url):
1118 # The step argument is available only in 2.7 or higher
1119 for i in itertools.count(0):
1120 paging = i*self._PAGING_STEP
1121 info = self._download_webpage(self._FEED_TEMPLATE % paging,
1122 u'%s feed' % self._FEED_NAME,
1123 u'Downloading page %s' % i)
1124 info = json.loads(info)
1125 feed_html = info['feed_html']
1126 m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
1127 ids = orderedSet(m.group(1) for m in m_ids)
1128 feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
1129 if info['paging'] is None:
1131 return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    # Feed identifier and playlist title read by the YoutubeFeedsInfoExtractor base class.
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    # Accepts the subscriptions feed URL, or the ":ytsubs" / ":ytsubscriptions" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed identifier and playlist title read by the YoutubeFeedsInfoExtractor base class.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Accepts the recommended feed URL, or the ":ytrec" / ":ytrecommended" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1145 class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
1146 IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1147 _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1148 _FEED_NAME = 'watch_later'
1149 _PLAYLIST_TITLE = u'Youtube Watch Later'
1151 _PERSONAL_FEED = True
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    IE_NAME = u'youtube:favorites'
    # Accepts the my_favorites URL, or the ":ytfav" / ":ytfavorites" / ":ytfavourites" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Hand the favourites over to the playlist extractor.

        The my_favorites page is downloaded and searched for a 'list='
        value, which is then returned as a YoutubePlaylist url_result.
        The *url* argument itself is not used.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')