X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2Fextractor%2Fyoutube.py;h=4e8db240d3f9d141cfa457c1d941d1b5399f0c67;hb=685e87b61f785b096745cda5ea64ea0b950f56d1;hp=d6c098706ac664ea96a47d97fd740e1742f7a628;hpb=fd545fc6d1a5b489830e80c421f7e09264e0c80f;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d6c098706..4e8db240d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter from ..swfinterp import SWFInterpreter from ..compat import ( compat_chr, + compat_kwargs, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, @@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return True + def _download_webpage(self, *args, **kwargs): + kwargs.setdefault('query', {})['disable_polymer'] = 'true' + return super(YoutubeBaseInfoExtractor, self)._download_webpage( + *args, **compat_kwargs(kwargs)) + def _real_initialize(self): if self._downloader is None: return @@ -326,6 +332,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/| (?:www\.)?deturl\.com/www\.youtube\.com/| (?:www\.)?pwnyoutube\.com/| + (?:www\.)?hooktube\.com/| (?:www\.)?yourepeat\.com/| tube\.majestyc\.net/| youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains @@ -673,6 +680,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }, }, # video_info is None (https://github.com/rg3/youtube-dl/issues/4421) + # YouTube Red ad is not captured for creator { 'url': '__2ABJjxzNo', 'info_dict': { @@ -1002,6 +1010,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'Skipping DASH manifest', ], }, + { + # The following content has been identified by the YouTube community + # as inappropriate or offensive to some audiences. + 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI', + 'info_dict': { + 'id': '6SJNVb0GnPI', + 'ext': 'mp4', + 'title': 'Race Differences in Intelligence', + 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1', + 'duration': 965, + 'upload_date': '20140124', + 'uploader': 'New Century Foundation', + 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg', + 'license': 'Standard YouTube License', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + }, { # itag 212 'url': '1t24XAntNCY', @@ -1269,37 +1298,57 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sub_lang_list[sub_lang] = sub_formats return sub_lang_list + def make_captions(sub_url, sub_langs): + parsed_sub_url = compat_urllib_parse_urlparse(sub_url) + caption_qs = compat_parse_qs(parsed_sub_url.query) + captions = {} + for sub_lang in sub_langs: + sub_formats = [] + for ext in self._SUBTITLE_FORMATS: + caption_qs.update({ + 'tlang': [sub_lang], + 'fmt': [ext], + }) + sub_url = compat_urlparse.urlunparse(parsed_sub_url._replace( + query=compat_urllib_parse_urlencode(caption_qs, True))) + sub_formats.append({ + 'url': sub_url, + 'ext': ext, + }) + captions[sub_lang] = sub_formats + return captions + + # New captions format as of 22.06.2017 + player_response = args.get('player_response') + if player_response and isinstance(player_response, compat_str): + player_response = self._parse_json( + player_response, video_id, fatal=False) + if player_response: + renderer = player_response['captions']['playerCaptionsTracklistRenderer'] + base_url = renderer['captionTracks'][0]['baseUrl'] + sub_lang_list = [] + for lang in renderer['translationLanguages']: + lang_code = lang.get('languageCode') + if lang_code: + sub_lang_list.append(lang_code) + return make_captions(base_url, sub_lang_list) + # Some videos don't provide ttsurl but rather caption_tracks and # caption_translation_languages (e.g. 20LmZk1hakA) + # Does not used anymore as of 22.06.2017 caption_tracks = args['caption_tracks'] caption_translation_languages = args['caption_translation_languages'] caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] - parsed_caption_url = compat_urllib_parse_urlparse(caption_url) - caption_qs = compat_parse_qs(parsed_caption_url.query) - - sub_lang_list = {} + sub_lang_list = [] for lang in caption_translation_languages.split(','): lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) sub_lang = lang_qs.get('lc', [None])[0] - if not sub_lang: - continue - sub_formats = [] - for ext in self._SUBTITLE_FORMATS: - caption_qs.update({ - 'tlang': [sub_lang], - 'fmt': [ext], - }) - sub_url = compat_urlparse.urlunparse(parsed_caption_url._replace( - query=compat_urllib_parse_urlencode(caption_qs, True))) - sub_formats.append({ - 'url': sub_url, - 'ext': ext, - }) - sub_lang_list[sub_lang] = sub_formats - return sub_lang_list + if sub_lang: + sub_lang_list.append(sub_lang) + return make_captions(caption_url, sub_lang_list) # An extractor error can be raise by the download process if there are # no automatic captions but there are subtitles - except (KeyError, ExtractorError): + except (KeyError, IndexError, ExtractorError): self._downloader.report_warning(err_msg) return {} @@ -1326,6 +1375,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor): playback_url, video_id, 'Marking watched', 'Unable to mark watched', fatal=False) + @staticmethod + def _extract_urls(webpage): + # Embedded YouTube player + entries = [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer(r'''(?x) + (?: + ]+?src=| + data-video-url=| + ]+?src=| + embedSWF\(?:\s*| + ]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/.+?) + \1''', webpage)] + + # lazyYT YouTube embed + entries.extend(list(map( + unescapeHTML, + re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + + # Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)]+ + class=(?P[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P[\'"])([^\'"]+)(?P=q2)''', webpage) + entries.extend(m[-1] for m in matches) + + return entries + + @staticmethod + def _extract_url(webpage): + urls = YoutubeIE._extract_urls(webpage) + return urls[0] if urls else None + @classmethod def extract_id(cls, url): mobj = re.match(cls._VALID_URL, url, re.VERBOSE) @@ -1353,10 +1439,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): start_time = parse_duration(time_point) if start_time is None: continue + if start_time > duration: + break end_time = (duration if next_num == len(chapter_lines) else parse_duration(chapter_lines[next_num][1])) if end_time is None: continue + if end_time > duration: + end_time = duration + if start_time > end_time: + break chapter_title = re.sub( r']+>[^<]+', '', chapter_line).strip(' \t-') chapter_title = re.sub(r'\s+', ' ', chapter_title) @@ -1410,9 +1502,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if dash_mpd and dash_mpd[0] not in dash_mpds: dash_mpds.append(dash_mpd[0]) + is_live = None + view_count = None + + def extract_view_count(v_info): + return int_or_none(try_get(v_info, lambda x: x['view_count'][0])) + # Get video info embed_webpage = None - is_live = None if re.search(r'player-age-gate-content">', video_webpage) is not None: age_gate = True # We simulate the access to the video from www.youtube.com/v/{video_id} @@ -1482,6 +1579,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue get_video_info = compat_parse_qs(video_info_webpage) add_dash_mpd(get_video_info) + if view_count is None: + view_count = extract_view_count(get_video_info) if not video_info: video_info = get_video_info if 'token' in get_video_info: @@ -1531,7 +1630,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class="[^"]*"[^>]*> [^<]+\.{3}\s* - ''', r'\1', video_description) + ''', lambda m: compat_urlparse.urljoin(url, m.group(1)), video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'', + r'', video_webpage) if mobj is not None: video_uploader_id = mobj.group('uploader_id') @@ -1612,10 +1709,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not upload_date: upload_date = self._search_regex( [r'(?s)id="eow-date.*?>(.*?)', - r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)'], + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], video_webpage, 'upload date', default=None) - if upload_date: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) video_license = self._html_search_regex( @@ -1623,7 +1718,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'license', default=None) m_music = re.search( - r']+class="title"[^>]*>\s*Music\s*\s*]*>\s*
  • (?P.+?) by (?P<creator>.+?)(?:\(.+?\))?</li', + r'''(?x) + <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s* + <ul[^>]*>\s* + <li>(?P<title>.+?) + by (?P<creator>.+?) + (?: + \(.+?\)| + <a[^>]* + (?: + \bhref=["\']/red[^>]*>| # drop possible + >\s*Listen ad-free with YouTube Red # YouTube Red ad + ) + .*? + )?</li + ''', video_webpage) if m_music: video_alt_title = remove_quotes(unescapeHTML(m_music.group('title'))) @@ -1715,12 +1824,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): format_id = url_data['itag'][0] url = url_data['url'][0] - if 'sig' in url_data: - url += '&signature=' + url_data['sig'][0] - elif 's' in url_data: - encrypted_sig = url_data['s'][0] + if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' - jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, @@ -1741,6 +1846,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_webpage, 'age gate player URL') player_url = json.loads(player_url_json) + if 'sig' in url_data: + url += '&signature=' + url_data['sig'][0] + elif 's' in url_data: + encrypted_sig = url_data['s'][0] + if self._downloader.params.get('verbose'): if player_url is None: player_version = 'unknown' @@ -1930,39 +2040,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } -class YoutubeSharedVideoIE(InfoExtractor): - _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P<id>[0-9A-Za-z_-]{11})' - IE_NAME = 'youtube:shared' - - _TEST = { - 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', - 'info_dict': { - 'id': 'uPDB5I9wfp8', - 'ext': 'webm', - 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', - 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', - 'upload_date': '20160219', - 'uploader': 'Pocoyo - Português (BR)', - 'uploader_id': 'PocoyoBrazil', - }, - 'add_ie': ['Youtube'], - 'params': { - # There are already too many Youtube downloads - 'skip_download': True, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - real_video_id = self._html_search_meta( - 'videoId', webpage, 'YouTube video id', fatal=True) - - return self.url_result(real_video_id, YoutubeIE.ie_key()) - - class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: @@ -1986,7 +2063,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): | (%(playlist_id)s) )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} - _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true' + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?' IE_NAME = 'youtube:playlist' _TESTS = [{