X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=677907aba39f444444a6b61a411c29f2e29ba9c2;hb=936784b272db3f85f5ff5bdd2d5a71e0397ee7bd;hp=f698a5627bb552ffa3383e03044af698d592bbb2;hpb=e7f3529f68ee7c8ca78366d37f851cb31fa00f31;p=youtube-dl
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index f698a5627..677907aba 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -37,6 +37,7 @@ from ..utils import (
orderedSet,
parse_codecs,
parse_duration,
+ qualities,
remove_quotes,
remove_start,
smuggle_url,
@@ -84,10 +85,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
"""
- (username, password) = self._get_login_info()
+ username, password = self._get_login_info()
# No authentication to be performed
if username is None:
- if self._LOGIN_REQUIRED:
+ if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
return True
@@ -246,9 +247,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return True
- def _download_webpage(self, *args, **kwargs):
+ def _download_webpage_handle(self, *args, **kwargs):
kwargs.setdefault('query', {})['disable_polymer'] = 'true'
- return super(YoutubeBaseInfoExtractor, self)._download_webpage(
+ return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
def _real_initialize(self):
@@ -509,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
'license': 'Standard YouTube License',
'creator': 'Icona Pop',
+ 'track': 'I Love It (feat. Charli XCX)',
+ 'artist': 'Icona Pop',
}
},
{
@@ -527,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
'license': 'Standard YouTube License',
'creator': 'Justin Timberlake',
+ 'track': 'Tunnel Vision`',
+ 'artist': 'Justin Timberlake',
'age_limit': 18,
}
},
@@ -1537,7 +1542,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
if ytplayer_config:
args = ytplayer_config['args']
- if args.get('url_encoded_fmt_stream_map'):
+ if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
# Convert to the same format returned by compat_parse_qs
video_info = dict((k, [v]) for k, v in args.items())
add_dash_mpd(video_info)
@@ -1596,6 +1601,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if 'token' not in video_info:
video_info = get_video_info
break
+
+ def extract_unavailable_message():
+ return self._html_search_regex(
+ r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
+ video_webpage, 'unavailable message', default=None)
+
if 'token' not in video_info:
if 'reason' in video_info:
if 'The uploader has not made this video available in your country.' in video_info['reason']:
@@ -1604,8 +1615,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
countries = regions_allowed.split(',') if regions_allowed else None
self.raise_geo_restricted(
msg=video_info['reason'][0], countries=countries)
+ reason = video_info['reason'][0]
+ if 'Invalid parameters' in reason:
+ unavailable_message = extract_unavailable_message()
+ if unavailable_message:
+ reason = unavailable_message
raise ExtractorError(
- 'YouTube said: %s' % video_info['reason'][0],
+ 'YouTube said: %s' % reason,
expected=True, video_id=video_id)
else:
raise ExtractorError(
@@ -1686,9 +1702,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
self.report_information_extraction(video_id)
# uploader
- if 'author' not in video_info:
- raise ExtractorError('Unable to extract uploader name')
- video_uploader = compat_urllib_parse_unquote_plus(video_info['author'][0])
+ video_uploader = try_get(video_info, lambda x: x['author'][0], compat_str)
+ if video_uploader:
+ video_uploader = compat_urllib_parse_unquote_plus(video_uploader)
+ else:
+ self._downloader.report_warning('unable to extract uploader name')
# uploader_id
video_uploader_id = None
@@ -1751,6 +1769,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
video_alt_title = video_creator = None
+ def extract_meta(field):
+ return self._html_search_regex(
+ r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field,
+ video_webpage, field, default=None)
+
+ track = extract_meta('Song')
+ artist = extract_meta('Artist')
+
m_episode = re.search(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)
@@ -1802,6 +1828,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
chapters = self._extract_chapters(description_original, video_duration)
+ def _extract_filesize(media_url):
+ return int_or_none(self._search_regex(
+ r'\bclen[=/](\d+)', media_url, 'filesize', default=None))
+
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
formats = [{
@@ -1827,6 +1857,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'width': int_or_none(width_height[0]),
'height': int_or_none(width_height[1]),
}
+ q = qualities(['small', 'medium', 'hd720'])
formats = []
for url_data_str in encoded_url_map.split(','):
url_data = compat_parse_qs(url_data_str)
@@ -1906,13 +1937,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+ filesize = int_or_none(url_data.get(
+ 'clen', [None])[0]) or _extract_filesize(url)
+
+ quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
+
more_fields = {
- 'filesize': int_or_none(url_data.get('clen', [None])[0]),
+ 'filesize': filesize,
'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
'width': width,
'height': height,
'fps': int_or_none(url_data.get('fps', [None])[0]),
- 'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
+ 'format_note': quality,
+ 'quality': q(quality),
}
for key, value in more_fields.items():
if value:
@@ -1933,6 +1970,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
break
if codecs:
dct.update(parse_codecs(codecs))
+ if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none':
+ dct['downloader_options'] = {
+ # Youtube throttles chunks >~10M
+ 'http_chunk_size': 10485760,
+ }
formats.append(dct)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
@@ -1953,11 +1995,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
formats.append(a_format)
else:
- unavailable_message = self._html_search_regex(
- r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
- video_webpage, 'unavailable message', default=None)
- if unavailable_message:
- raise ExtractorError(unavailable_message, expected=True)
+ error_message = clean_html(video_info.get('reason', [None])[0])
+ if not error_message:
+ error_message = extract_unavailable_message()
+ if error_message:
+ raise ExtractorError(error_message, expected=True)
raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest
@@ -1976,6 +2018,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
for df in self._extract_mpd_formats(
mpd_url, video_id, fatal=dash_mpd_fatal,
formats_dict=self._formats):
+ if not df.get('filesize'):
+ df['filesize'] = _extract_filesize(df['url'])
# Do not overwrite DASH format found in some previous DASH manifest
if df['format_id'] not in dash_formats:
dash_formats[df['format_id']] = df
@@ -2023,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'uploader_url': video_uploader_url,
'upload_date': upload_date,
'license': video_license,
- 'creator': video_creator,
+ 'creator': video_creator or artist,
'title': video_title,
- 'alt_title': video_alt_title,
+ 'alt_title': video_alt_title or track,
'thumbnail': video_thumbnail,
'description': video_description,
'categories': video_categories,
@@ -2048,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'series': series,
'season_number': season_number,
'episode_number': episode_number,
+ 'track': track,
+ 'artist': artist,
}
@@ -2437,7 +2483,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
class YoutubeUserIE(YoutubeChannelIE):
IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
IE_NAME = 'youtube:user'
@@ -2569,7 +2615,11 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
}]
-class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
+class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
+
+
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
IE_DESC = 'YouTube.com searches'
# there doesn't appear to be a real limit, for example if you search for
# 'python' you get more than 8.000.000 results
@@ -2603,8 +2653,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE):
raise ExtractorError(
'[youtube] No video results', expected=True)
- new_videos = self._ids_to_results(orderedSet(re.findall(
- r'href="/watch\?v=(.{11})', html_content)))
+ new_videos = list(self._process_page(html_content))
videos += new_videos
if not new_videos or len(videos) > limit:
break
@@ -2627,11 +2676,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
_EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'}
-class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):
+class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
IE_DESC = 'YouTube.com search URLs'
IE_NAME = 'youtube:search_url'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
@@ -2683,10 +2731,7 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _real_extract(self, url):
- page = self._download_webpage(
- 'https://www.youtube.com/feed/%s' % self._FEED_NAME, self._PLAYLIST_TITLE)
-
+ def _entries(self, page):
# The extraction process is the same as for playlists, but the regex
# for the video ids doesn't contain an index
ids = []
@@ -2697,12 +2742,15 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
# 'recommended' feed has infinite 'load more' and each new portion spins
# the same videos in (sometimes) slightly different order, so we'll check
# for unicity and break when portion has no new videos
- new_ids = filter(lambda video_id: video_id not in ids, orderedSet(matches))
+ new_ids = list(filter(lambda video_id: video_id not in ids, orderedSet(matches)))
if not new_ids:
break
ids.extend(new_ids)
+ for entry in self._ids_to_results(new_ids):
+ yield entry
+
mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
if not mobj:
break
@@ -2714,8 +2762,12 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
content_html = more['content_html']
more_widget_html = more['load_more_widget_html']
+ def _real_extract(self, url):
+ page = self._download_webpage(
+ 'https://www.youtube.com/feed/%s' % self._FEED_NAME,
+ self._PLAYLIST_TITLE)
return self.playlist_result(
- self._ids_to_results(ids), playlist_title=self._PLAYLIST_TITLE)
+ self._entries(page), playlist_title=self._PLAYLIST_TITLE)
class YoutubeWatchLaterIE(YoutubePlaylistIE):