import traceback
from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesInfoExtractor
from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..compat import (
from ..utils import (
clean_html,
ExtractorError,
+ float_or_none,
get_element_by_attribute,
get_element_by_id,
int_or_none,
return
-class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
+class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com'
_VALID_URL = r"""(?x)^
(
'uploader': '孫艾倫',
'title': '[A-made] 變態妍字幕版 太妍 我就是這樣的人',
},
- }
+ },
+ # url_encoded_fmt_stream_map is empty string
+ {
+ 'url': 'qEJwOuvDf7I',
+ 'info_dict': {
+ 'id': 'qEJwOuvDf7I',
+ 'ext': 'mp4',
+ 'title': 'Обсуждение судебной практики по выборам 14 сентября 2014 года в Санкт-Петербурге',
+ 'description': '',
+ 'upload_date': '20150404',
+ 'uploader_id': 'spbelect',
+ 'uploader': 'Наблюдатели Петербурга',
+ },
+ 'params': {
+ 'skip_download': 'requires avconv',
+ }
+ },
]
def __init__(self, *args, **kwargs):
if cache_spec is not None:
return lambda s: ''.join(s[i] for i in cache_spec)
+ download_note = (
+ 'Downloading player %s' % player_url
+ if self._downloader.params.get('verbose') else
+ 'Downloading %s player %s' % (player_type, player_id)
+ )
if player_type == 'js':
code = self._download_webpage(
player_url, video_id,
- note='Downloading %s player %s' % (player_type, player_id),
+ note=download_note,
errnote='Download of %s failed' % player_url)
res = self._parse_sig_js(code)
elif player_type == 'swf':
urlh = self._request_webpage(
player_url, video_id,
- note='Downloading %s player %s' % (player_type, player_id),
+ note=download_note,
errnote='Download of %s failed' % player_url)
code = urlh.read()
res = self._parse_sig_swf(code)
else:
assert False, 'Invalid player type %r' % player_type
- if cache_spec is None:
- test_string = ''.join(map(compat_chr, range(len(example_sig))))
- cache_res = res(test_string)
- cache_spec = [ord(c) for c in cache_res]
+ test_string = ''.join(map(compat_chr, range(len(example_sig))))
+ cache_res = res(test_string)
+ cache_spec = [ord(c) for c in cache_res]
self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
return res
raise ExtractorError(
'Signature extraction failed: ' + tb, cause=e)
- def _get_available_subtitles(self, video_id, webpage):
+ def _get_subtitles(self, video_id, webpage):
try:
subs_doc = self._download_xml(
'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
lang = track.attrib['lang_code']
if lang in sub_lang_list:
continue
- params = compat_urllib_parse.urlencode({
- 'lang': lang,
- 'v': video_id,
- 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
- 'name': track.attrib['name'].encode('utf-8'),
- })
- url = 'https://www.youtube.com/api/timedtext?' + params
- sub_lang_list[lang] = url
+ sub_formats = []
+ for ext in ['sbv', 'vtt', 'srt']:
+ params = compat_urllib_parse.urlencode({
+ 'lang': lang,
+ 'v': video_id,
+ 'fmt': ext,
+ 'name': track.attrib['name'].encode('utf-8'),
+ })
+ sub_formats.append({
+ 'url': 'https://www.youtube.com/api/timedtext?' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[lang] = sub_formats
if not sub_lang_list:
self._downloader.report_warning('video doesn\'t have subtitles')
return {}
return sub_lang_list
- def _get_available_automatic_caption(self, video_id, webpage):
+ def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
- sub_format = self._downloader.params.get('subtitlesformat', 'srt')
self.to_screen('%s: Looking for automatic captions' % video_id)
mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
err_msg = 'Couldn\'t find automatic captions for %s' % video_id
sub_lang_list = {}
for lang_node in caption_list.findall('target'):
sub_lang = lang_node.attrib['lang_code']
- params = compat_urllib_parse.urlencode({
- 'lang': original_lang,
- 'tlang': sub_lang,
- 'fmt': sub_format,
- 'ts': timestamp,
- 'kind': caption_kind,
- })
- sub_lang_list[sub_lang] = caption_url + '&' + params
+ sub_formats = []
+ for ext in ['sbv', 'vtt', 'srt']:
+ params = compat_urllib_parse.urlencode({
+ 'lang': original_lang,
+ 'tlang': sub_lang,
+ 'fmt': ext,
+ 'ts': timestamp,
+ 'kind': caption_kind,
+ })
+ sub_formats.append({
+ 'url': caption_url + '&' + params,
+ 'ext': ext,
+ })
+ sub_lang_list[sub_lang] = sub_formats
return sub_lang_list
# An extractor error can be raised by the download process if there are
# no automatic captions but there are subtitles
errnote='Could not download DASH manifest')
formats = []
- for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
- url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
- if url_el is None:
- continue
- format_id = r.attrib['id']
- video_url = url_el.text
- filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
- f = {
- 'format_id': format_id,
- 'url': video_url,
- 'width': int_or_none(r.attrib.get('width')),
- 'height': int_or_none(r.attrib.get('height')),
- 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
- 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
- 'filesize': filesize,
- 'fps': int_or_none(r.attrib.get('frameRate')),
- }
- try:
- existing_format = next(
- fo for fo in formats
- if fo['format_id'] == format_id)
- except StopIteration:
- f.update(self._formats.get(format_id, {}).items())
- formats.append(f)
- else:
- existing_format.update(f)
+ for a in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}AdaptationSet'):
+ mime_type = a.attrib.get('mimeType')
+ for r in a.findall('{urn:mpeg:DASH:schema:MPD:2011}Representation'):
+ url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
+ if url_el is None:
+ continue
+ if mime_type == 'text/vtt':
+ # TODO implement WebVTT downloading
+ pass
+ elif mime_type.startswith('audio/') or mime_type.startswith('video/'):
+ format_id = r.attrib['id']
+ video_url = url_el.text
+ filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
+ f = {
+ 'format_id': format_id,
+ 'url': video_url,
+ 'width': int_or_none(r.attrib.get('width')),
+ 'height': int_or_none(r.attrib.get('height')),
+ 'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
+ 'asr': int_or_none(r.attrib.get('audioSamplingRate')),
+ 'filesize': filesize,
+ 'fps': int_or_none(r.attrib.get('frameRate')),
+ }
+ try:
+ existing_format = next(
+ fo for fo in formats
+ if fo['format_id'] == format_id)
+ except StopIteration:
+ full_info = self._formats.get(format_id, {}).copy()
+ full_info.update(f)
+ formats.append(full_info)
+ else:
+ existing_format.update(f)
+ else:
+ self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
def _real_extract(self, url):
player_url = None
# Get video info
+ embed_webpage = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
args = ytplayer_config['args']
# Convert to the same format returned by compat_parse_qs
video_info = dict((k, [v]) for k, v in args.items())
- if 'url_encoded_fmt_stream_map' not in args:
+ if not args.get('url_encoded_fmt_stream_map'):
raise ValueError('No stream_map present') # caught below
except ValueError:
# We fallback to the get_video_info pages (used by the embed page)
# subtitles
video_subtitles = self.extract_subtitles(video_id, video_webpage)
-
- if self._downloader.params.get('listsubtitles', False):
- self._list_available_subtitles(video_id, video_webpage)
- return
+ automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
if 'length_seconds' not in video_info:
self._downloader.report_warning('unable to extract video duration')
url += '&signature=' + url_data['sig'][0]
elif 's' in url_data:
encrypted_sig = url_data['s'][0]
+ ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
jsplayer_url_json = self._search_regex(
- r'"assets":.+?"js":\s*("[^"]+")',
- embed_webpage if age_gate else video_webpage, 'JS player URL')
+ ASSETS_RE,
+ embed_webpage if age_gate else video_webpage,
+ 'JS player URL (1)', default=None)
+ if not jsplayer_url_json and not age_gate:
+ # We need the embed website after all
+ if embed_webpage is None:
+ embed_url = proto + '://www.youtube.com/embed/%s' % video_id
+ embed_webpage = self._download_webpage(
+ embed_url, video_id, 'Downloading embed webpage')
+ jsplayer_url_json = self._search_regex(
+ ASSETS_RE, embed_webpage, 'JS player URL')
+
player_url = json.loads(jsplayer_url_json)
if player_url is None:
player_url_json = self._search_regex(
'description': video_description,
'categories': video_categories,
'subtitles': video_subtitles,
+ 'automatic_captions': automatic_captions,
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
'view_count': view_count,
'like_count': like_count,
'dislike_count': dislike_count,
+ 'average_rating': float_or_none(video_info.get('avg_rating', [None])[0]),
'formats': formats,
}
| p/
)
(
- (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
+ (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
.*
|
- ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
+ ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
)"""
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
}, {
'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
'info_dict': {
+ 'id': 'PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
'title': 'YDL_Empty_List',
},
'playlist_count': 0,
'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
'info_dict': {
'title': '29C3: Not my department',
+ 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
},
'playlist_count': 95,
}, {
'url': 'PLBB231211A4F62143',
'info_dict': {
'title': '[OLD]Team Fortress 2 (Class-based LP)',
+ 'id': 'PLBB231211A4F62143',
},
'playlist_mincount': 26,
}, {
'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
'info_dict': {
'title': 'Uploads from Cauchemar',
+ 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
},
'playlist_mincount': 799,
}, {
'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
'info_dict': {
'title': 'YDL_safe_search',
+ 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
},
'playlist_count': 2,
}, {
'playlist_count': 4,
'info_dict': {
'title': 'JODA15',
+ 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
}
}, {
'note': 'Embedded SWF player',
'playlist_count': 4,
'info_dict': {
'title': 'JODA7',
+ 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
}
}, {
'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
'info_dict': {
- 'title': 'Uploads from Interstellar Movie',
+ 'title': 'Uploads from Interstellar Movie',
+ 'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
},
'playlist_mincout': 21,
}]
for vid_id in ids]
def _extract_mix(self, playlist_id):
- # The mixes are generated from a a single video
+ # The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
webpage = self._download_webpage(
return self.playlist_result(url_results, playlist_id, title)
- def _real_extract(self, url):
- # Extract playlist id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
- playlist_id = mobj.group(1) or mobj.group(2)
-
- # Check if it's a video-specific URL
- query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- if 'v' in query_dict:
- video_id = query_dict['v'][0]
- if self._downloader.params.get('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- return self.url_result(video_id, 'Youtube', video_id=video_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
-
- if playlist_id.startswith('RD'):
- # Mixes require a custom extraction process
- return self._extract_mix(playlist_id)
-
+ def _extract_playlist(self, playlist_id):
url = self._TEMPLATE_URL % playlist_id
page = self._download_webpage(url, playlist_id)
more_widget_html = content_html = page
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title)
+    def _real_extract(self, url):
+        """Dispatch a playlist URL to the matching extraction routine.
+
+        The playlist id is whichever of the first two groups of
+        ``_VALID_URL`` matched.  When the URL also carries a ``v`` query
+        parameter and the ``noplaylist`` option is set, only that video is
+        returned.  Ids starting with ``RD`` or ``UL`` are handed to
+        ``_extract_mix``; every other id goes to ``_extract_playlist``.
+
+        Raises ExtractorError when the URL does not match ``_VALID_URL``.
+        """
+        match = re.match(self._VALID_URL, url)
+        if match is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        playlist_id = match.group(1) or match.group(2)
+
+        # A URL with a "v" parameter names a single video as well as a list
+        query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        if 'v' in query:
+            video_id = query['v'][0]
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
+            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+        # Mixes require a custom extraction process
+        if playlist_id.startswith(('RD', 'UL')):
+            return self._extract_mix(playlist_id)
+        return self._extract_playlist(playlist_id)
+
class YoutubeChannelIE(InfoExtractor):
IE_DESC = 'YouTube.com channels'
'note': 'paginated channel',
'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
'playlist_mincount': 91,
+ 'info_dict': {
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ }
}]
def extract_videos_from_page(self, page):
+        """Collect the videos linked from a piece of channel HTML.
+
+        Returns (video_id, title) pairs in order of first appearance, each
+        id reported once.  The title is the result of ``unescapeHTML`` on
+        the ``title`` attribute captured in front of the link; when the
+        same id shows up again, its title is only filled in if the stored
+        one is still empty.
+        """
ids_in_page = []
- for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
- return ids_in_page
+        titles_in_page = []
+        link_re = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
+        for link in re.finditer(link_re, page):
+            video_id = link.group('id')
+            video_title = unescapeHTML(link.group('title'))
+            if video_id not in ids_in_page:
+                # First sighting: remember the id together with its title
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+                continue
+            # Seen before: only fill in a title that is still missing
+            pos = ids_in_page.index(video_id)
+            if video_title and not titles_in_page[pos]:
+                titles_in_page[pos] = video_title
+        return zip(ids_in_page, titles_in_page)
def _real_extract(self, url):
channel_id = self._match_id(url)
- video_ids = []
url = 'https://www.youtube.com/channel/%s/videos' % channel_id
channel_page = self._download_webpage(url, channel_id)
autogenerated = re.search(r'''(?x)
if autogenerated:
# The videos are contained in a single page
# the ajax pages can't be used, they are empty
- video_ids = self.extract_videos_from_page(channel_page)
entries = [
- self.url_result(video_id, 'Youtube', video_id=video_id)
- for video_id in video_ids]
+ self.url_result(
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
+ for video_id, video_title in self.extract_videos_from_page(channel_page)]
return self.playlist_result(entries, channel_id)
def _entries():
more_widget_html = content_html = channel_page
for pagenum in itertools.count(1):
- ids_in_page = self.extract_videos_from_page(content_html)
- for video_id in ids_in_page:
+ for video_id, video_title in self.extract_videos_from_page(content_html):
yield self.url_result(
- video_id, 'Youtube', video_id=video_id)
+ video_id, 'Youtube', video_id=video_id,
+ video_title=video_title)
mobj = re.search(
r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
webpage = self._download_webpage(url, query)
result_code = self._search_regex(
- r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')
+ r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
part_codes = re.findall(
r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+ IE_NAME = 'youtube:recommended'
IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
_VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
_PLAYLIST_TITLE = 'Youtube Recommended videos'
-class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+ # Watch-later extractor: the list is fetched as the playlist whose id is 'WL'.
+ IE_NAME = 'youtube:watchlater'
IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
- _FEED_NAME = 'watch_later'
- _PLAYLIST_TITLE = 'Youtube Watch Later'
- _PERSONAL_FEED = True
+ # Accepts the feed/watch_later URL, the playlist?list=WL URL and the ":ytwatchlater" shortcut.
+ _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+
+ _TESTS = [] # override PlaylistIE tests
+
+ # The URL is not inspected here; the playlist id passed on is always 'WL'.
+ def _real_extract(self, url):
+ return self._extract_playlist('WL')
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+ IE_NAME = 'youtube:history'
IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
_VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
_FEED_NAME = 'history'
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
_VALID_URL = r'''(?x)
- (?:https?://)?[^/]+/watch\?(?:
+ (?:https?://)?
+ (?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/
+ (?:watch\?(?:
feature=[a-z_]+|
- annotation_id=annotation_[^&]+
- )?$|
- (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
+ annotation_id=annotation_[^&]+|
+ x-yt-cl=[0-9]+|
+ hl=[^&]*|
+ )?
+ |
+ attribution_link\?a=[^&]+
+ )
+ $
'''
_TESTS = [{
}, {
'url': 'http://www.youtube.com/watch?',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?x-yt-cl=84503534',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?feature=foo',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?hl=en-GB',
+ 'only_matching': True,
}]
def _real_extract(self, url):
class YoutubeTruncatedIDIE(InfoExtractor):
IE_NAME = 'youtube:truncated_id'
IE_DESC = False # Do not list
- _VALID_URL = r'https?://(?:www\.)youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=(?P<id>[0-9A-Za-z_-]{1,10})$'
_TESTS = [{
'url': 'https://www.youtube.com/watch?v=N_708QY7Ob',