clean_html,
dict_get,
error_to_compat_str,
+ extract_attributes,
ExtractorError,
float_or_none,
get_element_by_attribute,
for video_id, video_title in self.extract_videos_from_page(content):
yield self.url_result(video_id, 'Youtube', video_id, video_title)
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
- for mobj in re.finditer(self._VIDEO_RE, page):
def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
    """Collect video ids and titles matched by ``video_re`` in ``page``.

    Results are accumulated into the two caller-supplied lists, which are
    mutated in place and kept index-aligned (``titles_in_page[i]`` belongs
    to ``ids_in_page[i]``). Nothing is returned.

    ``video_re`` must define a named group ``id``; the named groups
    ``index`` and ``title`` are optional. An id that is already present is
    not appended again, but a missing/empty title recorded for it earlier
    is filled in when a later match supplies one.
    """
    for mobj in re.finditer(video_re, page):
        # The link with index 0 is not the first video of the playlist (not sure if still actual)
        # NOTE(review): this compares the ``id`` group, not ``index``, to
        # '0'. Kept unchanged to preserve behaviour -- confirm the intent.
        if 'index' in mobj.groupdict() and mobj.group('id') == '0':
            continue
        video_id = mobj.group('id')
        # Patterns without a ``title`` group simply yield no title.
        video_title = unescapeHTML(
            mobj.group('title')) if 'title' in mobj.groupdict() else None
        if video_title:
            video_title = video_title.strip()
        # The "Play all" link text is not a real video title.
        if video_title == '► Play all':
            video_title = None
        try:
            idx = ids_in_page.index(video_id)
        except ValueError:
            # First time this id is seen: record it with whatever title we have.
            ids_in_page.append(video_id)
            titles_in_page.append(video_title)
        else:
            # Already known: only back-fill a title that was missing so far.
            if video_title and not titles_in_page[idx]:
                titles_in_page[idx] = video_title
+
def extract_videos_from_page(self, page):
    """Return an iterable of ``(video_id, video_title)`` pairs found in ``page``.

    Matching is delegated to ``extract_videos_from_page_impl`` using this
    extractor's ``_VIDEO_RE`` pattern.
    """
    video_ids, video_titles = [], []
    self.extract_videos_from_page_impl(
        self._VIDEO_RE, page, video_ids, video_titles)
    return zip(video_ids, video_titles)
video_id = mobj.group(2)
return video_id
- def _extract_annotations(self, video_id):
- return self._download_webpage(
- 'https://www.youtube.com/annotations_invideo', video_id,
- note='Downloading annotations',
- errnote='Unable to download video annotations', fatal=False,
- query={
- 'features': 1,
- 'legacy': 1,
- 'video_id': video_id,
- })
-
@staticmethod
def _extract_chapters(description, duration):
if not description:
break
def extract_unavailable_message():
    """Return the "unavailable" texts found in the watch page, or None.

    Looks for the ``unavailable-message`` heading and the
    ``unavailable-submessage`` div in ``video_webpage`` (taken, like
    ``self``, from the enclosing scope) and joins whatever is found with
    newlines.
    """
    candidates = (('h1', 'message'), ('div', 'submessage'))
    found = []
    for element, suffix in candidates:
        text = self._html_search_regex(
            r'(?s)<{tag}[^>]+id=["\']unavailable-{kind}["\'][^>]*>(.+?)</{tag}>'.format(tag=element, kind=suffix),
            video_webpage, 'unavailable %s' % suffix, default=None)
        if not text:
            continue
        found.append(text)
    return '\n'.join(found) if found else None
if not video_info:
unavailable_message = extract_unavailable_message()
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
formats.append(a_format)
else:
- error_message = clean_html(video_info.get('reason', [None])[0])
+ error_message = extract_unavailable_message()
+ if not error_message:
+ error_message = clean_html(try_get(
+ player_response, lambda x: x['playabilityStatus']['reason'],
+ compat_str))
if not error_message:
- error_message = extract_unavailable_message()
+ error_message = clean_html(
+ try_get(video_info, lambda x: x['reason'][0], compat_str))
if error_message:
raise ExtractorError(error_message, expected=True)
raise ExtractorError('no conn, hlsvp, hlsManifestUrl or url_encoded_fmt_stream_map information found in video info')
# annotations
video_annotations = None
if self._downloader.params.get('writeannotations', False):
- video_annotations = self._extract_annotations(video_id)
+ xsrf_token = self._search_regex(
+ r'([\'"])XSRF_TOKEN\1\s*:\s*([\'"])(?P<xsrf_token>[A-Za-z0-9+/=]+)\2',
+ video_webpage, 'xsrf token', group='xsrf_token', fatal=False)
+ invideo_url = try_get(
+ player_response, lambda x: x['annotations'][0]['playerAnnotationsUrlsRenderer']['invideoUrl'], compat_str)
+ if xsrf_token and invideo_url:
+ xsrf_field_name = self._search_regex(
+ r'([\'"])XSRF_FIELD_NAME\1\s*:\s*([\'"])(?P<xsrf_field_name>\w+)\2',
+ video_webpage, 'xsrf field name',
+ group='xsrf_field_name', default='session_token')
+ video_annotations = self._download_webpage(
+ self._proto_relative_url(invideo_url),
+ video_id, note='Downloading annotations',
+ errnote='Unable to download video annotations', fatal=False,
+ data=urlencode_postdata({xsrf_field_name: xsrf_token}))
chapters = self._extract_chapters(description_original, video_duration)
(%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
_TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
+ _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
+ _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
IE_NAME = 'youtube:playlist'
_TESTS = [{
'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
def _real_initialize(self):
    """Initialization hook for this extractor: delegates to ``self._login()``."""
    self._login()
def extract_videos_from_page(self, page):
    """Return an iterable of ``(video_id, video_title)`` pairs found in ``page``.

    Tags carrying a ``data-video-id`` attribute are read first, with the
    title taken from ``data-title`` when present. The regex-based fallbacks
    are then run over the same page and add to the same two lists through
    ``extract_videos_from_page_impl``.
    """
    video_ids = []
    video_titles = []

    tag_re = r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)'
    for tag in re.findall(tag_re, page):
        attributes = extract_attributes(tag)
        current_id = attributes['data-video-id']
        title = unescapeHTML(attributes.get('data-title'))
        video_ids.append(current_id)
        # Only strip real titles; a missing or empty one is stored as-is.
        video_titles.append(title.strip() if title else title)

    fallback_patterns = (
        # Fallback with old _VIDEO_RE
        self._VIDEO_RE,
        # Relaxed fallbacks
        r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})',
        r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})',
    )
    for pattern in fallback_patterns:
        self.extract_videos_from_page_impl(
            pattern, page, video_ids, video_titles)

    return zip(video_ids, video_titles)
+
def _extract_mix(self, playlist_id):
# The mixes are generated from a single video
# the id of the playlist is just 'RD' + video_id