ExtractorError,
int_or_none,
PagedList,
- RegexNotFoundError,
unescapeHTML,
unified_strdate,
orderedSet,
(?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
+ (?:www\.)?yourepeat\.com/|
tube\.majestyc\.net/|
youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
(?:.*?\#/)? # handle anchor (#/) redirect urls
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
- (?:(?:watch|movie)(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+ (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
'151': {'ext': 'mp4', 'height': 72, 'resolution': '72p', 'format_note': 'HLS', 'preference': -10},
# DASH mp4 video
- '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'preference': -40},
- '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'preference': -40},
- '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'preference': -40},
- '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'preference': -40},
- '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
- '138': {'ext': 'mp4', 'height': 1081, 'resolution': '>1080p', 'format_note': 'DASH video', 'preference': -40},
- '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'preference': -40},
- '264': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'preference': -40},
+ '133': {'ext': 'mp4', 'height': 240, 'resolution': '240p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '134': {'ext': 'mp4', 'height': 360, 'resolution': '360p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '135': {'ext': 'mp4', 'height': 480, 'resolution': '480p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '136': {'ext': 'mp4', 'height': 720, 'resolution': '720p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '137': {'ext': 'mp4', 'height': 1080, 'resolution': '1080p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '138': {'ext': 'mp4', 'height': 2160, 'resolution': '2160p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '160': {'ext': 'mp4', 'height': 192, 'resolution': '192p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
+ '264': {'ext': 'mp4', 'height': 1440, 'resolution': '1440p', 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
# DASH mp4 audio
'139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
'141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
# DASH webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40},
+ # VP8 video-only streams: 'acodec' is 'none', so these entries carry no audio track
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
'242': {'ext': 'webm', 'height': 240, 'resolution': '240p', 'format_note': 'DASH webm', 'preference': -40},
'243': {'ext': 'webm', 'height': 360, 'resolution': '360p', 'format_note': 'DASH webm', 'preference': -40},
'244': {'ext': 'webm', 'height': 480, 'resolution': '480p', 'format_note': 'DASH webm', 'preference': -40},
u"format": "141",
},
},
+ # DASH manifest with encrypted signature
+ {
+ u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+ u'info_dict': {
+ u'id': u'IB3lcPjvWLA',
+ u'ext': u'm4a',
+ u'title': u'Afrojack - The Spark ft. Spree Wilson',
+ u'description': u'md5:3199ed45ee8836572865580804d7ac0f',
+ u'uploader': u'AfrojackVEVO',
+ u'uploader_id': u'AfrojackVEVO',
+ u'upload_date': u'20131011',
+ },
+ u"params": {
+ u'youtube_include_dash_manifest': True,
+ u'format': '141',
+ },
+ },
]
# Decide which formats to download
try:
- mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
+ mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
if not mobj:
raise ValueError('Could not find vevo ID')
- info = json.loads(mobj.group(1))
- args = info['args']
+ json_code = uppercase_escape(mobj.group(1))
+ ytplayer_config = json.loads(json_code)
+ args = ytplayer_config['args']
# Easy way to know if the 's' value is in url_encoded_fmt_stream_map
# (when it is, the signatures are encrypted)
if 'url_encoded_fmt_stream_map' not in args:
raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
# Look for the DASH manifest
- dash_manifest_url_lst = video_info.get('dashmpd')
- if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
- self._downloader.params.get('youtube_include_dash_manifest', False)):
+ if (self._downloader.params.get('youtube_include_dash_manifest', False)):
try:
+ # The DASH manifest used needs to be the one from the original video_webpage.
+ # The one found in get_video_info seems to be using different signatures.
+ # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
+ # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
+ # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
+ if age_gate:
+ dash_manifest_url = video_info.get('dashmpd')[0]
+ else:
+ dash_manifest_url = ytplayer_config['args']['dashmpd']
+ def decrypt_sig(mobj):
+ s = mobj.group(1)
+ dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+ return '/signature/%s' % dec_s
+ dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
dash_doc = self._download_xml(
- dash_manifest_url_lst[0], video_id,
+ dash_manifest_url, video_id,
note=u'Downloading DASH manifest',
errnote=u'Could not download DASH manifest')
for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
|
((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
)"""
- _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_MORE_PAGES_INDICATOR = r'data-link-type="next"'
- _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
+ _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&[^"]*?index=(?P<index>\d+)'
IE_NAME = u'youtube:playlist'
def _real_initialize(self):
# the id of the playlist is just 'RD' + video_id
url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
- title_span = (get_element_by_attribute('class', 'title long-title', webpage) or
- get_element_by_attribute('class', 'title ', webpage))
+ search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
+ title_span = (search_title('playlist-title') or
+ search_title('title long-title') or search_title('title'))
title = clean_html(title_span)
- video_re = r'data-index="\d+".*?href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s' % re.escape(playlist_id)
- ids = orderedSet(re.findall(video_re, webpage))
+ video_re = r'''(?x)data-video-username="(.*?)".*?
+ href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id)
+ matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
+ # Some of the videos may have been deleted; their username field is empty
+ ids = [video_id for (username, video_id) in matches if username]
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, title)
raise ExtractorError(u'For downloading YouTube.com top lists, use '
u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)
+ url = self._TEMPLATE_URL % playlist_id
+ page = self._download_webpage(url, playlist_id)
+ more_widget_html = content_html = page
+
# Extract the video ids from the playlist pages
ids = []
for page_num in itertools.count(1):
- url = self._TEMPLATE_URL % (playlist_id, page_num)
- page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
- matches = re.finditer(self._VIDEO_RE, page)
+ matches = re.finditer(self._VIDEO_RE, content_html)
# We remove the duplicates and the link with index 0
# (it's not the first video of the playlist)
new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
ids.extend(new_ids)
- if re.search(self._MORE_PAGES_INDICATOR, page) is None:
+ mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+ if not mobj:
break
- try:
- playlist_title = self._og_search_title(page)
- except RegexNotFoundError:
- self.report_warning(
- u'Playlist page is missing OpenGraph title, falling back ...',
- playlist_id)
- playlist_title = self._html_search_regex(
- r'<h1 class="pl-header-title">(.*?)</h1>', page, u'title')
+ more = self._download_json(
+ 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num)
+ content_html = more['content_html']
+ more_widget_html = more['load_more_widget_html']
+
+ playlist_title = self._html_search_regex(
+ r'<h1 class="pl-header-title">\s*(.*?)\s*</h1>', page, u'title')
url_results = self._ids_to_results(ids)
return self.playlist_result(url_results, playlist_id, playlist_title)
class YoutubeUserIE(InfoExtractor):
IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
+ _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
_GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
for video_id in video_ids]
return self.playlist_result(videos, query)
+
class YoutubeSearchDateIE(YoutubeSearchIE):
    """YouTube search extractor variant that lists the newest videos first.

    Only class-level settings are overridden here; everything else comes
    from YoutubeSearchIE.
    """

    IE_DESC = u'YouTube.com searches, newest videos first'
    # Derive the name from the parent so the two stay in sync.
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    # The trailing orderby=published parameter is what distinguishes this
    # query URL; the %s / %i slots take the query and the start index.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
+
+class YoutubeSearchURLIE(InfoExtractor):
+    """Extractor for YouTube search result page URLs.
+
+    Downloads the results page named by the URL and returns every result
+    heading found in it as an entry of a playlist titled with the decoded
+    search query.
+    """
+
+    IE_NAME = u'youtube:search_url'
+    IE_DESC = u'YouTube.com search URLs'
+    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
+
+    def _real_extract(self, url):
+        """Return a playlist dict built from the results page at ``url``."""
+        encoded_query = re.match(self._VALID_URL, url).group('query')
+        query = compat_urllib_parse.unquote_plus(encoded_query)
+
+        webpage = self._download_webpage(url, query)
+        # Narrow the page down to the ordered list that holds the results.
+        results_html = self._search_regex(
+            r'(?s)<ol id="search-results"(.*?)</ol>', webpage, u'result HTML')
+
+        entries = []
+        # Each result's link and title are read from its <h3> heading.
+        for heading_html in re.findall(
+                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html):
+            # The title lookup is requested with fatal=False, the URL lookup
+            # is not; the title is looked up first, as before.
+            title = self._html_search_regex(
+                r'(?s)title="([^"]+)"', heading_html, 'item title', fatal=False)
+            href = self._html_search_regex(
+                r'(?s)href="([^"]+)"', heading_html, 'item URL')
+            entries.append({
+                '_type': 'url',
+                # Result links may be site-relative; resolve against the site root.
+                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
+                'title': title,
+            })
+
+        return {
+            '_type': 'playlist',
+            'entries': entries,
+            'title': query,
+        }
+
+
class YoutubeShowIE(InfoExtractor):
IE_DESC = u'YouTube.com (multi-season) shows'
_VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
IE_NAME = 'youtube:truncated_url'
IE_DESC = False # Do not list
_VALID_URL = r'''(?x)
- (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
+ (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
(?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
'''