X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyoutube.py;h=6bdea1c44bc5201d2c0b9a6d01279b9414668cbe;hb=f2b8db57ebb15fa2edc9c7810436618a0725d451;hp=2d1a1912392dcd329f5466768fafe9a78a94d79a;hpb=2b25cb5d7693b62736d4cdfa656289cc429c4c81;p=youtube-dl diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 2d1a19123..6bdea1c44 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -7,7 +7,6 @@ import itertools import json import os.path import re -import string import struct import traceback import zlib @@ -152,6 +151,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): ) )) |youtu\.be/ # just youtu.be/xxxx + |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId= ) )? # all until now is optional -> you can pass the naked ID ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID @@ -210,23 +210,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'acodec': 'none', 'preference': -40}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH webm', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH webm', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH webm', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH webm', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH webm', 'preference': -40}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40}, # Dash webm audio - '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 48, 'preference': -50}, - '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH webm audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50}, + '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, @@ -242,7 +244,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"uploader": u"Philipp Hagemeister", u"uploader_id": u"phihag", u"upload_date": u"20121002", - u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de ." + u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .", + u"categories": [u'Science & Technology'], } }, { @@ -252,7 +255,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u"info_dict": { u"upload_date": u"20120506", u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]", - u"description": u"md5:5b292926389560516e384ac437c0ec07", + u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f", u"uploader": u"Icona Pop", u"uploader_id": u"IconaPop" } @@ -304,7 +307,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): u'id': u'IB3lcPjvWLA', u'ext': u'm4a', u'title': u'Afrojack - The Spark ft. Spree Wilson', - u'description': u'md5:3199ed45ee8836572865580804d7ac0f', + u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8', u'uploader': u'AfrojackVEVO', u'uploader_id': u'AfrojackVEVO', u'upload_date': u'20131011', @@ -438,7 +441,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'signature=([a-zA-Z]+)', jscode, + r'signature=([$a-zA-Z]+)', jscode, u'Initial JS player signature function name') jsi = JSInterpreter(jscode) @@ -1082,9 +1085,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): break if 'token' not in video_info: if 'reason' in video_info: - raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True) + raise ExtractorError( + u'YouTube said: %s' % video_info['reason'][0], + expected=True, video_id=video_id) else: - raise ExtractorError(u'"token" parameter not in video info for unknown reason') + raise ExtractorError( + u'"token" parameter not in video info for unknown reason', + video_id=video_id) if 'view_count' in video_info: view_count = int(video_info['view_count'][0]) @@ -1113,7 +1120,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # title if 'title' in video_info: - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) + video_title = video_info['title'][0] else: self._downloader.report_warning(u'Unable to extract video title') video_title = u'_' @@ -1132,11 +1139,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): # upload date upload_date = None - mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) + mobj = re.search(r'(?s)id="eow-date.*?>(.*?)', video_webpage) + if mobj is None: + mobj = re.search( + r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)', + video_webpage) if mobj is not None: upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) upload_date = unified_strdate(upload_date) + m_cat_container = get_element_by_id("eow-category", video_webpage) + if m_cat_container: + category = self._html_search_regex( + r'(?s)(.*?)', m_cat_container, 'category', + default=None) + video_categories = None if category is None else [category] + else: + video_categories = None + # description video_description = get_element_by_id("eow-description", video_webpage) if video_description: @@ -1343,6 +1363,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): 'title': video_title, 'thumbnail': video_thumbnail, 'description': video_description, + 'categories': video_categories, 'subtitles': video_subtitles, 'duration': video_duration, 'age_limit': 18 if age_gate else 0, @@ -1366,13 +1387,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' @@ -1395,11 +1416,9 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): title_span = (search_title('playlist-title') or search_title('title long-title') or search_title('title')) title = clean_html(title_span) - video_re = r'''(?x)data-video-username="(.*?)".*? + video_re = r'''(?x)data-video-username=".*?".*? href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id) - matches = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) - # Some of the videos may have been deleted, their username field is empty - ids = [video_id for (username, video_id) in matches if username] + ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL)) url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, title) @@ -1419,7 +1438,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) return self.url_result(video_id, 'Youtube', video_id=video_id) else: - self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) if playlist_id.startswith('RD'): # Mixes require a custom extraction process @@ -1432,6 +1451,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): page = self._download_webpage(url, playlist_id) more_widget_html = content_html = page + # Check if the playlist exists or is private + if re.search(r'
[^<]*?(The|This) playlist (does not exist|is private)[^<]*?
', page) is not None: + raise ExtractorError( + u'The playlist doesn\'t exist or is private, use --username or ' + '--netrc to access it.', + expected=True) + # Extract the video ids from the playlist pages ids = [] @@ -1447,12 +1473,15 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): break more = self._download_json( - 'https://youtube.com/%s' % mobj.group('more'), playlist_id, 'Downloading page #%s' % page_num) + 'https://youtube.com/%s' % mobj.group('more'), playlist_id, + 'Downloading page #%s' % page_num, + transform_source=uppercase_escape) content_html = more['content_html'] more_widget_html = more['load_more_widget_html'] playlist_title = self._html_search_regex( - r'

\s*(.*?)\s*

', page, u'title') + r'(?s)

\s*(.*?)\s*

', + page, u'title') url_results = self._ids_to_results(ids) return self.playlist_result(url_results, playlist_id, playlist_title) @@ -1610,7 +1639,7 @@ class YoutubeUserIE(InfoExtractor): class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' - _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' + _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc' _MAX_RESULTS = 1000 IE_NAME = u'youtube:search' _SEARCH_KEY = 'ytsearch' @@ -1621,9 +1650,12 @@ class YoutubeSearchIE(SearchInfoExtractor): video_ids = [] pagenum = 0 limit = n + PAGE_SIZE = 50 - while (50 * pagenum) < limit: - result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1) + while (PAGE_SIZE * pagenum) < limit: + result_url = self._API_URL % ( + compat_urllib_parse.quote_plus(query.encode('utf-8')), + (PAGE_SIZE * pagenum) + 1) data_json = self._download_webpage( result_url, video_id=u'query "%s"' % query, note=u'Downloading page %s' % (pagenum + 1), @@ -1734,23 +1766,25 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor): feed_entries = [] paging = 0 for i in itertools.count(1): - info = self._download_webpage(self._FEED_TEMPLATE % paging, + info = self._download_json(self._FEED_TEMPLATE % paging, u'%s feed' % self._FEED_NAME, u'Downloading page %s' % i) - info = json.loads(info) - feed_html = info['feed_html'] + feed_html = info.get('feed_html') or info.get('content_html') m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) feed_entries.extend( self.url_result(video_id, 'Youtube', video_id=video_id) for video_id in ids) - if info['paging'] is None: + mobj = re.search( + r'data-uix-load-more-href="/?[^"]+paging=(?P\d+)', + feed_html) + if mobj is None: break - paging = info['paging'] + paging = mobj.group('paging') return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): - IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)' + IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' _FEED_NAME = 'subscriptions' _PLAYLIST_TITLE = u'Youtube Subscriptions'