import netrc
import re
import socket
+import itertools
from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
unified_strdate,
+ orderedSet,
)
class YoutubeIE(InfoExtractor):
- """Information extractor for youtube.com."""
-
+ IE_DESC = u'YouTube.com'
_VALID_URL = r"""^
(
(?:https?://)? # http(s):// (optional)
(?: # the various things that can precede the ID:
(?:(?:v|embed|e)/) # v/ or embed/ or e/
|(?: # or the v= param in all its forms
- (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+ (?:watch|movie(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
(?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
@classmethod
def suitable(cls, url):
    """Return True when this extractor should handle *url*.

    URLs accepted by the playlist or subscriptions extractors are
    rejected here so that those extractors get them instead; anything
    else is checked against this class's verbose ``_VALID_URL`` pattern.
    """
    claimed_elsewhere = (
        YoutubePlaylistIE.suitable(url)
        or YoutubeSubscriptionsIE.suitable(url)
    )
    if claimed_elsewhere:
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
def report_lang(self):
    """Print a status line saying the language is being set."""
    message = u'Setting language'
    self.to_screen(message)
- def report_login(self):
- """Report attempt to log in."""
- self.to_screen(u'Logging in')
-
def report_video_webpage_download(self, video_id):
    """Print a status line saying the webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
if self._downloader is None:
return
- username = None
- password = None
- downloader_params = self._downloader.params
-
- # Attempt to use provided username and password or .netrc data
- if downloader_params.get('username', None) is not None:
- username = downloader_params['username']
- password = downloader_params['password']
- elif downloader_params.get('usenetrc', False):
- try:
- info = netrc.netrc().authenticators(self._NETRC_MACHINE)
- if info is not None:
- username = info[0]
- password = info[2]
- else:
- raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
- except (IOError, netrc.NetrcParseError) as err:
- self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
- return
-
# Set language
request = compat_urllib_request.Request(self._LANG_URL)
try:
self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
return
+ (username, password) = self._get_login_info()
+
# No authentication to be performed
if username is None:
return
break
if 'token' not in video_info:
if 'reason' in video_info:
- raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0])
+ raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0], expected=True)
else:
raise ExtractorError(u'"token" parameter not in video info for unknown reason')
video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
# thumbnail image
- if 'thumbnail_url' not in video_info:
+ # We try first to get a high quality image:
+ m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
+ video_webpage, re.DOTALL)
+ if m_thumb is not None:
+ video_thumbnail = m_thumb.group(1)
+ elif 'thumbnail_url' not in video_info:
self._downloader.report_warning(u'unable to extract video thumbnail')
video_thumbnail = ''
else: # don't panic if we can't find it
if req_format is None or req_format == 'best':
video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
elif req_format == 'worst':
- video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
+ video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
elif req_format in ('-1', 'all'):
video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
else:
return results
class YoutubePlaylistIE(InfoExtractor):
- """Information Extractor for YouTube playlists."""
-
+ IE_DESC = u'YouTube.com playlists'
_VALID_URL = r"""(?:
(?:https?://)?
(?:\w+\.)?
videos = [v[1] for v in sorted(videos)]
- url_results = [self.url_result(url, 'Youtube') for url in videos]
+ url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
- """Information Extractor for YouTube channels."""
-
+ IE_DESC = u'YouTube.com channels'
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
_MORE_PAGES_INDICATOR = 'yt-uix-load-more'
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
- url_entries = [self.url_result(url, 'Youtube') for url in urls]
+ url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls]
return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
- """Information Extractor for YouTube users."""
-
+ IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
_VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
_TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
_GDATA_PAGE_SIZE = 50
pagenum += 1
urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
- url_results = [self.url_result(url, 'Youtube') for url in urls]
+ url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
return [self.playlist_result(url_results, playlist_title = username)]
class YoutubeSearchIE(SearchInfoExtractor):
- """Information Extractor for YouTube search queries."""
+ IE_DESC = u'YouTube.com searches'
_API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
_MAX_RESULTS = 1000
IE_NAME = u'youtube:search'
video_ids = video_ids[:n]
videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
return self.playlist_result(videos, query)
+
+
class YoutubeShowIE(InfoExtractor):
    """Extractor for YouTube show pages, handing each season off as a playlist."""
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'
    IE_NAME = u'youtube:show'

    def _real_extract(self, url):
        """Return one 'YoutubePlaylist' url_result per season link on the show page.

        The show name is whatever follows ``/show/`` in *url*; it is passed
        to the webpage download and used in the status message.
        """
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # Every season of the show is published as its own playlist, so each
        # playlist link found on the page counts as one season.
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        season_results = []
        for path in season_paths:
            season_url = 'https://www.youtube.com' + path
            season_results.append(self.url_result(season_url, 'YoutubePlaylist'))
        return season_results
+
+
class YoutubeSubscriptionsIE(YoutubeIE):
    """Extractor for the subscriptions feed.

    It derives from YoutubeIE because the feed requires logging in.
    """
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_NAME = u'youtube:subscriptions'
    _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
    _PAGING_STEP = 30

    # Blank out the YoutubeIE attributes that should not be inherited.
    _TESTS = []

    @classmethod
    def suitable(cls, url):
        """Return True when *url* matches this class's ``_VALID_URL`` pattern."""
        matched = re.match(cls._VALID_URL, url)
        return matched is not None

    def _real_extract(self, url):
        """Page through the subscriptions feed and return it as one playlist.

        Pages are requested at offsets that are multiples of
        ``_PAGING_STEP`` until a page reports ``paging`` as None. Video ids
        are de-duplicated within each page, keeping first-seen order.
        """
        entries = []
        # itertools.count() only takes a step argument from Python 2.7 on,
        # so count pages and multiply to get the offset.
        for page_index in itertools.count():
            offset = page_index * self._PAGING_STEP
            raw_page = self._download_webpage(
                self._FEED_TEMPLATE % offset, 'feed',
                u'Downloading page %s' % page_index)
            page = json.loads(raw_page)
            feed_html = page['feed_html']
            video_ids = orderedSet(re.findall(r'"/watch\?v=(.*?)"', feed_html))
            for video_id in video_ids:
                entries.append(self.url_result(video_id, 'Youtube'))
            if page['paging'] is None:
                break
        return self.playlist_result(entries, playlist_title='Youtube Subscriptions')