dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
return webpage_bytes.decode(encoding, 'replace')
+
+ #Methods for following #608
+ #They set the correct value of the '_type' key
+ def video_result(self, video_info):
+ """Returns a video"""
+ video_info['_type'] = 'video'
+ return video_info
+ def url_result(self, url, ie=None):
+ """Returns a url that points to a page that should be processed"""
+ #TODO: ie should be the class used for getting the info
+ video_info = {'_type': 'url',
+ 'url': url}
+ return video_info
+ def playlist_result(self, entries, playlist_id=None, playlist_title=None):
+ """Returns a playlist"""
+ video_info = {'_type': 'playlist',
+ 'entries': entries}
+ if playlist_id:
+ video_info['id'] = playlist_id
+ if playlist_title:
+ video_info['title'] = playlist_title
+ return video_info
class YoutubeIE(InfoExtractor):
# Check if video comes from YouTube
mobj2 = re.match(r'^yt-(.*)$', video_id)
if mobj2 is not None:
- self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
- return
+ return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
# Retrieve video webpage to extract further information
- request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
- try:
- self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
if 'mediaData' not in vardict:
self._downloader.report_error(u'unable to extract media URL')
return
- mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
+ mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
if mobj is None:
self._downloader.report_error(u'unable to extract media URL')
return
- mediaURL = mobj.group(1).replace('\\/', '/')
+ mediaURL = mobj.group('mediaURL').replace('\\/', '/')
video_extension = mediaURL[-3:]
- video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
+ video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
if mobj is None:
self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
def _test_redirect(self, url):
- """Check if it is a redirect, like url shorteners, in case restart chain."""
+ """Check if it is a redirect, like url shorteners, in case return the new url."""
class HeadRequest(compat_urllib_request.Request):
def get_method(self):
return "HEAD"
return False
self.report_following_redirect(new_url)
- self._downloader.download([new_url])
- return True
+ return new_url
def _real_extract(self, url):
- if self._test_redirect(url): return
+ new_url = self._test_redirect(url)
+ if new_url: return [self.url_result(new_url)]
video_id = url.split('/')[-1]
try:
self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
return
- if not 'feed' in response or not 'entry' in response['feed']:
+ if 'feed' not in response:
self._downloader.report_error(u'Got a malformed response from YouTube API')
return
+ if 'entry' not in response['feed']:
+ # Number of videos is a multiple of self._MAX_RESULTS
+ break
+
videos += [ (entry['yt$position']['$t'], entry['content']['src'])
for entry in response['feed']['entry']
if 'content' in entry ]
page_num += 1
videos = [v[1] for v in sorted(videos)]
- total = len(videos)
-
- playliststart = self._downloader.params.get('playliststart', 1) - 1
- playlistend = self._downloader.params.get('playlistend', -1)
- if playlistend == -1:
- videos = videos[playliststart:]
- else:
- videos = videos[playliststart:playlistend]
- if len(videos) == total:
- self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
- else:
- self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
-
- for video in videos:
- self._downloader.download([video])
- return
+ url_results = [self.url_result(url) for url in videos]
+ return [self.playlist_result(url_results, playlist_id)]
class YoutubeChannelIE(InfoExtractor):
"""Information Extractor for YouTube channels."""
- _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
+ _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
- _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
+ _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
+ _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel'
def report_download_page(self, channel_id, pagenum):
"""Report attempt to download channel page with given number."""
self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
+ def extract_videos_from_page(self, page):
+ ids_in_page = []
+ for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
+ if mobj.group(1) not in ids_in_page:
+ ids_in_page.append(mobj.group(1))
+ return ids_in_page
+
def _real_extract(self, url):
# Extract channel id
mobj = re.match(self._VALID_URL, url)
self._downloader.report_error(u'invalid url: %s' % url)
return
- # Download channel pages
+ # Download channel page
channel_id = mobj.group(1)
video_ids = []
pagenum = 1
- while True:
- self.report_download_page(channel_id, pagenum)
- url = self._TEMPLATE_URL % (channel_id, pagenum)
- request = compat_urllib_request.Request(url)
- try:
- page = compat_urllib_request.urlopen(request).read().decode('utf8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ self.report_download_page(channel_id, pagenum)
+ url = self._TEMPLATE_URL % (channel_id, pagenum)
+ request = compat_urllib_request.Request(url)
+ try:
+ page = compat_urllib_request.urlopen(request).read().decode('utf8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
+ return
- # Extract video identifiers
- ids_in_page = []
- for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
- if mobj.group(1) not in ids_in_page:
- ids_in_page.append(mobj.group(1))
- video_ids.extend(ids_in_page)
+ # Extract video identifiers
+ ids_in_page = self.extract_videos_from_page(page)
+ video_ids.extend(ids_in_page)
- if self._MORE_PAGES_INDICATOR not in page:
- break
- pagenum = pagenum + 1
+ # Download any subsequent channel pages using the json-based channel_ajax query
+ if self._MORE_PAGES_INDICATOR in page:
+ while True:
+ pagenum = pagenum + 1
+
+ self.report_download_page(channel_id, pagenum)
+ url = self._MORE_PAGES_URL % (pagenum, channel_id)
+ request = compat_urllib_request.Request(url)
+ try:
+ page = compat_urllib_request.urlopen(request).read().decode('utf8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
+ return
+
+ page = json.loads(page)
+
+ ids_in_page = self.extract_videos_from_page(page['content_html'])
+ video_ids.extend(ids_in_page)
+
+ if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+ break
self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
- for id in video_ids:
- self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
- return
+ urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
+ url_entries = [self.url_result(url) for url in urls]
+ return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
pagenum += 1
- all_ids_count = len(video_ids)
- playliststart = self._downloader.params.get('playliststart', 1) - 1
- playlistend = self._downloader.params.get('playlistend', -1)
-
- if playlistend == -1:
- video_ids = video_ids[playliststart:]
- else:
- video_ids = video_ids[playliststart:playlistend]
-
- self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
- (username, all_ids_count, len(video_ids)))
-
- for video_id in video_ids:
- self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
+ urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
+ url_results = [self.url_result(url) for url in urls]
+ return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
pagenum += 1
- all_ids_count = len(video_ids)
- playliststart = self._downloader.params.get('playliststart', 1) - 1
- playlistend = self._downloader.params.get('playlistend', -1)
-
- if playlistend == -1:
- video_ids = video_ids[playliststart:]
- else:
- video_ids = video_ids[playliststart:playlistend]
-
self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
(self.IE_NAME, username, all_ids_count, len(video_ids)))
- for video_id in video_ids:
- self._downloader.download([u'http://blip.tv/'+video_id])
+ urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
+ url_entries = [self.url_result(url) for url in urls]
+ return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
ARDIE(),
GenericIE()
]
+
+def get_info_extractor(ie_name):
+ """Returns the info extractor class with the given ie_name"""
+ return globals()[ie_name+'IE']