X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fyahoo.py;h=3ab6017cdb51a3eaef6a3a1686719fba714780dd;hb=cc7fec5818254f4679896823c7de9d17f50201ca;hp=4f3af17d719443d9d19e32193e95cbb7f802acfb;hpb=d6039175e5b66740de0258898ff3fc44b2760a3d;p=youtube-dl diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4f3af17d7..3ab6017cd 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -1,76 +1,186 @@ -import datetime +from __future__ import unicode_literals + +import itertools import json import re -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( - ExtractorError, + compat_urllib_parse, + compat_urlparse, + clean_html, + int_or_none, ) + class YahooIE(InfoExtractor): - """Information extractor for screen.yahoo.com.""" - _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html' + IE_DESC = 'Yahoo screen and movies' + _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)' + _TESTS = [ + { + 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', + 'md5': '4962b075c08be8690a922ee026d05e69', + 'info_dict': { + 'id': '2d25e626-2378-391f-ada0-ddaf1417e588', + 'ext': 'mp4', + 'title': 'Julian Smith & Travis Legg Watch Julian Smith', + 'description': 'Julian and Travis watch Julian Smith', + }, + }, + { + 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html', + 'md5': 'd6e6fc6e1313c608f316ddad7b82b306', + 'info_dict': { + 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9', + 'ext': 'mp4', + 'title': 'Codefellas - The Cougar Lies with Spanish Moss', + 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? 
And if your waffles have nothing to hide, what are they so worried about?', + }, + }, + { + 'url': 'https://movies.yahoo.com/video/world-loves-spider-man-190819223.html', + 'md5': '410b7104aa9893b765bc22787a22f3d9', + 'info_dict': { + 'id': '516ed8e2-2c4f-339f-a211-7a8b49d30845', + 'ext': 'mp4', + 'title': 'The World Loves Spider-Man', + 'description': '''People all over the world are celebrating the release of \"The Amazing Spider-Man 2.\" We're taking a look at the enthusiastic response Spider-Man has received from viewers all over the world.''', + } + }, + { + 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed', + 'md5': '60e8ac193d8fb71997caa8fce54c6460', + 'info_dict': { + 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb', + 'ext': 'mp4', + 'title': "Yahoo Saves 'Community'", + 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053', + } + }, + ] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') + url = mobj.group('url') webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) - - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - \d*?)\.html' + + _TESTS = [{ + 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html', + 'md5': '67010fdf3a08d290e060a4dd96baa07b', + 'info_dict': { + 'id': '104538833', + 'ext': 'mp4', + 'title': 'China Moses Is Crazy About the Blues', + 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0', + }, + }] + + def _real_extract(self, url): + 
mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id') + return self._get_info(long_id, video_id, webpage) + + +class YahooSearchIE(SearchInfoExtractor): + IE_DESC = 'Yahoo screen search' + _MAX_RESULTS = 1000 + IE_NAME = 'screen.yahoo:search' + _SEARCH_KEY = 'yvsearch' + + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" + entries = [] + for pagenum in itertools.count(0): + result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) + info = self._download_json(result_url, query, + note='Downloading results page '+str(pagenum+1)) + m = info['m'] + results = info['results'] + + for (i, r) in enumerate(results): + if (pagenum * 30) + i >= n: + break + mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r) + e = self.url_result('http://' + mobj.group('url'), 'Yahoo') + entries.append(e) + if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)): + break + + return { + '_type': 'playlist', + 'id': query, + 'entries': entries, + }