X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=139173016a9f7139e9e3caae11dc563cf066f499;hb=2f58b12dad1b5e19f2daf338cdba958be0b0a87c;hp=0c544d7475057872828debe49dba2eee98744a75;hpb=0c021ad171231c91e554c76d62e7fc749caf87bc;p=youtube-dl diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index 0c544d747..139173016 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -610,10 +610,13 @@ class YoutubeIE(InfoExtractor): self.report_rtmp_download() video_url_list = [(None, video_info['conn'][0])] elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1: - url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',') - url_data = [compat_parse_qs(uds) for uds in url_data_strs] - url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud] - url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data) + url_map = {} + for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): + url_data = compat_parse_qs(url_data_str) + if 'itag' in url_data and 'url' in url_data: + url = url_data['url'][0] + '&signature=' + url_data['sig'][0] + if not 'ratebypass' in url: url += '&ratebypass=yes' + url_map[url_data['itag'][0]] = url format_limit = self._downloader.params.get('format_limit', None) available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats @@ -848,7 +851,10 @@ class DailymotionIE(InfoExtractor): class PhotobucketIE(InfoExtractor): """Information extractor for photobucket.com.""" - _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' + # TODO: the original _VALID_URL was: + # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' + # Check if it's necessary to keep the old extracion process + _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P.*)\.(?P(flv)|(mp4))' IE_NAME = u'photobucket' def _real_extract(self, url): @@ -857,20 +863,30 @@ class PhotobucketIE(InfoExtractor): if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) + video_id = mobj.group('id') - video_extension = 'flv' + video_extension = mobj.group('ext') # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) + webpage = self._download_webpage(url, video_id) # Extract URL, uploader, and title from webpage self.report_extraction(video_id) + # We try first by looking the javascript code: + mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P.*?)\);', webpage) + if mobj is not None: + info = json.loads(mobj.group('json')) + return [{ + 'id': video_id, + 'url': info[u'downloadUrl'], + 'uploader': info[u'username'], + 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'), + 'title': info[u'title'], + 'ext': video_extension, + 'thumbnail': info[u'thumbUrl'], + }] + + # We try looking in other parts of the webpage mobj = re.search(r'', webpage) if mobj is None: raise ExtractorError(u'Unable to extract media URL') @@ -896,123 +912,72 @@ class PhotobucketIE(InfoExtractor): class YahooIE(InfoExtractor): - """Information extractor for video.yahoo.com.""" + """Information extractor for screen.yahoo.com.""" + _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' - _WORKING = False - # _VALID_URL matches all Yahoo! Video URLs - # _VPAGE_URL matches only the extractable '/watch/' URLs - _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' - _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' - IE_NAME = u'video.yahoo' - - def _real_extract(self, url, new_video=True): - # Extract ID from URL + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) if mobj is None: raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(2) - video_extension = 'flv' - - # Rewrite valid but non-extractable URLs as - # extractable English language /watch/ URLs - if re.match(self._VPAGE_URL, url) is None: - request = compat_urllib_request.Request(url) - try: - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) - - mobj = re.search(r'\("id", "([0-9]+)"\);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract id field') - yahoo_id = mobj.group(1) - - mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract vid field') - yahoo_vid = mobj.group(1) - - url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id) - return self._real_extract(url, new_video=False) - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) - - # Extract uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video title') - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'

(.*)

', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video uploader') - video_uploader = mobj.group(1).decode('utf-8') - - # Extract video thumbnail - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video thumbnail') - video_thumbnail = mobj.group(1).decode('utf-8') - - # Extract video description - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video description') - video_description = mobj.group(1).decode('utf-8') - if not video_description: - video_description = 'No description available.' - - # Extract video height and width - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video height') - yv_video_height = mobj.group(1) - - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video width') - yv_video_width = mobj.group(1) - - # Retrieve video playlist to extract media URL - # I'm not completely sure what all these options are, but we - # seem to need most of them, otherwise the server sends a 401. - yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents - yv_bitrate = '700' # according to Wikipedia this is hard-coded - request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id + - '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height + - '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797') - try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) - - # Extract media URL from playlist XML - mobj = re.search(r'.+?)";', webpage) + + if m_id is None: + # TODO: Check which url parameters are required + info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id + webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') + info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* + .*?)\]\]>.* + .*?)\ .*\]\]>.* + |\d+|all):(?P[\s\S]+)' _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"' _max_google_results = 1000 IE_NAME = u'video.google:search' - def report_download_page(self, query, pagenum): - """Report attempt to download playlist page with given number.""" - query = query.decode(preferredencoding()) - self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum)) - def _real_extract(self, query): mobj = re.match(self._VALID_URL, query) - if mobj is None: - raise ExtractorError(u'Invalid search query "%s"' % query) - prefix, query = query.split(':') - prefix = prefix[8:] - query = query.encode('utf-8') + prefix = mobj.group('prefix') + query = mobj.group('query') if prefix == '': - self._download_n_results(query, 1) - return + return self._get_n_results(query, 1) elif prefix == 'all': - self._download_n_results(query, self._max_google_results) - return + return self._get_n_results(query, self._max_google_results) else: - try: - n = int(prefix) - if n <= 0: - raise ExtractorError(u'Invalid download number %s for query "%s"' % (n, query)) - elif n > self._max_google_results: - self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) - n = self._max_google_results - self._download_n_results(query, n) - return - except ValueError: # parsing prefix as integer fails - self._download_n_results(query, 1) - return - - def _download_n_results(self, query, n): - """Downloads a specified number of results for a query""" + n = int(prefix) + if n <= 0: + raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) + elif n > self._max_google_results: + self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n)) + n = self._max_google_results + return self._get_n_results(query, n) - video_ids = [] - pagenum = 0 - - while True: - self.report_download_page(query, pagenum) - result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10) - request = compat_urllib_request.Request(result_url) - try: - page = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download webpage: %s' % compat_str(err)) + def _get_n_results(self, query, n): + """Get a specified number of results for a query""" - # Extract video identifiers - for mobj in re.finditer(self._VIDEO_INDICATOR, page): - video_id = mobj.group(1) - if video_id not in video_ids: - video_ids.append(video_id) - if len(video_ids) == n: - # Specified n videos reached - for id in video_ids: - self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id]) - return + res = { + '_type': 'playlist', + 'id': query, + 'entries': [] + } - if re.search(self._MORE_PAGES_INDICATOR, page) is None: - for id in video_ids: - self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id]) - return + for pagenum in itertools.count(1): + result_url = u'http://video.google.com/videosearch?q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10) + webpage = self._download_webpage(result_url, u'gvsearch:' + query, + note='Downloading result page ' + str(pagenum)) - pagenum = pagenum + 1 + for mobj in re.finditer(r'

n) or not re.search(self._MORE_PAGES_INDICATOR, webpage): + return res class YahooSearchIE(InfoExtractor): """Information Extractor for Yahoo! Video search queries.""" @@ -3112,7 +3050,7 @@ class GooglePlusIE(InfoExtractor): pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]' mobj = re.search(pattern, webpage) if mobj is None: - self._downloader.report_error(u'unable to extract video page URL') + raise ExtractorError(u'Unable to extract video page URL') video_page = mobj.group(1) webpage = self._download_webpage(video_page, video_id, u'Downloading video page') @@ -3124,7 +3062,7 @@ class GooglePlusIE(InfoExtractor): pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"' mobj = re.findall(pattern, webpage) if len(mobj) == 0: - self._downloader.report_error(u'unable to extract video links') + raise ExtractorError(u'Unable to extract video links') # Sort in resolution links = sorted(mobj) @@ -3330,14 +3268,14 @@ class FunnyOrDieIE(InfoExtractor): m = re.search(r']*>\s*]*>\s*(?P.*?)</h1>", webpage, flags=re.DOTALL) if not m: m = re.search(r'<title>(?P<title>[^<]+?)', webpage) if not m: - self._downloader.report_error(u'Cannot find video title') + raise ExtractorError(u'Cannot find video title') title = clean_html(m.group('title')) m = re.search(r'I[0-9]+)/.*' + + def _real_extract(self,url): + mobj = re.match(self._VALID_URL, url) + + video_id = mobj.group('id') + mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id + video_extension = 'mp4' + webpage = self._download_webpage(mrss_url, video_id) + mobj = re.search(r'.*?)]]>', webpage) + if mobj is None: + raise ExtractorError(u'Unable to extract title') + video_title = mobj.group(1) + + return [{ + 'id': video_id, + 'url': video_url, + 'ext': video_extension, + 'title': video_title, + }] def gen_extractors(): """ Return a list of an instance of every supported extractor. @@ -4196,6 +4162,7 @@ def gen_extractors(): TumblrIE(), BandcampIE(), RedTubeIE(), + InaIE(), GenericIE() ]