From: Jaime Marquínez Ferrándiz Date: Thu, 28 Mar 2013 12:02:04 +0000 (+0100) Subject: Merge branch 'master' into extract_info_rewrite X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=cfa90f4adce8b5e7faf92d0a08abe38630b150b8;hp=-c;p=youtube-dl Merge branch 'master' into extract_info_rewrite --- cfa90f4adce8b5e7faf92d0a08abe38630b150b8 diff --combined youtube_dl/FileDownloader.py index 68fad11bc,725d4a016..6af2acbee --- a/youtube_dl/FileDownloader.py +++ b/youtube_dl/FileDownloader.py @@@ -78,7 -78,11 +78,11 @@@ class FileDownloader(object) updatetime: Use the Last-modified header to set output file timestamps. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file - writesubtitles: Write the video subtitles to a .srt file + writesubtitles: Write the video subtitles to a file + onlysubtitles: Downloads only the subtitles of the video + allsubtitles: Downloads all the subtitles of the video + listsubtitles: Lists all available subtitles for the video + subtitlesformat: Subtitle format [sbv/srt] (default=srt) subtitleslang: Language of the subtitles to download test: Download only first bytes to test the downloader. keepvideo: Keep the video file after post-processing @@@ -301,9 -305,9 +305,9 @@@ """ Report that the description file is being written """ self.to_screen(u'[info] Writing video description to: ' + descfn) - def report_writesubtitles(self, srtfn): + def report_writesubtitles(self, sub_filename): """ Report that the subtitles file is being written """ - self.to_screen(u'[info] Writing video subtitles to: ' + srtfn) + self.to_screen(u'[info] Writing video subtitles to: ' + sub_filename) def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ @@@ -372,8 -376,11 +376,11 @@@ filename = self.params['outtmpl'] % template_dict return filename - except (ValueError, KeyError) as err: - self.trouble(u'ERROR: invalid system charset or erroneous output template') + except KeyError as err: + self.trouble(u'ERROR: Erroneous output template') + return None + except ValueError as err: + self.trouble(u'ERROR: Insufficient system charset ' + repr(preferredencoding())) return None def _match_entry(self, info_dict): @@@ -389,72 -396,6 +396,72 @@@ if re.search(rejecttitle, title, re.IGNORECASE): return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' return None + + def extract_info(self, url): + ''' + Returns a list with a dictionary for each video we find. + ''' + suitable_found = False + for ie in self._ies: + # Go to next InfoExtractor if not suitable + if not ie.suitable(url): + continue + + # Warn if the _WORKING attribute is False + if not ie.working(): + self.to_stderr(u'WARNING: the program functionality for this site has been marked as broken, ' + u'and will probably not work. If you want to go on, use the -i option.') + + # Suitable InfoExtractor found + suitable_found = True + + # Extract information from URL and process it + try: + ie_results = ie.extract(url) + results = self.process_ie_results(ie_results, ie) + return results + except ExtractorError as de: # An error we somewhat expected + self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback()) + break + except Exception as e: + if self.params.get('ignoreerrors', False): + self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc())) + break + else: + raise + if not suitable_found: + self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url) + def extract_info_iterable(self, urls): + ''' + Return the videos founded for the urls + ''' + results = [] + for url in urls: + results.extend(self.extract_info(url)) + return results + + def process_ie_results(self, ie_results, ie): + """ + Take the results of the ie and return a list of videos. + For url elements it will seartch the suitable ie and get the videos + For playlist elements it will process each of the elements of the 'entries' key + """ + results = [] + for result in ie_results or []: + result_type = result.get('_type', 'video') #If not given we suppose it's a video, support the dafault old system + if result_type == 'video': + if not 'extractor' in result: + #The extractor has already been set somewhere else + result['extractor'] = ie.IE_NAME + results.append(result) + elif result_type == 'url': + #We get the videos pointed by the url + results.extend(self.extract_info(result['url'])) + elif result_type == 'playlist': + #We process each entry in the playlist + entries_result = self.process_ie_results(result['entries'], ie) + results.extend(entries_result) + return results def process_info(self, info_dict): """Process a single dictionary returned by an InfoExtractor.""" @@@ -519,14 -460,35 +526,35 @@@ if self.params.get('writesubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE + subtitle = info_dict['subtitles'][0] + (sub_error, sub_lang, sub) = subtitle + sub_format = self.params.get('subtitlesformat') try: - srtfn = filename.rsplit('.', 1)[0] + u'.srt' - self.report_writesubtitles(srtfn) - with io.open(encodeFilename(srtfn), 'w', encoding='utf-8') as srtfile: - srtfile.write(info_dict['subtitles']) + sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + self.report_writesubtitles(sub_filename) + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + subfile.write(sub) except (OSError, IOError): self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) return + if self.params.get('onlysubtitles', False): + return + + if self.params.get('allsubtitles', False) and 'subtitles' in info_dict and info_dict['subtitles']: + subtitles = info_dict['subtitles'] + sub_format = self.params.get('subtitlesformat') + for subtitle in subtitles: + (sub_error, sub_lang, sub) = subtitle + try: + sub_filename = filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format + self.report_writesubtitles(sub_filename) + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + subfile.write(sub) + except (OSError, IOError): + self.trouble(u'ERROR: Cannot write subtitles file ' + descfn) + return + if self.params.get('onlysubtitles', False): + return if self.params.get('writeinfojson', False): infofn = filename + u'.info.json' @@@ -566,14 -528,49 +594,14 @@@ raise SameFileError(self.params['outtmpl']) for url in url_list: - suitable_found = False - for ie in self._ies: - # Go to next InfoExtractor if not suitable - if not ie.suitable(url): - continue - - # Warn if the _WORKING attribute is False - if not ie.working(): - self.report_warning(u'the program functionality for this site has been marked as broken, ' - u'and will probably not work. If you want to go on, use the -i option.') + videos = self.extract_info(url) - # Suitable InfoExtractor found - suitable_found = True - - # Extract information from URL and process it + for video in videos or []: try: - videos = ie.extract(url) - except ExtractorError as de: # An error we somewhat expected - self.trouble(u'ERROR: ' + compat_str(de), de.format_traceback()) - break - except Exception as e: - if self.params.get('ignoreerrors', False): - self.trouble(u'ERROR: ' + compat_str(e), tb=compat_str(traceback.format_exc())) - break - else: - raise - - if len(videos or []) > 1 and self.fixed_template(): - raise SameFileError(self.params['outtmpl']) - - for video in videos or []: - video['extractor'] = ie.IE_NAME - try: - self.increment_downloads() - self.process_info(video) - except UnavailableVideoError: - self.trouble(u'\nERROR: unable to download video') - - # Suitable InfoExtractor had been found; go to next URL - break - - if not suitable_found: - self.trouble(u'ERROR: no suitable InfoExtractor: %s' % url) + self.increment_downloads() + self.process_info(video) + except UnavailableVideoError: + self.trouble(u'\nERROR: unable to download video') return self._download_retcode diff --combined youtube_dl/InfoExtractors.py index e714fa6b0,835428f32..dd4a776e4 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@@ -48,7 -48,7 +48,7 @@@ class InfoExtractor(object) uploader_id: Nickname or id of the video uploader. location: Physical location of the video. player_url: SWF Player URL (used for rtmpdump). - subtitles: The .srt file contents. + subtitles: The subtitle file contents. urlhandle: [internal] The urlHandle to be used to download the file, like returned by urllib.request.urlopen @@@ -126,26 -126,14 +126,32 @@@ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): """ Returns the data of the page as a string """ urlh = self._request_webpage(url_or_request, video_id, note, errnote) + content_type = urlh.headers.get('Content-Type', '') + m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) + if m: + encoding = m.group(1) + else: + encoding = 'utf-8' webpage_bytes = urlh.read() - return webpage_bytes.decode('utf-8', 'replace') + return webpage_bytes.decode(encoding, 'replace') + + #Methods for following #608 + #They set the correct value of the '_type' key + def video_result(self, video_info): + """Returns a video""" + video_info['_type'] = 'video' + return video_info + def url_result(self, url, ie=None): + """Returns a url that points to a page that should be processed""" + #TODO: ie should be the class used for getting the info + video_info = {'_type': 'url', + 'url': url} + return video_info + def playlist_result(self, entries): + """Returns a playlist""" + video_info = {'_type': 'playlist', + 'entries': entries} + return video_info class YoutubeIE(InfoExtractor): @@@ -236,7 -224,16 +242,16 @@@ def report_video_subtitles_download(self, video_id): """Report attempt to download video info webpage.""" - self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id) + self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id) + + def report_video_subtitles_request(self, video_id, sub_lang, format): + """Report attempt to download video info webpage.""" + self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) + + def report_video_subtitles_available(self, video_id, sub_lang_list): + """Report available subtitles.""" + sub_lang = ",".join(list(sub_lang_list.keys())) + self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang)) def report_information_extraction(self, video_id): """Report attempt to extract video information.""" @@@ -250,55 -247,63 +265,63 @@@ """Indicate the download will use the RTMP protocol.""" self._downloader.to_screen(u'[youtube] RTMP download detected') - def _closed_captions_xml_to_srt(self, xml_string): - srt = '' - texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE) - # TODO parse xml instead of regex - for n, (start, dur_tag, dur, caption) in enumerate(texts): - if not dur: dur = '4' - start = float(start) - end = start + float(dur) - start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000) - end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000) - caption = unescapeHTML(caption) - caption = unescapeHTML(caption) # double cycle, intentional - srt += str(n+1) + '\n' - srt += start + ' --> ' + end + '\n' - srt += caption + '\n\n' - return srt - - def _extract_subtitles(self, video_id): + def _get_available_subtitles(self, video_id): self.report_video_subtitles_download(video_id) request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) try: - srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8') + sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list) - srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list) - if not srt_lang_list: - return (u'WARNING: video has no closed captions', None) - if self._downloader.params.get('subtitleslang', False): - srt_lang = self._downloader.params.get('subtitleslang') - elif 'en' in srt_lang_list: - srt_lang = 'en' - else: - srt_lang = list(srt_lang_list.keys())[0] - if not srt_lang in srt_lang_list: - return (u'WARNING: no closed captions found in the specified language', None) + sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) + sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) + if not sub_lang_list: + return (u'WARNING: video doesn\'t have subtitles', None) + return sub_lang_list + + def _list_available_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + self.report_video_subtitles_available(video_id, sub_lang_list) + + def _request_subtitle(self, sub_lang, sub_name, video_id, format): + self.report_video_subtitles_request(video_id, sub_lang, format) params = compat_urllib_parse.urlencode({ - 'lang': srt_lang, - 'name': srt_lang_list[srt_lang].encode('utf-8'), + 'lang': sub_lang, + 'name': sub_name, 'v': video_id, + 'fmt': format, }) url = 'http://www.youtube.com/api/timedtext?' + params try: - srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8') + sub = compat_urllib_request.urlopen(url).read().decode('utf-8') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None) - if not srt_xml: + if not sub: return (u'WARNING: Did not fetch video subtitles', None) - return (None, self._closed_captions_xml_to_srt(srt_xml)) + return (None, sub_lang, sub) + + def _extract_subtitle(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + if self._downloader.params.get('subtitleslang', False): + sub_lang = self._downloader.params.get('subtitleslang') + elif 'en' in sub_lang_list: + sub_lang = 'en' + else: + sub_lang = list(sub_lang_list.keys())[0] + if not sub_lang in sub_lang_list: + return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None) + + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + return [subtitle] + + def _extract_all_subtitles(self, video_id): + sub_lang_list = self._get_available_subtitles(video_id) + sub_format = self._downloader.params.get('subtitlesformat') + subtitles = [] + for sub_lang in sub_lang_list: + subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) + subtitles.append(subtitle) + return subtitles def _print_formats(self, formats): print('Available formats:') @@@ -519,12 -524,26 +542,26 @@@ else: video_description = '' - # closed captions + # subtitles video_subtitles = None + if self._downloader.params.get('writesubtitles', False): - (srt_error, video_subtitles) = self._extract_subtitles(video_id) - if srt_error: - self._downloader.trouble(srt_error) + video_subtitles = self._extract_subtitle(video_id) + if video_subtitles: + (sub_error, sub_lang, sub) = video_subtitles[0] + if sub_error: + self._downloader.trouble(sub_error) + + if self._downloader.params.get('allsubtitles', False): + video_subtitles = self._extract_all_subtitles(video_id) + for video_subtitle in video_subtitles: + (sub_error, sub_lang, sub) = video_subtitle + if sub_error: + self._downloader.trouble(sub_error) + + if self._downloader.params.get('listsubtitles', False): + sub_lang_list = self._list_available_subtitles(video_id) + return if 'length_seconds' not in video_info: self._downloader.trouble(u'WARNING: unable to extract video duration') @@@ -1299,7 -1318,8 +1336,8 @@@ class GenericIE(InfoExtractor) def report_download_webpage(self, video_id): """Report webpage download.""" - self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') + if not self._downloader.params.get('test', False): + self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.') self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id) def report_extraction(self, video_id): @@@ -1311,7 -1331,7 +1349,7 @@@ self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url) def _test_redirect(self, url): - """Check if it is a redirect, like url shorteners, in case restart chain.""" + """Check if it is a redirect, like url shorteners, in case return the new url.""" class HeadRequest(compat_urllib_request.Request): def get_method(self): return "HEAD" @@@ -1362,20 -1382,15 +1400,15 @@@ return False self.report_following_redirect(new_url) - self._downloader.download([new_url]) - return True + return new_url def _real_extract(self, url): - if self._test_redirect(url): return + new_url = self._test_redirect(url) + if new_url: return [self.url_result(new_url)] video_id = url.split('/')[-1] - request = compat_urllib_request.Request(url) try: - self.report_download_webpage(video_id) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err)) - return + webpage = self._download_webpage(url, video_id) except ValueError as err: # since this is the last-resort InfoExtractor, if # this error is thrown, it'll be thrown here @@@ -1774,8 -1789,9 +1807,8 @@@ class YoutubePlaylistIE(InfoExtractor) else: self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos))) - for video in videos: - self._downloader.download([video]) - return + url_results = [self.url_result(url) for url in videos] + return [self.playlist_result(url_results)] class YoutubeChannelIE(InfoExtractor): @@@ -1825,9 -1841,9 +1858,9 @@@ self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - for id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % id]) - return + urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] + url_entries = [self.url_result(url) for url in urls] + return [self.playlist_result(url_entries)] class YoutubeUserIE(InfoExtractor): @@@ -1909,9 -1925,8 +1942,9 @@@ self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" % (username, all_ids_count, len(video_ids))) - for video_id in video_ids: - self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id]) + urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] + url_results = [self.url_result(url) for url in urls] + return [self.playlist_result(url_results)] class BlipTVUserIE(InfoExtractor): @@@ -2001,9 -2016,8 +2034,9 @@@ self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" % (self.IE_NAME, username, all_ids_count, len(video_ids))) - for video_id in video_ids: - self._downloader.download([u'http://blip.tv/'+video_id]) + urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids] + url_entries = [self.url_result(url) for url in urls] + return [self.playlist_result(url_entries)] class DepositFilesIE(InfoExtractor): @@@ -2576,7 -2590,7 +2609,7 @@@ class EscapistIE(InfoExtractor) 'uploader': showName, 'upload_date': None, 'title': showName, - 'ext': 'flv', + 'ext': 'mp4', 'thumbnail': imgUrl, 'description': description, 'player_url': playerUrl, @@@ -3972,11 -3986,11 +4005,11 @@@ class KeekIE(InfoExtractor) webpage = self._download_webpage(url, video_id) m = re.search(r'[\s\n]+

(?P\w+)

', webpage) - uploader = unescapeHTML(m.group('uploader')) + m = re.search(r'
[\S\s]+?

(?P.+?)

', webpage) + uploader = clean_html(m.group('uploader')) info = { - 'id':video_id, - 'url':video_url, + 'id': video_id, + 'url': video_url, 'ext': 'mp4', 'title': title, 'thumbnail': thumbnail, @@@ -4113,6 -4127,40 +4146,40 @@@ class MySpassIE(InfoExtractor) } return [info] + class SpiegelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P[0-9]+)(?:\.html)?$' + + def _real_extract(self, url): + m = re.match(self._VALID_URL, url) + video_id = m.group('videoID') + + webpage = self._download_webpage(url, video_id) + m = re.search(r'
(.*?)
', webpage) + if not m: + raise ExtractorError(u'Cannot find title') + video_title = unescapeHTML(m.group(1)) + + xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' + xml_code = self._download_webpage(xml_url, video_id, + note=u'Downloading XML', errnote=u'Failed to download XML') + + idoc = xml.etree.ElementTree.fromstring(xml_code) + last_type = idoc[-1] + filename = last_type.findall('./filename')[0].text + duration = float(last_type.findall('./duration')[0].text) + + video_url = 'http://video2.spiegel.de/flash/' + filename + video_ext = filename.rpartition('.')[2] + info = { + 'id': video_id, + 'url': video_url, + 'ext': video_ext, + 'title': video_title, + 'duration': duration, + } + return [info] + + def gen_extractors(): """ Return a list of an instance of every supported extractor. The order does matter; the first extractor matched is the one handling the URL. @@@ -4161,6 -4209,7 +4228,7 @@@ KeekIE(), TEDIE(), MySpassIE(), + SpiegelIE(), GenericIE() ]