From fefb166c52e00075a776dba3440039c046998eb4 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Sun, 8 Jan 2012 17:20:01 +0100 Subject: [PATCH] Leave out characters the filesystem cannot encode (Closes: #264) --- devscripts/posix-locale.sh | 6 + youtube_dl/__init__.py | 301 +++++++++++++++++++------------------ 2 files changed, 161 insertions(+), 146 deletions(-) create mode 100644 devscripts/posix-locale.sh diff --git a/devscripts/posix-locale.sh b/devscripts/posix-locale.sh new file mode 100644 index 000000000..0aa7a592d --- /dev/null +++ b/devscripts/posix-locale.sh @@ -0,0 +1,6 @@ + +# source this file in your shell to get a POSIX locale (which will break many programs, but that's kind of the point) + +export LC_ALL=POSIX +export LANG=POSIX +export LANGUAGE=POSIX diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index f6d6e7515..0f9eeccbf 100755 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -259,14 +259,14 @@ def sanitize_open(filename, open_mode): import msvcrt msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) return (sys.stdout, filename) - stream = open(filename, open_mode) + stream = open(_encodeFilename(filename), open_mode) return (stream, filename) except (IOError, OSError), err: # In case of error, try to remove win32 forbidden chars filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) # An exception here should be caught in the caller - stream = open(filename, open_mode) + stream = open(_encodeFilename(filename), open_mode) return (stream, filename) @@ -291,13 +291,21 @@ def _orderedSet(iterable): return res def _unescapeHTML(s): - """ - @param s a string (of type unicode) - """ - assert type(s) == type(u'') + """ + @param s a string (of type unicode) + """ + assert type(s) == type(u'') + + htmlParser = HTMLParser.HTMLParser() + return htmlParser.unescape(s) - htmlParser = HTMLParser.HTMLParser() - return htmlParser.unescape(s) +def _encodeFilename(s): + """ + @param s The name of the file (of type unicode) + """ + + assert type(s) == type(u'') + return s.encode(sys.getfilesystemencoding(), 'ignore') class DownloadError(Exception): """Download Error exception. @@ -563,16 +571,17 @@ class FileDownloader(object): self._pps.append(pp) pp.set_downloader(self) - def to_screen(self, message, skip_eol=False, ignore_encoding_errors=False): + def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - try: - if not self.params.get('quiet', False): - terminator = [u'\n', u''][skip_eol] - print >>self._screen_file, (u'%s%s' % (message, terminator)).encode(preferredencoding()), + assert type(message) == type(u'') + if not self.params.get('quiet', False): + terminator = [u'\n', u''][skip_eol] + output = message + terminator + + if 'b' not in self._screen_file.mode or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr + output = output.encode(preferredencoding(), 'ignore') + self._screen_file.write(output) self._screen_file.flush() - except (UnicodeEncodeError), err: - if not ignore_encoding_errors: - raise def to_stderr(self, message): """Print message to stderr.""" @@ -622,7 +631,7 @@ class FileDownloader(object): def temp_name(self, filename): """Returns a temporary filename for the given filename.""" if self.params.get('nopart', False) or filename == u'-' or \ - (os.path.exists(filename) and not os.path.isfile(filename)): + (os.path.exists(_encodeFilename(filename)) and not os.path.isfile(_encodeFilename(filename))): return filename return filename + u'.part' @@ -635,7 +644,7 @@ class FileDownloader(object): try: if old_filename == new_filename: return - os.rename(old_filename, new_filename) + os.rename(_encodeFilename(old_filename), _encodeFilename(new_filename)) except (IOError, OSError), err: self.trouble(u'ERROR: unable to rename file') @@ -643,7 +652,7 @@ class FileDownloader(object): """Try to set the last-modified time of the given file.""" if last_modified_hdr is None: return - if not os.path.isfile(filename): + if not os.path.isfile(_encodeFilename(filename)): return timestr = last_modified_hdr if timestr is None: @@ -659,15 +668,15 @@ class FileDownloader(object): def report_writedescription(self, descfn): """ Report that the description file is being written """ - self.to_screen(u'[info] Writing video description to: %s' % descfn, ignore_encoding_errors=True) + self.to_screen(u'[info] Writing video description to: ' + descfn) def report_writeinfojson(self, infofn): """ Report that the metadata file has been written """ - self.to_screen(u'[info] Video description metadata as JSON to: %s' % infofn, ignore_encoding_errors=True) + self.to_screen(u'[info] Video description metadata as JSON to: ' + infofn) def report_destination(self, filename): """Report destination filename.""" - self.to_screen(u'[download] Destination: %s' % filename, ignore_encoding_errors=True) + self.to_screen(u'[download] Destination: ' + filename) def report_progress(self, percent_str, data_len_str, speed_str, eta_str): """Report download progress.""" @@ -769,8 +778,8 @@ class FileDownloader(object): return try: - dn = os.path.dirname(filename) - if dn != '' and not os.path.exists(dn): + dn = os.path.dirname(_encodeFilename(filename)) + if dn != '' and not os.path.exists(dn): # dn is already encoded os.makedirs(dn) except (OSError, IOError), err: self.trouble(u'ERROR: unable to create directory ' + unicode(err)) @@ -778,9 +787,9 @@ class FileDownloader(object): if self.params.get('writedescription', False): try: - descfn = filename + '.description' + descfn = filename + u'.description' self.report_writedescription(descfn) - descfile = open(descfn, 'wb') + descfile = open(_encodeFilename(descfn), 'wb') try: descfile.write(info_dict['description'].encode('utf-8')) finally: @@ -790,7 +799,7 @@ class FileDownloader(object): return if self.params.get('writeinfojson', False): - infofn = filename + '.info.json' + infofn = filename + u'.info.json' self.report_writeinfojson(infofn) try: json.dump @@ -798,7 +807,7 @@ class FileDownloader(object): self.trouble(u'ERROR: No JSON encoder found. Update to Python 2.6+, setup a json module, or leave out --write-info-json.') return try: - infof = open(infofn, 'wb') + infof = open(_encodeFilename(infofn), 'wb') try: json_info_dict = dict((k,v) for k,v in info_dict.iteritems() if not k in ('urlhandle',)) json.dump(json_info_dict, infof) @@ -809,7 +818,7 @@ class FileDownloader(object): return if not self.params.get('skip_download', False): - if self.params.get('nooverwrites', False) and os.path.exists(filename): + if self.params.get('nooverwrites', False) and os.path.exists(_encodeFilename(filename)): success = True else: try: @@ -882,11 +891,11 @@ class FileDownloader(object): basic_args = ['rtmpdump', '-q'] + [[], ['-W', player_url]][player_url is not None] + ['-r', url, '-o', tmpfilename] retval = subprocess.call(basic_args + [[], ['-e', '-k', '1']][self.params.get('continuedl', False)]) while retval == 2 or retval == 1: - prevsize = os.path.getsize(tmpfilename) + prevsize = os.path.getsize(_encodeFilename(tmpfilename)) self.to_screen(u'\r[rtmpdump] %s bytes' % prevsize, skip_eol=True) time.sleep(5.0) # This seems to be needed retval = subprocess.call(basic_args + ['-e'] + [[], ['-k', '1']][retval == 1]) - cursize = os.path.getsize(tmpfilename) + cursize = os.path.getsize(_encodeFilename(tmpfilename)) if prevsize == cursize and retval == 1: break # Some rtmp streams seem abort after ~ 99.8%. Don't complain for those @@ -895,7 +904,7 @@ class FileDownloader(object): retval = 0 break if retval == 0: - self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(tmpfilename)) + self.to_screen(u'\r[rtmpdump] %s bytes' % os.path.getsize(_encodeFilename(tmpfilename))) self.try_rename(tmpfilename, filename) return True else: @@ -907,7 +916,7 @@ class FileDownloader(object): player_url = info_dict.get('player_url', None) # Check file already present - if self.params.get('continuedl', False) and os.path.isfile(filename) and not self.params.get('nopart', False): + if self.params.get('continuedl', False) and os.path.isfile(_encodeFilename(filename)) and not self.params.get('nopart', False): self.report_file_already_downloaded(filename) return True @@ -924,8 +933,8 @@ class FileDownloader(object): request = urllib2.Request(url, None, headers) # Establish possible resume length - if os.path.isfile(tmpfilename): - resume_len = os.path.getsize(tmpfilename) + if os.path.isfile(_encodeFilename(tmpfilename)): + resume_len = os.path.getsize(_encodeFilename(tmpfilename)) else: resume_len = 0 @@ -3750,17 +3759,17 @@ class MixcloudIE(InfoExtractor): try: # Process file information self._downloader.process_info({ - 'id': file_id.decode('utf-8'), - 'url': file_url.decode('utf-8'), + 'id': file_id.decode('utf-8'), + 'url': file_url.decode('utf-8'), 'uploader': uploader.decode('utf-8'), - 'upload_date': u'NA', - 'title': json_data['name'], - 'stitle': _simplify_title(json_data['name']), - 'ext': file_url.split('.')[-1].decode('utf-8'), - 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), - 'thumbnail': json_data['thumbnail_url'], - 'description': json_data['description'], - 'player_url': player_url.decode('utf-8'), + 'upload_date': u'NA', + 'title': json_data['name'], + 'stitle': _simplify_title(json_data['name']), + 'ext': file_url.split('.')[-1].decode('utf-8'), + 'format': (format_param is None and u'NA' or format_param.decode('utf-8')), + 'thumbnail': json_data['thumbnail_url'], + 'description': json_data['description'], + 'player_url': player_url.decode('utf-8'), }) except UnavailableVideoError, err: self._downloader.trouble(u'ERROR: unable to download file') @@ -3885,98 +3894,98 @@ class StanfordOpenClassroomIE(InfoExtractor): self.extract(entry['url']) class MTVIE(InfoExtractor): - """Information extractor for MTV.com""" - - _VALID_URL = r'^(?Phttps?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P[0-9]+)/[^/]+$' - IE_NAME = u'mtv' - - def report_webpage(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) - - def report_extraction(self, video_id): - """Report information extraction.""" - self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - self._downloader.trouble(u'ERROR: invalid URL: %s' % url) - return - if not mobj.group('proto'): - url = 'http://' + url - video_id = mobj.group('videoid') - self.report_webpage(video_id) - - request = urllib2.Request(url) - try: - webpage = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) - return - - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract song name') - return - song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract performer') - return - performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) - video_title = performer + ' - ' + song_name - - mobj = re.search(r'', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to mtvn_uri') - return - mtvn_uri = mobj.group(1) - - mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) - if mobj is None: - self._downloader.trouble(u'ERROR: unable to extract content id') - return - content_id = mobj.group(1) - - videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri - self.report_extraction(video_id) - request = urllib2.Request(videogen_url) - try: - metadataXml = urllib2.urlopen(request).read() - except (urllib2.URLError, httplib.HTTPException, socket.error), err: - self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err)) - return - - mdoc = xml.etree.ElementTree.fromstring(metadataXml) - renditions = mdoc.findall('.//rendition') - - # For now, always pick the highest quality. - rendition = renditions[-1] - - try: - _,_,ext = rendition.attrib['type'].partition('/') - format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] - video_url = rendition.find('./src').text - except KeyError: - self._downloader.trouble('Invalid rendition field.') - return - - self._downloader.increment_downloads() - info = { - 'id': video_id, - 'url': video_url, - 'uploader': performer, - 'title': video_title, - 'stitle': _simplify_title(video_title), - 'ext': ext, - 'format': format, - } - - try: - self._downloader.process_info(info) - except UnavailableVideoError, err: - self._downloader.trouble(u'\nERROR: unable to download ' + video_id) + """Information extractor for MTV.com""" + + _VALID_URL = r'^(?Phttps?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P[0-9]+)/[^/]+$' + IE_NAME = u'mtv' + + def report_webpage(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)) + + def report_extraction(self, video_id): + """Report information extraction.""" + self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + self._downloader.trouble(u'ERROR: invalid URL: %s' % url) + return + if not mobj.group('proto'): + url = 'http://' + url + video_id = mobj.group('videoid') + self.report_webpage(video_id) + + request = urllib2.Request(url) + try: + webpage = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % str(err)) + return + + mobj = re.search(r'', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract song name') + return + song_name = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) + mobj = re.search(r'', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract performer') + return + performer = _unescapeHTML(mobj.group(1).decode('iso-8859-1')) + video_title = performer + ' - ' + song_name + + mobj = re.search(r'', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to mtvn_uri') + return + mtvn_uri = mobj.group(1) + + mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage) + if mobj is None: + self._downloader.trouble(u'ERROR: unable to extract content id') + return + content_id = mobj.group(1) + + videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri + self.report_extraction(video_id) + request = urllib2.Request(videogen_url) + try: + metadataXml = urllib2.urlopen(request).read() + except (urllib2.URLError, httplib.HTTPException, socket.error), err: + self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % str(err)) + return + + mdoc = xml.etree.ElementTree.fromstring(metadataXml) + renditions = mdoc.findall('.//rendition') + + # For now, always pick the highest quality. + rendition = renditions[-1] + + try: + _,_,ext = rendition.attrib['type'].partition('/') + format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate'] + video_url = rendition.find('./src').text + except KeyError: + self._downloader.trouble('Invalid rendition field.') + return + + self._downloader.increment_downloads() + info = { + 'id': video_id, + 'url': video_url, + 'uploader': performer, + 'title': video_title, + 'stitle': _simplify_title(video_title), + 'ext': ext, + 'format': format, + } + + try: + self._downloader.process_info(info) + except UnavailableVideoError, err: + self._downloader.trouble(u'\nERROR: unable to download ' + video_id) class PostProcessor(object): @@ -4042,7 +4051,7 @@ class FFmpegExtractAudioPP(PostProcessor): @staticmethod def get_audio_codec(path): try: - cmd = ['ffprobe', '-show_streams', '--', path] + cmd = ['ffprobe', '-show_streams', '--', _encodeFilename(path)] handle = subprocess.Popen(cmd, stderr=file(os.path.devnull, 'w'), stdout=subprocess.PIPE) output = handle.communicate()[0] if handle.wait() != 0: @@ -4063,7 +4072,7 @@ class FFmpegExtractAudioPP(PostProcessor): acodec_opts = [] else: acodec_opts = ['-acodec', codec] - cmd = ['ffmpeg', '-y', '-i', path, '-vn'] + acodec_opts + more_opts + ['--', out_path] + cmd = ['ffmpeg', '-y', '-i', _encodeFilename(path), '-vn'] + acodec_opts + more_opts + ['--', _encodeFilename(out_path)] try: p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout,stderr = p.communicate() @@ -4124,9 +4133,9 @@ class FFmpegExtractAudioPP(PostProcessor): extension = 'wav' more_opts += ['-f', 'wav'] - (prefix, ext) = os.path.splitext(path) - new_path = prefix + '.' + extension - self._downloader.to_screen(u'[ffmpeg] Destination: %s' % new_path) + prefix, sep, ext = path.rpartition(u'.') # not os.path.splitext, since the latter does not work on unicode in all setups + new_path = prefix + sep + extension + self._downloader.to_screen(u'[ffmpeg] Destination: ' + new_path) try: self.run_ffmpeg(path, new_path, acodec, more_opts) except: @@ -4140,13 +4149,13 @@ class FFmpegExtractAudioPP(PostProcessor): # Try to update the date time for extracted audio file. if information.get('filetime') is not None: try: - os.utime(new_path, (time.time(), information['filetime'])) + os.utime(_encodeFilename(new_path), (time.time(), information['filetime'])) except: self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file') if not self._keepvideo: try: - os.remove(path) + os.remove(_encodeFilename(path)) except (IOError, OSError): self._downloader.to_stderr(u'WARNING: Unable to remove downloaded video file') return None @@ -4194,9 +4203,9 @@ def parseOpts(): import optparse import shlex - def _readOptions(filename): + def _readOptions(filename_bytes): try: - optionf = open(filename) + optionf = open(filename_bytes) except IOError: return [] # silently skip if file is not present try: -- 2.39.5