X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=41e6b85509be30a723d2d50e5a95a4cc5571333d;hb=73dce4b2e4cb6eea951dbd682a92ad7508c957b0;hp=6e982157c4811b9db43642af3b9ee90d8ab5e376;hpb=3fe294e4ef96317c61398707ed65a9e3f1c281c4;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 6e982157c..41e6b8550 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,49 +2,90 @@ # -*- coding: utf-8 -*- import gzip -import htmlentitydefs -import HTMLParser +import io import locale import os import re import sys import zlib -import urllib2 import email.utils import json try: - import cStringIO as StringIO -except ImportError: - import StringIO + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib as compat_urllib_parse + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + +try: + import html.entities as compat_html_entities +except ImportError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: + import html.parser as compat_html_parser +except ImportError: # Python 2 + import HTMLParser as compat_html_parser + +try: + import http.client as compat_html_client +except ImportError: # Python 2 + import httplib as compat_html_client + +try: + from urllib.parse import parse_qs as compat_parse_qs +except ImportError: # Python 2 + from urlparse import parse_qs as compat_parse_qs + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr + std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } - def preferredencoding(): """Get preferred encoding. Returns the best encoding scheme for the system, based on locale.getpreferredencoding() and some further tweaks. """ - def yield_preferredencoding(): - try: - pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: - pref = 'UTF-8' - while True: - yield pref - return yield_preferredencoding().next() + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' + + return pref def htmlentity_transform(matchobj): - """Transforms an HTML entity to a Unicode character. + """Transforms an HTML entity to a character. This function receives a match object and is intended to be used with the re.sub() function. @@ -52,11 +93,10 @@ def htmlentity_transform(matchobj): entity = matchobj.group(1) # Known non-numeric HTML entity - if entity in htmlentitydefs.name2codepoint: - return unichr(htmlentitydefs.name2codepoint[entity]) + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) - # Unicode character - mobj = re.match(ur'(?u)#(x?\d+)', entity) + mobj = re.match(u'(?u)#(x?\\d+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith(u'x'): @@ -64,13 +104,13 @@ def htmlentity_transform(matchobj): numstr = u'0%s' % numstr else: base = 10 - return unichr(long(numstr, base)) + return compat_chr(int(numstr, base)) # Unknown entity in name, return its literal representation return (u'&%s;' % entity) -HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(HTMLParser.HTMLParser): +compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix +class IDParser(compat_html_parser.HTMLParser): """Modified HTMLParser that isolates a tag with the specified id""" def __init__(self, id): self.id = id @@ -80,12 +120,11 @@ class IDParser(HTMLParser.HTMLParser): self.html = None self.watch_startpos = False self.error_count = 0 - HTMLParser.HTMLParser.__init__(self) + compat_html_parser.HTMLParser.__init__(self) def error(self, message): - print self.getpos() if self.error_count > 10 or self.started: - raise HTMLParser.HTMLParseError(message, self.getpos()) + raise compat_html_parser.HTMLParseError(message, self.getpos()) self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line self.error_count += 1 self.goahead(1) @@ -124,8 +163,10 @@ class IDParser(HTMLParser.HTMLParser): handle_decl = handle_pi = unknown_decl = find_startpos def get_result(self): - if self.result == None: return None - if len(self.result) != 3: return None + if self.result is None: + return None + if len(self.result) != 3: + return None lines = self.html.split('\n') lines = lines[self.result[1][0]-1:self.result[2][0]] lines[0] = lines[0][self.result[1][1]:] @@ -139,7 +180,7 @@ def get_element_by_id(id, html): parser = IDParser(id) try: parser.loads(html) - except HTMLParser.HTMLParseError: + except compat_html_parser.HTMLParseError: pass return parser.get_result() @@ -156,12 +197,6 @@ def clean_html(html): return html -def sanitize_title(utitle): - """Sanitizes a video title so it could be used as part of a filename.""" - utitle = unescapeHTML(utitle) - return utitle.replace(unicode(os.sep), u'%') - - def sanitize_open(filename, open_mode): """Try to open the given filename, and slightly tweak it if this fails. @@ -180,9 +215,9 @@ def sanitize_open(filename, open_mode): return (sys.stdout, filename) stream = open(encodeFilename(filename), open_mode) return (stream, filename) - except (IOError, OSError), err: + except (IOError, OSError) as err: # In case of error, try to remove win32 forbidden chars - filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) + filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename) # An exception here should be caught in the caller stream = open(encodeFilename(filename), open_mode) @@ -197,9 +232,35 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp -def simplify_title(title): - expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) - return expr.sub(u'_', title).strip(u'_') +def sanitize_filename(s, restricted=False): + """Sanitizes a string so it could be used as part of a filename. + If restricted is set, use a stricter subset of allowed characters. + """ + def replace_insane(char): + if char == '?' or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '_-' if restricted else ' -' + elif char in '\\/|*<>': + return '_' + if restricted and (char in '!&\'' or char.isspace()): + return '_' + if restricted and ord(char) > 127: + return '_' + return char + + result = u''.join(map(replace_insane, s)) + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if not result: + result = '_' + return result def orderedSet(iterable): """ Remove all duplicates from the input iterable """ @@ -211,21 +272,21 @@ def orderedSet(iterable): def unescapeHTML(s): """ - @param s a string (of type unicode) + @param s a string """ assert type(s) == type(u'') - result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) return result def encodeFilename(s): """ - @param s The name of the file (of type unicode) + @param s The name of the file """ assert type(s) == type(u'') - if sys.platform == 'win32' and sys.getwindowsversion().major >= 5: + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # Pass u'' directly to use Unicode APIs on Windows 2000 and up # (Detecting Windows NT 4 is tricky because 'major >= 4' would # match Windows 9x series as well. Besides, NT 4 is obsolete.) @@ -290,7 +351,14 @@ class ContentTooShortError(Exception): self.expected = expected -class YoutubeDLHandler(urllib2.HTTPHandler): +class Trouble(Exception): + """Trouble helper exception + + This is an exception to be handled with + FileDownloader.trouble + """ + +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -317,9 +385,9 @@ class YoutubeDLHandler(urllib2.HTTPHandler): @staticmethod def addinfourl_wrapper(stream, headers, url, code): - if hasattr(urllib2.addinfourl, 'getcode'): - return urllib2.addinfourl(stream, headers, url, code) - ret = urllib2.addinfourl(stream, headers, url) + if hasattr(compat_urllib_request.addinfourl, 'getcode'): + return compat_urllib_request.addinfourl(stream, headers, url, code) + ret = compat_urllib_request.addinfourl(stream, headers, url) ret.code = code return ret @@ -338,12 +406,12 @@ class YoutubeDLHandler(urllib2.HTTPHandler): old_resp = resp # gzip if resp.headers.get('Content-encoding', '') == 'gzip': - gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r') + gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r') resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg # deflate if resp.headers.get('Content-encoding', '') == 'deflate': - gz = StringIO.StringIO(self.deflate(resp.read())) + gz = io.BytesIO(self.deflate(resp.read())) resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg return resp