X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=c4917012bbc8d329a31b1358ece863b86d3f8631;hb=a8156c1d2e4b2a7ac5e034c247c6fccaca15a21d;hp=d18073d72894c905b92233d0493cf83d525a6607;hpb=0b8c922da91fb7238ea15434d6a4792da84015bf;p=youtube-dl diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index d18073d72..c4917012b 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2,14 +2,11 @@ # -*- coding: utf-8 -*- import gzip -import htmlentitydefs -import HTMLParser import locale import os import re import sys import zlib -import urllib2 import email.utils import json @@ -18,33 +15,71 @@ try: except ImportError: import StringIO +try: + import urllib.request as compat_urllib_request +except ImportError: # Python 2 + import urllib2 as compat_urllib_request + +try: + import urllib.error as compat_urllib_error +except ImportError: # Python 2 + import urllib2 as compat_urllib_error + +try: + import urllib.parse as compat_urllib_parse +except ImportError: # Python 2 + import urllib as compat_urllib_parse + +try: + import http.cookiejar as compat_cookiejar +except ImportError: # Python 2 + import cookielib as compat_cookiejar + +try: + import html.entities as compat_html_entities +except NameError: # Python 2 + import htmlentitydefs as compat_html_entities + +try: + import html.parser as compat_html_parser +except NameError: # Python 2 + import HTMLParser as compat_html_parser + +try: + compat_str = unicode # Python 2 +except NameError: + compat_str = str + +try: + compat_chr = unichr # Python 2 +except NameError: + compat_chr = chr + + std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', 'Accept-Language': 'en-us,en;q=0.5', } - def preferredencoding(): """Get preferred encoding. Returns the best encoding scheme for the system, based on locale.getpreferredencoding() and some further tweaks. """ - def yield_preferredencoding(): - try: - pref = locale.getpreferredencoding() - u'TEST'.encode(pref) - except: - pref = 'UTF-8' - while True: - yield pref - return yield_preferredencoding().next() + try: + pref = locale.getpreferredencoding() + u'TEST'.encode(pref) + except: + pref = 'UTF-8' + + return pref def htmlentity_transform(matchobj): - """Transforms an HTML entity to a Unicode character. + """Transforms an HTML entity to a character. This function receives a match object and is intended to be used with the re.sub() function. @@ -52,11 +87,10 @@ def htmlentity_transform(matchobj): entity = matchobj.group(1) # Known non-numeric HTML entity - if entity in htmlentitydefs.name2codepoint: - return unichr(htmlentitydefs.name2codepoint[entity]) + if entity in compat_html_entities.name2codepoint: + return compat_chr(compat_html_entities.name2codepoint[entity]) - # Unicode character - mobj = re.match(ur'(?u)#(x?\d+)', entity) + mobj = re.match(u'(?u)#(x?\\d+)', entity) if mobj is not None: numstr = mobj.group(1) if numstr.startswith(u'x'): @@ -64,13 +98,13 @@ def htmlentity_transform(matchobj): numstr = u'0%s' % numstr else: base = 10 - return unichr(long(numstr, base)) + return compat_chr(int(numstr, base)) # Unknown entity in name, return its literal representation return (u'&%s;' % entity) -HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix -class IDParser(HTMLParser.HTMLParser): +compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix +class IDParser(compat_html_parser.HTMLParser): """Modified HTMLParser that isolates a tag with the specified id""" def __init__(self, id): self.id = id @@ -80,12 +114,11 @@ class IDParser(HTMLParser.HTMLParser): self.html = None self.watch_startpos = False self.error_count = 0 - HTMLParser.HTMLParser.__init__(self) + compat_html_parser.HTMLParser.__init__(self) def error(self, message): - print self.getpos() if self.error_count > 10 or self.started: - raise HTMLParser.HTMLParseError(message, self.getpos()) + raise compat_html_parser.HTMLParseError(message, self.getpos()) self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line self.error_count += 1 self.goahead(1) @@ -124,8 +157,10 @@ class IDParser(HTMLParser.HTMLParser): handle_decl = handle_pi = unknown_decl = find_startpos def get_result(self): - if self.result == None: return None - if len(self.result) != 3: return None + if self.result is None: + return None + if len(self.result) != 3: + return None lines = self.html.split('\n') lines = lines[self.result[1][0]-1:self.result[2][0]] lines[0] = lines[0][self.result[1][1]:] @@ -139,7 +174,7 @@ def get_element_by_id(id, html): parser = IDParser(id) try: parser.loads(html) - except HTMLParser.HTMLParseError: + except compat_html_parser.HTMLParseError: pass return parser.get_result() @@ -156,12 +191,6 @@ def clean_html(html): return html -def sanitize_title(utitle): - """Sanitizes a video title so it could be used as part of a filename.""" - utitle = unescapeHTML(utitle) - return utitle.replace(unicode(os.sep), u'%') - - def sanitize_open(filename, open_mode): """Try to open the given filename, and slightly tweak it if this fails. @@ -180,9 +209,9 @@ def sanitize_open(filename, open_mode): return (sys.stdout, filename) stream = open(encodeFilename(filename), open_mode) return (stream, filename) - except (IOError, OSError), err: + except (IOError, OSError) as err: # In case of error, try to remove win32 forbidden chars - filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename) + filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename) # An exception here should be caught in the caller stream = open(encodeFilename(filename), open_mode) @@ -197,9 +226,35 @@ def timeconvert(timestr): timestamp = email.utils.mktime_tz(timetuple) return timestamp -def simplify_title(title): - expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) - return expr.sub(u'_', title).strip(u'_') +def sanitize_filename(s, restricted=False): + """Sanitizes a string so it could be used as part of a filename. + If restricted is set, use a stricter subset of allowed characters. + """ + def replace_insane(char): + if char == '?' or ord(char) < 32 or ord(char) == 127: + return '' + elif char == '"': + return '' if restricted else '\'' + elif char == ':': + return '_-' if restricted else ' -' + elif char in '\\/|*<>': + return '_' + if restricted and (char in '!&\'' or char.isspace()): + return '_' + if restricted and ord(char) > 127: + return '_' + return char + + result = u''.join(map(replace_insane, s)) + while '__' in result: + result = result.replace('__', '_') + result = result.strip('_') + # Common case of "Foreign band name - English song title" + if restricted and result.startswith('-_'): + result = result[2:] + if not result: + result = '_' + return result def orderedSet(iterable): """ Remove all duplicates from the input iterable """ @@ -211,21 +266,21 @@ def orderedSet(iterable): def unescapeHTML(s): """ - @param s a string (of type unicode) + @param s a string """ assert type(s) == type(u'') - result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s) + result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s) return result def encodeFilename(s): """ - @param s The name of the file (of type unicode) + @param s The name of the file """ assert type(s) == type(u'') - if sys.platform == 'win32' and sys.getwindowsversion().major >= 5: + if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: # Pass u'' directly to use Unicode APIs on Windows 2000 and up # (Detecting Windows NT 4 is tricky because 'major >= 4' would # match Windows 9x series as well. Besides, NT 4 is obsolete.) @@ -292,12 +347,12 @@ class ContentTooShortError(Exception): class Trouble(Exception): """Trouble helper exception - + This is an exception to be handled with FileDownloader.trouble """ -class YoutubeDLHandler(urllib2.HTTPHandler): +class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -324,9 +379,9 @@ class YoutubeDLHandler(urllib2.HTTPHandler): @staticmethod def addinfourl_wrapper(stream, headers, url, code): - if hasattr(urllib2.addinfourl, 'getcode'): - return urllib2.addinfourl(stream, headers, url, code) - ret = urllib2.addinfourl(stream, headers, url) + if hasattr(compat_urllib_request.addinfourl, 'getcode'): + return compat_urllib_request.addinfourl(stream, headers, url, code) + ret = compat_urllib_request.addinfourl(stream, headers, url) ret.code = code return ret