X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Futils.py;h=40d6823a0f8ddbbf42a54bf4b0aea9e344825600;hb=1a9c655e3b1569f315d4193e877cba0b4a863c63;hp=737cca8e13ad03fffa6848178c65415ec7d7ec7d;hpb=d11d05d07acdd11a93b02d750852dea4ae32be3b;p=youtube-dl

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 737cca8e1..40d6823a0 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -11,40 +11,39 @@ import sys
 import zlib
 import urllib2
 import email.utils
+import json
 
 try:
 	import cStringIO as StringIO
 except ImportError:
 	import StringIO
-		
-try:
-	import json
-except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
-	import trivialjson as json
 
 std_headers = {
-	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
+	'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
 	'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
 	'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 	'Accept-Encoding': 'gzip, deflate',
 	'Accept-Language': 'en-us,en;q=0.5',
 }
 
+try:
+    compat_str = unicode # Python 2: text strings are `unicode`
+except NameError: # no `unicode` builtin available
+    compat_str = str # use `str` as the text string type instead
+
 def preferredencoding():
 	"""Get preferred encoding.
 
 	Returns the best encoding scheme for the system, based on
 	locale.getpreferredencoding() and some further tweaks.
 	"""
-	def yield_preferredencoding():
-		try:
-			pref = locale.getpreferredencoding()
-			u'TEST'.encode(pref)
-		except:
-			pref = 'UTF-8'
-		while True:
-			yield pref
-	return yield_preferredencoding().next()
+	try:
+		pref = locale.getpreferredencoding()
+		u'TEST'.encode(pref) # make sure the reported codec is actually usable
+	except: # lookup failed or codec unusable: fall back to UTF-8
+		pref = 'UTF-8'
+
+	return pref
 
 
 def htmlentity_transform(matchobj):
@@ -73,11 +72,90 @@ def htmlentity_transform(matchobj):
 	# Unknown entity in name, return its literal representation
 	return (u'&%s;' % entity)
 
+HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix: overrides the HTMLParser module's start-tag regex
+class IDParser(HTMLParser.HTMLParser):
+	"""Modified HTMLParser that isolates a tag with the specified id"""
+	def __init__(self, id):
+		self.id = id # value of the id attribute to look for
+		self.result = None # grows to [tag, start position, end position]
+		self.started = False # True while inside the element with the requested id
+		self.depth = {} # open-tag counter per tag name, maintained while started
+		self.html = None # complete document text, set by loads()
+		self.watch_startpos = False # True while the content start position is still to be recorded
+		self.error_count = 0 # parse errors skipped so far
+		HTMLParser.HTMLParser.__init__(self)
+
+	def error(self, message):
+		if self.error_count > 10 or self.started: # too many errors, or error inside the wanted element: give up
+			raise HTMLParser.HTMLParseError(message, self.getpos())
+		self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
+		self.error_count += 1
+		self.goahead(1) # continue parsing with the shortened input
+
+	def loads(self, html):
+		self.html = html # kept for error() and get_result()
+		self.feed(html)
+		self.close()
+
+	def handle_starttag(self, tag, attrs):
+		attrs = dict(attrs)
+		if self.started:
+			self.find_startpos(None) # a child tag can be the first piece of content
+		if 'id' in attrs and attrs['id'] == self.id:
+			self.result = [tag]
+			self.started = True
+			self.watch_startpos = True
+		if self.started: # count nested openings so the matching end tag can be recognised
+			if not tag in self.depth: self.depth[tag] = 0
+			self.depth[tag] += 1
+
+	def handle_endtag(self, tag):
+		if self.started:
+			if tag in self.depth: self.depth[tag] -= 1
+			if self.depth[self.result[0]] == 0: # every opening of the wanted tag is closed again
+				self.started = False
+				self.result.append(self.getpos()) # position where the content ends
+
+	def find_startpos(self, x):
+		"""Needed to put the start position of the result (self.result[1])
+		after the opening tag with the requested id"""
+		if self.watch_startpos:
+			self.watch_startpos = False
+			self.result.append(self.getpos())
+	handle_entityref = handle_charref = handle_data = handle_comment = \
+	handle_decl = handle_pi = unknown_decl = find_startpos # any of these events may be the first content
+
+	def get_result(self): # content of the element as a string, or None
+		if self.result == None: return None # no tag with the requested id was seen
+		if len(self.result) != 3: return None # start or end position is missing
+		lines = self.html.split('\n')
+		lines = lines[self.result[1][0]-1:self.result[2][0]] # only the lines the content spans
+		lines[0] = lines[0][self.result[1][1]:] # cut off everything before the start column
+		if len(lines) == 1: # single line: the end column shifts by the part just cut off
+			lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
+		lines[-1] = lines[-1][:self.result[2][1]] # cut off everything from the end column on
+		return '\n'.join(lines).strip()
+
+def get_element_by_id(id, html):
+	"""Return the content of the tag with the specified id in the passed HTML document"""
+	parser = IDParser(id)
+	try:
+		parser.loads(html)
+	except HTMLParser.HTMLParseError:
+		pass # keep whatever was collected before the parse error
+	return parser.get_result() # None if no complete element was found
+
 
-def sanitize_title(utitle):
-	"""Sanitizes a video title so it could be used as part of a filename."""
-	utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
-	return utitle.replace(unicode(os.sep), u'%')
+def clean_html(html):
+	"""Clean an HTML snippet into a readable string (tags stripped, entities decoded)"""
+	# Newlines in the markup become spaces; only <br> tags yield line breaks
+	html = html.replace('\n', ' ')
+	html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
+	# Strip html tags
+	html = re.sub(r'<.*?>', '', html)
+	# Replace html entities
+	html = unescapeHTML(html)
+	return html
 
 
 def sanitize_open(filename, open_mode):
@@ -115,9 +193,35 @@ def timeconvert(timestr):
 		timestamp = email.utils.mktime_tz(timetuple)
 	return timestamp
 
-def simplify_title(title):
-	expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE)
-	return expr.sub(u'_', title).strip(u'_')
+def sanitize_filename(s, restricted=False):
+	"""Return s made safe for use as part of a filename.
+	With restricted set, only a stricter subset of characters is kept.
+	"""
+	def clean_char(ch):
+		code = ord(ch)
+		if ch == '?' or code < 32 or code == 127:
+			return '' # drop '?' and ASCII control characters
+		if ch == '"':
+			return '\'' if not restricted else ''
+		if ch == ':':
+			return ' -' if not restricted else '_-'
+		if ch in '\\/|*<>':
+			return '_'
+		# Restricted mode also replaces '!', '&', apostrophes, whitespace and non-ASCII
+		if restricted and (ch in '!&\'' or ch.isspace() or code > 127):
+			return '_'
+		return ch
+
+	cleaned = u''.join([clean_char(ch) for ch in s])
+	# Collapse runs of underscores into a single one
+	while '__' in cleaned:
+		cleaned = cleaned.replace('__', '_')
+	cleaned = cleaned.strip('_')
+	# Common case of "Foreign band name - English song title"
+	if restricted and cleaned.startswith('-_'):
+		cleaned = cleaned[2:]
+	# Never return an empty name
+	return cleaned or '_'
 
 def orderedSet(iterable):
 	""" Remove all duplicates from the input iterable """
@@ -133,8 +237,8 @@ def unescapeHTML(s):
 	"""
 	assert type(s) == type(u'')
 
-	htmlParser = HTMLParser.HTMLParser()
-	return htmlParser.unescape(s)
+	result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
+	return result
 
 def encodeFilename(s):
 	"""
@@ -143,7 +247,7 @@ def encodeFilename(s):
 
 	assert type(s) == type(u'')
 
-	if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
+	if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
 		# Pass u'' directly to use Unicode APIs on Windows 2000 and up
 		# (Detecting Windows NT 4 is tricky because 'major >= 4' would
 		# match Windows 9x series as well. Besides, NT 4 is obsolete.)
@@ -208,6 +312,13 @@ class ContentTooShortError(Exception):
 		self.expected = expected
 
 
+class Trouble(Exception):
+	"""Helper exception for reporting trouble.
+
+	Raise it where the problem should be handled by
+	FileDownloader.trouble.
+	"""
+
 class YoutubeDLHandler(urllib2.HTTPHandler):
 	"""Handler for HTTP requests and responses.