2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
# Default HTTP headers. YoutubeDLHandler.http_request() copies every entry
# onto each outgoing request, so all traffic presents itself as the desktop
# Firefox build named in the User-Agent string.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding(), falling back to UTF-8 when the locale
    reports nothing usable.
    """
    try:
        pref = locale.getpreferredencoding()
        # A misconfigured locale can report None or a codec name Python does
        # not know; a trial encode catches both before callers rely on it.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'
    return pref
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a Unicode character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Returns the decoded character, or the literal '&entity;' text when the
    entity is unknown, malformed, or names a code point that cannot be
    represented.
    """
    entity = matchobj.group(1)
    literal = u'&%s;' % entity

    if entity in htmlentitydefs.name2codepoint:
        # Known non-numeric HTML entity
        codepoint = htmlentitydefs.name2codepoint[entity]
    else:
        # Numeric reference. The pattern is anchored and accepts the full
        # hexadecimal alphabet, so '#x1F' decodes to U+001F instead of
        # stopping after the first decimal digit.
        mobj = re.match(r'#(?:[xX]([0-9a-fA-F]+)|([0-9]+))$', entity)
        if mobj is None:
            # Unknown entity in name, return its literal representation
            return literal
        hexdigits, decdigits = mobj.groups()
        if hexdigits is not None:
            codepoint = int(hexdigits, 16)
        else:
            codepoint = int(decdigits, 10)

    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr

    try:
        return to_char(codepoint)
    except (ValueError, OverflowError):
        # Code point outside the range this interpreter can represent.
        return literal
# Overrides the module-level `locatestarttagend` pattern of the stdlib
# HTMLParser module for the whole process; the IDParser class below inherits
# from HTMLParser.HTMLParser. The original author marked this as a backport
# of a bugfix.
HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id"""

    def __init__(self, id):
        self.id = id
        # None until the wanted tag is seen, then [tag, start_pos, end_pos]
        # where both positions are (line, column) pairs from getpos().
        self.result = None
        # True while parsing inside the wanted element.
        self.started = False
        # Open-tag counter per tag name, maintained only while started.
        self.depth = {}
        self.html = None
        self.watch_startpos = False
        self.error_count = 0
        HTMLParser.HTMLParser.__init__(self)

    def error(self, message):
        """Report a parse error and try to resume on the next line.

        Raises HTMLParser.HTMLParseError once more than ten errors were
        seen, or when the error occurs inside the wanted element.
        """
        sys.stderr.write('%s\n' % (self.getpos(),))
        if self.error_count > 10 or self.started:
            raise HTMLParser.HTMLParseError(message, self.getpos())
        # skip one line
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])
        self.error_count += 1
        # NOTE(review): the statement that resumes parsing after the skip is
        # not visible in the reviewed text; goahead(1) is assumed -- confirm.
        self.goahead(1)

    def loads(self, html):
        """Parse the whole document; read the outcome with get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element is complete once every tag of its own name is closed.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event right after the opening tag marks the start.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the wanted tag's start and end positions.

        Returns None when the tag was not found or when either position was
        never recorded.
        """
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share one line, which has just lost its first
            # start_col characters, so the end column shifts by as much.
            end_col -= start_col
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Deliberate best effort: whatever the parser recorded before the
        # error is still returned below (None when nothing usable was found).
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: source newlines become spaces; only <br> tags
    # produce line breaks in the output.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        # NOTE(review): the condition selecting standard output is not
        # visible in the reviewed text; the conventional u'-' is assumed.
        if filename == u'-':
            if sys.platform == 'win32':
                # Put stdout into binary mode so the data is written as-is.
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError):
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(r'[/<>:"\|\?\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns the timestamp as computed by email.utils.mktime_tz(), or None
    when timestr is empty or cannot be parsed.
    """
    if not timestr:
        return None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        return None
    return email.utils.mktime_tz(timetuple)
def sanitize_filename(s):
    """Sanitizes a string so it could be used as part of a filename."""
    def replace_insane(char):
        # NOTE(review): the return values of this helper, and two of its
        # branches, are not visible in the reviewed text. The values below
        # are assumed: the double quote and colon branches cover the
        # remaining characters of the win32-forbidden set that
        # sanitize_open also filters, and '-' matches the dash clean-up
        # further down. Confirm against the upstream file.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '\''
        elif char == ':':
            return ' -'
        elif char in '\\/|*<>':
            return '-'
        return char

    result = u''.join(map(replace_insane, s))
    # Collapse runs of dashes and drop them from both ends.
    while '--' in result:
        result = result.replace('--', '-')
    return result.strip('-')
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    order the elements were first seen.
    """
    seen = set()
    res = []
    for el in iterable:
        try:
            if el in seen:
                continue
            seen.add(el)
        except TypeError:
            # Unhashable element: fall back to a linear search.
            if el in res:
                continue
        res.append(el)
    return res
def unescapeHTML(s):
    """Replace the HTML entities in s with the characters they stand for.

    @param s a string (of type unicode)
    """
    assert type(s) == type(u'')

    # Every '&...;' span is handed to htmlentity_transform, which returns
    # unknown entities unchanged.
    result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Prepare a file name for the file system functions.

    @param s The name of the file (of type unicode)

    Returns s itself on Windows 2000 and up, otherwise s encoded with the
    file system encoding; characters that encoding cannot represent are
    dropped.
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    # getfilesystemencoding() may return None when the system does not
    # report an encoding; encode() would then raise TypeError.
    encoding = sys.getfilesystemencoding() or 'utf-8'
    return s.encode(encoding, 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.

    Attributes:
        downloaded: the amount that was actually received.
        expected: the amount the server announced.
    (presumably both are byte counts -- confirm against the caller)
    """

    def __init__(self, downloaded, expected):
        # Initialise the base class as well, so that args, repr() and
        # pickling carry the two values instead of an empty tuple.
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
class YoutubeDLHandler(urllib2.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to its release.
    NOTE(review): the end of the sentence above, which states the terms, is
    not visible in the reviewed text -- restore it from the upstream file.
    """

    @staticmethod
    def deflate(data):
        """Decompress a "deflate" body, raw or zlib-wrapped."""
        try:
            # A negative window size tells zlib to expect a raw stream.
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response that carries the status code."""
        if hasattr(urllib2.addinfourl, 'getcode'):
            return urllib2.addinfourl(stream, headers, url, code)
        # addinfourl without getcode takes no code argument; set it by hand.
        ret = urllib2.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Apply the standard headers and the no-compression marker."""
        for h in std_headers:
            # NOTE(review): the two statements preceding add_header are not
            # visible in the reviewed text; removal of an entry stored under
            # the exact same key is assumed -- confirm.
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        """Return resp, wrapped in a decompressing reader when needed."""
        import io  # bytes buffer that behaves the same on Python 2 and 3

        old_resp = resp
        # Content-coding names are case-insensitive, so normalise once.
        encoding = resp.headers.get('Content-encoding', '').strip().lower()
        if encoding == 'gzip':
            body = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
        elif encoding == 'deflate':
            body = io.BytesIO(self.deflate(resp.read()))
        else:
            return resp
        resp = self.addinfourl_wrapper(body, old_resp.headers, old_resp.url, old_resp.code)
        resp.msg = old_resp.msg
        return resp