2 # -*- coding: utf-8 -*-
17 import cStringIO as StringIO
22 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
23 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
24 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
25 'Accept-Encoding': 'gzip, deflate',
26 'Accept-Language': 'en-us,en;q=0.5',
# Returns the first value handed out by the nested yield_preferredencoding()
# helper, which starts from locale.getpreferredencoding().
# NOTE(review): calling .next() on the helper's result is Python 2 only;
# next(...) is the spelling that works on 2.6+ and 3.x.
29 def preferredencoding():
30 """Get preferred encoding.
32 Returns the best encoding scheme for the system, based on
33 locale.getpreferredencoding() and some further tweaks.
35 def yield_preferredencoding():
37 pref = locale.getpreferredencoding()
43 return yield_preferredencoding().next()
def htmlentity_transform(matchobj):
    """Transform an HTML entity into the Unicode character it denotes.

    This function receives a match object and is intended to be used as
    the replacement callback of re.sub(); group(1) of the match must hold
    the entity text without the surrounding '&' and ';'.

    Named entities are looked up in htmlentitydefs.name2codepoint.
    Numeric references are accepted in decimal ('#233') and hexadecimal
    ('#xE9' / '#XE9') form. Anything that cannot be resolved, including
    a numeric reference outside the range unichr() supports, is returned
    unchanged as its literal '&...;' text.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference. The hexadecimal branch has to accept
    # the digits a-f as well: matching it with \d alone rejected '#xe9'
    # and silently truncated '#x1F' to '#x1'.
    mobj = re.match(r'(?u)#([xX][0-9a-fA-F]+|\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            # long() wants the '0x' prefix when parsing with base 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return unichr(long(numstr, base))
        except (ValueError, OverflowError):
            # Code point not representable: keep the literal text below
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Overwrites the module-level locatestarttagend pattern of HTMLParser with
# the regex below (per the trailing note, a backported bug fix). Because a
# module attribute is reassigned, the replacement is seen by every user of
# HTMLParser in this process, not only by IDParser.
72 HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
# HTMLParser subclass used to cut the element carrying a given id out of a
# document. The constructor clears the watch_startpos flag (consumed by
# find_startpos) and then runs the base HTMLParser initialiser.
# NOTE(review): the parameter name `id` shadows the builtin of the same name.
73 class IDParser(HTMLParser.HTMLParser):
74 """Modified HTMLParser that isolates a tag with the specified id"""
75 def __init__(self, id):
81 self.watch_startpos = False
83 HTMLParser.HTMLParser.__init__(self)
# Parse-error hook. Raises HTMLParseError (with the current position) once
# error_count exceeds 10 or self.started is set. Otherwise the pending input
# is rebuilt from self.html starting at the line after the current one
# (getpos()[0] is the current 1-based line number), i.e. the offending line
# is dropped.
85 def error(self, message):
87 if self.error_count > 10 or self.started:
88 raise HTMLParser.HTMLParseError(message, self.getpos())
89 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
93 def loads(self, html):
# Start-tag handler. First lets find_startpos record a pending start
# position, then arms watch_startpos when the tag's id attribute equals
# self.id, and makes sure the tag has an entry in the self.depth counters.
# NOTE(review): the `'id' in attrs` / `attrs['id']` lookups assume attrs is
# a mapping here, whereas HTMLParser hands over (name, value) pairs --
# TODO confirm it is converted before this test.
# NOTE(review): `not tag in ...` is usually spelled `tag not in ...`.
98 def handle_starttag(self, tag, attrs):
101 self.find_startpos(None)
102 if 'id' in attrs and attrs['id'] == self.id:
105 self.watch_startpos = True
107 if not tag in self.depth: self.depth[tag] = 0
# End-tag handler. Decrements the depth counter of a known tag; when the
# counter of the tag stored in self.result[0] is at zero, the current
# parser position is appended to self.result.
110 def handle_endtag(self, tag):
112 if tag in self.depth: self.depth[tag] -= 1
113 if self.depth[self.result[0]] == 0:
115 self.result.append(self.getpos())
def find_startpos(self, x):
    """Store the parser position that follows the opening tag with the
    requested id.

    Does nothing unless watch_startpos is set. Otherwise the flag is
    cleared and getpos() is appended to self.result, where it becomes
    self.result[1]. The x argument is ignored; it only exists so that
    the method can double as a one-argument parser event handler.
    """
    if not self.watch_startpos:
        return
    self.watch_startpos = False
    self.result.append(self.getpos())


# Route the remaining one-argument parser events to find_startpos too.
handle_entityref = find_startpos
handle_charref = find_startpos
handle_data = find_startpos
handle_comment = find_startpos
handle_decl = find_startpos
handle_pi = find_startpos
unknown_decl = find_startpos
# Returns the text between the two recorded positions, or None when
# self.result is None or does not hold exactly three items.
# self.result[1] and self.result[2] are used as (line, column) pairs with
# 1-based line numbers: the lines in between are sliced out of self.html,
# the first one is cut at the start column, the last one at the end column,
# and the joined text is returned stripped.
# NOTE(review): `== None` is usually written `is None`.
126 def get_result(self):
127 if self.result == None: return None
128 if len(self.result) != 3: return None
129 lines = self.html.split('\n')
130 lines = lines[self.result[1][0]-1:self.result[2][0]]
131 lines[0] = lines[0][self.result[1][1]:]
133 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
134 lines[-1] = lines[-1][:self.result[2][1]]
135 return '\n'.join(lines).strip()
# Convenience wrapper around IDParser: builds a parser for the id, catches
# HTMLParseError, and returns parser.get_result() -- which is None when no
# complete element was isolated.
# NOTE(review): the parameter name `id` shadows the builtin of the same name.
137 def get_element_by_id(id, html):
138 """Return the content of the tag with the specified id in the passed HTML document"""
139 parser = IDParser(id)
142 except HTMLParser.HTMLParseError:
144 return parser.get_result()
# Visible steps, in order: every newline becomes a space; each <br> / <br/>
# variant (together with the whitespace around it) becomes a newline; any
# remaining <...> tag is removed; finally HTML entities are decoded through
# unescapeHTML.
# NOTE(review): the two patterns are not raw strings; r'...' would avoid
# relying on '\s' being passed through as an unknown string escape.
147 def clean_html(html):
148 """Clean an HTML snippet into a readable string"""
150 html = html.replace('\n', ' ')
151 html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
153 html = re.sub('<.*?>', '', html)
154 # Replace html entities
155 html = unescapeHTML(html)
def sanitize_title(utitle):
    """Make a video title usable as one component of a file name.

    HTML entities in the title are decoded first; afterwards every
    occurrence of the platform's path separator is replaced by '%'.
    """
    decoded = unescapeHTML(utitle)
    separator = unicode(os.sep)
    return decoded.replace(separator, u'%')
# Two outcomes are visible below. One branch hands back sys.stdout itself
# (switched to binary mode through msvcrt on win32) instead of opening a
# file. Otherwise the encoded file name is opened; on IOError/OSError the
# characters / < > : " | ? * are each replaced by '#' and the open is
# retried once, with any failure of that retry left to the caller.
# NOTE(review): `except (IOError, OSError), err:` is Python 2 only syntax.
165 def sanitize_open(filename, open_mode):
166 """Try to open the given filename, and slightly tweak it if this fails.
168 Attempts to open the given filename. If this fails, it tries to change
169 the filename slightly, step by step, until it's either able to open it
170 or it fails and raises a final exception, like the standard open()
173 It returns the tuple (stream, definitive_file_name).
177 if sys.platform == 'win32':
179 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
180 return (sys.stdout, filename)
181 stream = open(encodeFilename(filename), open_mode)
182 return (stream, filename)
183 except (IOError, OSError), err:
184 # In case of error, try to remove win32 forbidden chars
185 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
187 # An exception here should be caught in the caller
188 stream = open(encodeFilename(filename), open_mode)
189 return (stream, filename)
# Parses the string with email.utils.parsedate_tz; only when that yields a
# tuple (not None) is it converted to a timestamp via email.utils.mktime_tz.
192 def timeconvert(timestr):
193 """Convert RFC 2822 defined time string into system timestamp"""
195 timetuple = email.utils.parsedate_tz(timestr)
196 if timetuple is not None:
197 timestamp = email.utils.mktime_tz(timetuple)
def simplify_title(title):
    """Reduce a title to word characters, '-' and single '_' separators.

    Every run of characters that are neither word characters (Unicode
    aware) nor '-' is collapsed into one underscore; leading and trailing
    underscores are then stripped from the result.
    """
    non_word_run = re.compile(u'[^\\w\\d_\\-]+', re.UNICODE)
    collapsed = non_word_run.sub(u'_', title)
    return collapsed.strip(u'_')
204 def orderedSet(iterable):
205 """ Remove all duplicates from the input iterable """
214 @param s a string (of type unicode)
216 assert type(s) == type(u'')
218 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
# Asserts that s is a unicode object. On win32 with a Windows major version
# of 5 or more a dedicated branch applies (see the comments below);
# otherwise s is encoded with the file system encoding and characters that
# cannot be encoded are silently dropped ('ignore').
# NOTE(review): the Python 2 documentation allows sys.getfilesystemencoding()
# to return None, in which case s.encode(None, ...) raises TypeError --
# consider a fallback encoding.
# NOTE(review): assert statements are stripped under -O, so the type check
# is not a reliable input validation.
221 def encodeFilename(s):
223 @param s The name of the file (of type unicode)
226 assert type(s) == type(u'')
228 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
229 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
230 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
231 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
234 return s.encode(sys.getfilesystemencoding(), 'ignore')
# Signals a failed download when the downloader is not configured to carry
# on after errors.
# NOTE(review): this and the following exception classes each derive
# directly from Exception; a shared base class would let callers catch all
# of them with a single except clause.
236 class DownloadError(Exception):
237 """Download Error exception.
239 This exception may be thrown by FileDownloader objects if they are not
240 configured to continue on errors. They will contain the appropriate
# Signals that more than one download would be written to the same file on
# disk.
246 class SameFileError(Exception):
247 """Same File exception.
249 This exception will be thrown by FileDownloader objects if they detect
250 multiple files would have to be downloaded to the same file on disk.
# Signals a failure inside a PostProcessor's run() method.
255 class PostProcessingError(Exception):
256 """Post Processing exception.
258 This exception may be raised by PostProcessor's .run() method to
259 indicate an error in the postprocessing task.
# Signals that the limit given with --max-downloads has been reached.
263 class MaxDownloadsReached(Exception):
264 """ --max-downloads limit has been reached. """
# Signals that the requested format does not exist for the video.
268 class UnavailableVideoError(Exception):
269 """Unavailable Format exception.
271 This exception will be thrown when a video is requested
272 in a format that is not available for that video.
# Signals a download that ended with less data than the server announced.
# The constructor stores both figures on the instance, as .downloaded and
# .expected, so handlers can report them.
277 class ContentTooShortError(Exception):
278 """Content Too Short exception.
280 This exception may be raised by FileDownloader objects when a file they
281 download is too small for what the server announced first, indicating
282 the connection was probably interrupted.
288 def __init__(self, downloaded, expected):
289 self.downloaded = downloaded
290 self.expected = expected
# Helper exception that is meant to be handled by FileDownloader.trouble.
293 class Trouble(Exception):
294 """Trouble helper exception
296 This is an exception to be handled with
297 FileDownloader.trouble
300 class YoutubeDLHandler(urllib2.HTTPHandler):
301 """Handler for HTTP requests and responses.
303 This class, when installed with an OpenerDirector, automatically adds
304 the standard headers to every HTTP request and handles gzipped and
305 deflated responses from web servers. If compression is to be avoided in
306 a particular request, the original request in the program code only has
307 to include the HTTP header "Youtubedl-No-Compression", which will be
308 removed before making the real request.
310 Part of this code was copied from:
312 http://techknack.net/python-urllib2-handlers/
314 Andrew Rowls, the author of that code, agreed to release it to the
321 return zlib.decompress(data, -zlib.MAX_WBITS)
323 return zlib.decompress(data)
# Builds a urllib2.addinfourl response object. When the addinfourl class
# exposes getcode, the status code is passed straight to the constructor;
# otherwise the three-argument constructor form is used.
326 def addinfourl_wrapper(stream, headers, url, code):
327 if hasattr(urllib2.addinfourl, 'getcode'):
328 return urllib2.addinfourl(stream, headers, url, code)
329 ret = urllib2.addinfourl(stream, headers, url)
# Request hook. Adds every entry of std_headers to the outgoing request.
# If the request carries the marker header 'Youtubedl-no-compression', both
# 'Accept-encoding' (when present) and the marker itself are removed, so
# the request goes out without asking for compression.
333 def http_request(self, req):
334 for h in std_headers:
337 req.add_header(h, std_headers[h])
338 if 'Youtubedl-no-compression' in req.headers:
339 if 'Accept-encoding' in req.headers:
340 del req.headers['Accept-encoding']
341 del req.headers['Youtubedl-no-compression']
# Response hook. A 'gzip' Content-encoding is handled by reading the whole
# body and wrapping it in a GzipFile; 'deflate' by reading the whole body,
# passing it through self.deflate and wrapping the result in a StringIO.
# In both cases a replacement response is built with addinfourl_wrapper
# from the headers, url and code of old_resp, and msg is copied over.
# NOTE(review): resp.read() pulls the complete body into memory before it
# is decompressed.
344 def http_response(self, req, resp):
347 if resp.headers.get('Content-encoding', '') == 'gzip':
348 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
349 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
350 resp.msg = old_resp.msg
352 if resp.headers.get('Content-encoding', '') == 'deflate':
353 gz = StringIO.StringIO(self.deflate(resp.read()))
354 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
355 resp.msg = old_resp.msg