2 # -*- coding: utf-8 -*-
16 import cStringIO as StringIO
21 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
22 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
23 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
24 'Accept-Encoding': 'gzip, deflate',
25 'Accept-Language': 'en-us,en;q=0.5',
29 compat_str = unicode # Python 2
34 import urllib.request as compat_urllib_request
35 except ImportError: # Python 2
36 import urllib2 as compat_urllib_request
39 import urllib.error as compat_urllib_error
40 except ImportError: # Python 2
41 import urllib2 as compat_urllib_error
try:
    import urllib.parse as compat_urllib_parse
except ImportError: # Python 2
    # On Python 2 urlencode()/quote()/unquote() live in urllib; urllib2 only
    # re-exports a few of them and has no urlencode() at all.
    import urllib as compat_urllib_parse
49 import http.cookiejar as compat_cookiejar
50 except ImportError: # Python 2
51 import cookielib as compat_cookiejar
53 def preferredencoding():
54 """Get preferred encoding.
56 Returns the best encoding scheme for the system, based on
57 locale.getpreferredencoding() and some further tweaks.
60 pref = locale.getpreferredencoding()
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity body,
    i.e. the text between '&' and ';'.

    Returns the decoded character, or the literal u'&entity;' text when the
    entity is unknown or its code point cannot be turned into a character.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in htmlentitydefs.name2codepoint:
        return unichr(htmlentitydefs.name2codepoint[entity])

    # Numeric character reference, decimal (#123) or hexadecimal (#x7B).
    # The hexadecimal branch has to accept a-f, and the whole entity must
    # match so that trailing garbage is not silently dropped.
    mobj = re.match(r'(?u)#(?:[xX]([0-9a-fA-F]+)|(\d+))\Z', entity)
    if mobj is not None:
        hexstr, decstr = mobj.groups()
        try:
            if hexstr is not None:
                return unichr(int(hexstr, 16))
            return unichr(int(decstr, 10))
        except (ValueError, OverflowError):
            # Code point outside the range unichr() supports: keep the
            # entity as it was written instead of aborting the whole re.sub()
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
93 HTMLParser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
94 class IDParser(HTMLParser.HTMLParser):
95 """Modified HTMLParser that isolates a tag with the specified id"""
96 def __init__(self, id):
102 self.watch_startpos = False
104 HTMLParser.HTMLParser.__init__(self)
106 def error(self, message):
107 if self.error_count > 10 or self.started:
108 raise HTMLParser.HTMLParseError(message, self.getpos())
109 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
110 self.error_count += 1
113 def loads(self, html):
118 def handle_starttag(self, tag, attrs):
121 self.find_startpos(None)
122 if 'id' in attrs and attrs['id'] == self.id:
125 self.watch_startpos = True
127 if not tag in self.depth: self.depth[tag] = 0
130 def handle_endtag(self, tag):
132 if tag in self.depth: self.depth[tag] -= 1
133 if self.depth[self.result[0]] == 0:
135 self.result.append(self.getpos())
137 def find_startpos(self, x):
138 """Needed to put the start position of the result (self.result[1])
139 after the opening tag with the requested id"""
140 if self.watch_startpos:
141 self.watch_startpos = False
142 self.result.append(self.getpos())
143 handle_entityref = handle_charref = handle_data = handle_comment = \
144 handle_decl = handle_pi = unknown_decl = find_startpos
146 def get_result(self):
147 if self.result is None:
149 if len(self.result) != 3:
151 lines = self.html.split('\n')
152 lines = lines[self.result[1][0]-1:self.result[2][0]]
153 lines[0] = lines[0][self.result[1][1]:]
155 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
156 lines[-1] = lines[-1][:self.result[2][1]]
157 return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    finder = IDParser(id)
    try:
        finder.loads(html)
    except HTMLParser.HTMLParseError:
        # Parsing is best effort: whatever the parser isolated before the
        # error is still reported through get_result()
        pass
    return finder.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Literal newlines are turned into spaces, <br> tags (in any letter case)
    become newlines, all remaining tags are removed and HTML entities are
    decoded with unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw string: '\s' is not a valid escape sequence in a plain literal
    html = re.sub(r'(?i)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
181 def sanitize_open(filename, open_mode):
182 """Try to open the given filename, and slightly tweak it if this fails.
184 Attempts to open the given filename. If this fails, it tries to change
185 the filename slightly, step by step, until it's either able to open it
186 or it fails and raises a final exception, like the standard open()
189 It returns the tuple (stream, definitive_file_name).
193 if sys.platform == 'win32':
195 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
196 return (sys.stdout, filename)
197 stream = open(encodeFilename(filename), open_mode)
198 return (stream, filename)
199 except (IOError, OSError) as err:
200 # In case of error, try to remove win32 forbidden chars
201 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
203 # An exception here should be caught in the caller
204 stream = open(encodeFilename(filename), open_mode)
205 return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # The string could not be parsed as a date
        return None
    return email.utils.mktime_tz(parsed)
216 def sanitize_filename(s, restricted=False):
217 """Sanitizes a string so it could be used as part of a filename.
218 If restricted is set, use a stricter subset of allowed characters.
220 def replace_insane(char):
221 if char == '?' or ord(char) < 32 or ord(char) == 127:
224 return '' if restricted else '\''
226 return '_-' if restricted else ' -'
227 elif char in '\\/|*<>':
229 if restricted and (char in '!&\'' or char.isspace()):
231 if restricted and ord(char) > 127:
235 result = u''.join(map(replace_insane, s))
236 while '__' in result:
237 result = result.replace('__', '_')
238 result = result.strip('_')
239 # Common case of "Foreign band name - English song title"
240 if restricted and result.startswith('-_'):
246 def orderedSet(iterable):
247 """ Remove all duplicates from the input iterable """
258 assert type(s) == type(u'')
260 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
def encodeFilename(s):
    """Prepare a unicode file name for the file system functions.

    @param s The name of the file
    @return s itself on Windows 2000 and up, otherwise s encoded with the
            file system encoding (characters that cannot be encoded are
            dropped)
    """

    assert isinstance(s, type(u''))

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s

    # sys.getfilesystemencoding() may return None on Python 2, which
    # s.encode() does not accept
    encoding = sys.getfilesystemencoding() or 'utf-8'
    return s.encode(encoding, 'ignore')
class DownloadError(Exception):
    """Error raised when a download fails.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors.
    """
class SameFileError(Exception):
    """Error raised on a clash of output files.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
class PostProcessingError(Exception):
    """Error raised by a failed postprocessing task.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
class MaxDownloadsReached(Exception):
    """Signals that the --max-downloads limit has been reached."""
class UnavailableVideoError(Exception):
    """Error raised for an unavailable format.

    This exception is thrown when a video is requested in a format that
    is not available for that video.
    """
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Sizes as passed by the raiser (presumably byte counts -- confirm
    # against the callers)
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Hand both values to Exception as well, so that args and repr()
        # are populated on every Python version
        Exception.__init__(self, downloaded, expected)
        self.downloaded = downloaded
        self.expected = expected

    def __str__(self):
        return 'Content too short: downloaded %s, expected %s' % (
            self.downloaded, self.expected)
class Trouble(Exception):
    """Helper exception for reporting trouble.

    Exceptions of this type are meant to be handled with
    FileDownloader.trouble
    """
342 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
343 """Handler for HTTP requests and responses.
345 This class, when installed with an OpenerDirector, automatically adds
346 the standard headers to every HTTP request and handles gzipped and
347 deflated responses from web servers. If compression is to be avoided in
348 a particular request, the original request in the program code only has
349 to include the HTTP header "Youtubedl-No-Compression", which will be
350 removed before making the real request.
352 Part of this code was copied from:
354 http://techknack.net/python-urllib2-handlers/
356 Andrew Rowls, the author of that code, agreed to release it to the
363 return zlib.decompress(data, -zlib.MAX_WBITS)
365 return zlib.decompress(data)
368 def addinfourl_wrapper(stream, headers, url, code):
369 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
370 return compat_urllib_request.addinfourl(stream, headers, url, code)
371 ret = compat_urllib_request.addinfourl(stream, headers, url)
375 def http_request(self, req):
376 for h in std_headers:
379 req.add_header(h, std_headers[h])
380 if 'Youtubedl-no-compression' in req.headers:
381 if 'Accept-encoding' in req.headers:
382 del req.headers['Accept-encoding']
383 del req.headers['Youtubedl-no-compression']
386 def http_response(self, req, resp):
389 if resp.headers.get('Content-encoding', '') == 'gzip':
390 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
391 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
392 resp.msg = old_resp.msg
394 if resp.headers.get('Content-encoding', '') == 'deflate':
395 gz = StringIO.StringIO(self.deflate(resp.read()))
396 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
397 resp.msg = old_resp.msg