2 # -*- coding: utf-8 -*-
15 import urllib.request as compat_urllib_request
16 except ImportError: # Python 2
17 import urllib2 as compat_urllib_request
20 import urllib.error as compat_urllib_error
21 except ImportError: # Python 2
22 import urllib2 as compat_urllib_error
25 import urllib.parse as compat_urllib_parse
26 except ImportError: # Python 2
27 import urllib as compat_urllib_parse
30 import http.cookiejar as compat_cookiejar
31 except ImportError: # Python 2
32 import cookielib as compat_cookiejar
35 import html.entities as compat_html_entities
36 except ImportError: # Python 2
37 import htmlentitydefs as compat_html_entities
40 import html.parser as compat_html_parser
41 except ImportError: # Python 2
42 import HTMLParser as compat_html_parser
45 import http.client as compat_http_client
46 except ImportError: # Python 2
47 import httplib as compat_http_client
50 from urllib.parse import parse_qs as compat_parse_qs
51 except ImportError: # Python 2
52 from urlparse import parse_qs as compat_parse_qs
# On Python 2 the names `unicode` and `unichr` exist; on Python 3 they were
# folded into `str` and `chr`.  Expose version-independent aliases so the
# rest of the module never has to branch on the interpreter version.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str  # Python 3

try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr  # Python 3
# Default headers sent with every HTTP request, mimicking a normal desktop
# Firefox browser so servers do not treat us differently from a real client.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to UTF-8 when the locale reports an unusable encoding.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the encoding name: some platforms report names that
        # the codecs machinery cannot actually resolve.
        u'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref
if sys.version_info < (3, 0):
    def compat_print(s):
        """Print a unicode string on Python 2, encoded for the current
        terminal; unencodable characters become XML character references."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        """Print a text string on Python 3; bytes are deliberately rejected."""
        assert type(s) == type(u'')
        print(s)
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object (group 1 is the entity name,
    without the surrounding '&' and ';') and is intended to be used with
    the re.sub() function.  Unknown entities are returned literally.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#x41)
    mobj = re.match(u'(?u)#(x?\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr  # '0x41' so int() parses the hex prefix
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
# Backported bugfix for the stdlib HTMLParser: the stock locatestarttagend
# regex mis-parses unquoted attribute values, which would break IDParser
# on some real-world pages.  Replace it with the corrected pattern.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class IDParser(compat_html_parser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document through loads(); get_result() then returns the inner
    HTML of the first element whose 'id' attribute equals the requested
    id, or None when no such element was found.
    """
    def __init__(self, id):
        self.id = id                    # the id attribute value to look for
        self.result = None              # [tag, startpos, endpos] when found
        self.started = False            # inside the wanted element?
        self.depth = {}                 # open-tag counters while inside it
        self.html = None                # full document, kept for slicing
        self.watch_startpos = False     # next event marks the content start
        self.error_count = 0
        compat_html_parser.HTMLParser.__init__(self)

    def error(self, message):
        # Tolerate a few parse errors in the parts of the page we do not
        # care about: skip the offending line and resume parsing.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def loads(self, html):
        """Parse the complete document *html*."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matching close of the element we captured: record the end.
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event right after the opening tag fixes the content start position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be re-based
            # after the slice above removed the first result[1][1] chars.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    parser = IDParser(id)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # A parse error after the interesting element does not matter;
        # whatever the parser captured so far is still returned below.
        pass
    return parser.get_result()
def clean_html(html):
    """Clean an HTML snippet into a readable plain-text string."""
    # Newline vs <br />: real newlines are layout noise, <br> is the break
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            # u'-' means standard output; on Windows put stdout into
            # binary mode so \n is not translated to \r\n.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # In case of error, try to remove win32 forbidden chars
        filename = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', filename)

        # An exception here should be caught in the caller
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed as an RFC 2822 date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    """
    def replace_insane(char):
        # Control chars and '?' are never allowed in filenames.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    result = u''.join(map(replace_insane, s))
    # Collapse runs of underscores produced by consecutive replacements.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # Never return an empty filename.
    if not result:
        result = '_'
    return result
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving the
    order of first occurrence. """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def unescapeHTML(s):
    """Replace the HTML entities in *s* with the characters they stand for.

    @param s a unicode string
    """
    assert type(s) == type(u'')

    result = re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
    return result
def encodeFilename(s):
    """Encode a unicode filename for the current platform's filesystem APIs.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        return s
    else:
        return s.encode(sys.getfilesystemencoding(), 'ignore')
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    pass
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    pass
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
class Trouble(Exception):
    """Trouble helper exception

    This is an exception to be handled with
    FileDownloader.trouble
    """
    pass
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress a deflate body, whether raw or zlib-wrapped."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older Pythons lack the 4-argument constructor / getcode support.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Install the standard headers, replacing any the caller set with
        # the same names.
        for h in std_headers:
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, std_headers[h])
        # Our private marker header disables compression; strip it before
        # the request goes on the wire.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            gz = gzip.GzipFile(fileobj=io.BytesIO(resp.read()), mode='r')
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp