2 # -*- coding: utf-8 -*-
16 import cStringIO as StringIO
22 except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/trivialjson):
23 import trivialjson as json
26 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:5.0.1) Gecko/20100101 Firefox/5.0.1',
27 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
28 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
29 'Accept-Encoding': 'gzip, deflate',
30 'Accept-Language': 'en-us,en;q=0.5',
33 def preferredencoding():
34 """Get preferred encoding.
36 Returns the best encoding scheme for the system, based on
37 locale.getpreferredencoding() and some further tweaks.
39 def yield_preferredencoding():
41 pref = locale.getpreferredencoding()
47 return yield_preferredencoding().next()
50 def htmlentity_transform(matchobj):
51 """Transforms an HTML entity to a Unicode character.
53 This function receives a match object and is intended to be used with
54 the re.sub() function.
56 entity = matchobj.group(1)
58 # Known non-numeric HTML entity
59 if entity in htmlentitydefs.name2codepoint:
60 return unichr(htmlentitydefs.name2codepoint[entity])
63 mobj = re.match(ur'(?u)#(x?\d+)', entity)
65 numstr = mobj.group(1)
66 if numstr.startswith(u'x'):
68 numstr = u'0%s' % numstr
71 return unichr(long(numstr, base))
73 # Unknown entity in name, return its literal representation
74 return (u'&%s;' % entity)
77 class IDParser(HTMLParser.HTMLParser):
78 """Modified HTMLParser that isolates a tag with the specified id"""
79 def __init__(self, id):
85 self.watch_startpos = False
86 HTMLParser.HTMLParser.__init__(self)
88 def loads(self, html):
93 def handle_starttag(self, tag, attrs):
96 self.find_startpos(None)
97 if 'id' in attrs and attrs['id'] == self.id:
100 self.watch_startpos = True
102 if not tag in self.depth: self.depth[tag] = 0
105 def handle_endtag(self, tag):
107 if tag in self.depth: self.depth[tag] -= 1
108 if self.depth[self.result[0]] == 0:
110 self.result.append(self.getpos())
112 def find_startpos(self, x):
113 """Needed to put the start position of the result (self.result[1])
114 after the opening tag with the requested id"""
115 if self.watch_startpos:
116 self.watch_startpos = False
117 self.result.append(self.getpos())
118 handle_entityref = handle_charref = handle_data = handle_comment = \
119 handle_decl = handle_pi = unknown_decl = find_startpos
121 def get_result(self):
122 if self.result == None: return None
123 if len(self.result) != 3: return None
124 lines = self.html.split('\n')
125 lines = lines[self.result[1][0]-1:self.result[2][0]]
126 lines[0] = lines[0][self.result[1][1]:]
128 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
129 lines[-1] = lines[-1][:self.result[2][1]]
130 return '\n'.join(lines).strip()
132 def get_element_by_id(id, html):
133 """Return the content of the tag with the specified id in the passed HTML document"""
134 parser = IDParser(id)
137 except HTMLParser.HTMLParseError:
139 return parser.get_result()
142 def clean_html(html):
143 """Clean an HTML snippet into a readable string"""
145 html = html.replace('\n', ' ')
146 html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html)
148 html = re.sub('<.*?>', '', html)
149 # Replace html entities
150 html = unescapeHTML(html)
def sanitize_title(utitle):
    """Make a video title usable as a component of a filename.

    The title is first passed through unescapeHTML(); afterwards every
    occurrence of the platform path separator (os.sep) in the result is
    substituted with u'%'.
    """
    decoded_title = unescapeHTML(utitle)
    path_separator = unicode(os.sep)
    return decoded_title.replace(path_separator, u'%')
160 def sanitize_open(filename, open_mode):
161 """Try to open the given filename, and slightly tweak it if this fails.
163 Attempts to open the given filename. If this fails, it tries to change
164 the filename slightly, step by step, until it's either able to open it
165 or it fails and raises a final exception, like the standard open()
168 It returns the tuple (stream, definitive_file_name).
172 if sys.platform == 'win32':
174 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
175 return (sys.stdout, filename)
176 stream = open(encodeFilename(filename), open_mode)
177 return (stream, filename)
178 except (IOError, OSError), err:
179 # In case of error, try to remove win32 forbidden chars
180 filename = re.sub(ur'[/<>:"\|\?\*]', u'#', filename)
182 # An exception here should be caught in the caller
183 stream = open(encodeFilename(filename), open_mode)
184 return (stream, filename)
187 def timeconvert(timestr):
188 """Convert RFC 2822 defined time string into system timestamp"""
190 timetuple = email.utils.parsedate_tz(timestr)
191 if timetuple is not None:
192 timestamp = email.utils.mktime_tz(timetuple)
def simplify_title(title):
    """Collapse a title into word characters, hyphens and underscores.

    Every run of characters that are neither word characters (as defined
    under re.UNICODE) nor '-' is replaced by a single u'_'. Underscores at
    the start and end of the result are then stripped.

    @param title the title to simplify (presumably of type unicode, like
                 the u'' literals used here -- confirm against callers)
    @return      the simplified title
    """
    # The pattern is spelled with doubled backslashes instead of a ur''
    # literal: the ur prefix is a syntax error on Python 3, while this form
    # yields the identical pattern string wherever u'' literals are accepted.
    expr = re.compile(u'[^\\w\\d_\\-]+', flags=re.UNICODE)
    return expr.sub(u'_', title).strip(u'_')
199 def orderedSet(iterable):
200 """ Remove all duplicates from the input iterable """
209 @param s a string (of type unicode)
211 assert type(s) == type(u'')
213 result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
216 def encodeFilename(s):
218 @param s The name of the file (of type unicode)
221 assert type(s) == type(u'')
223 if sys.platform == 'win32' and sys.getwindowsversion().major >= 5:
224 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
225 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
226 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
229 return s.encode(sys.getfilesystemencoding(), 'ignore')
231 class DownloadError(Exception):
232 """Download Error exception.
234 This exception may be thrown by FileDownloader objects if they are not
235 configured to continue on errors. They will contain the appropriate
241 class SameFileError(Exception):
242 """Same File exception.
244 This exception will be thrown by FileDownloader objects if they detect
245 multiple files would have to be downloaded to the same file on disk.
250 class PostProcessingError(Exception):
251 """Post Processing exception.
253 This exception may be raised by PostProcessor's .run() method to
254 indicate an error in the postprocessing task.
class MaxDownloadsReached(Exception):
    """Max Downloads Reached exception.

    Signals that the --max-downloads limit has been reached.
    """
263 class UnavailableVideoError(Exception):
264 """Unavailable Format exception.
266 This exception will be thrown when a video is requested
267 in a format that is not available for that video.
272 class ContentTooShortError(Exception):
273 """Content Too Short exception.
275 This exception may be raised by FileDownloader objects when a file they
276 download is too small for what the server announced first, indicating
277 the connection was probably interrupted.
283 def __init__(self, downloaded, expected):
284 self.downloaded = downloaded
285 self.expected = expected
288 class YoutubeDLHandler(urllib2.HTTPHandler):
289 """Handler for HTTP requests and responses.
291 This class, when installed with an OpenerDirector, automatically adds
292 the standard headers to every HTTP request and handles gzipped and
293 deflated responses from web servers. If compression is to be avoided in
294 a particular request, the original request in the program code only has
295 to include the HTTP header "Youtubedl-No-Compression", which will be
296 removed before making the real request.
298 Part of this code was copied from:
300 http://techknack.net/python-urllib2-handlers/
302 Andrew Rowls, the author of that code, agreed to release it to the
309 return zlib.decompress(data, -zlib.MAX_WBITS)
311 return zlib.decompress(data)
314 def addinfourl_wrapper(stream, headers, url, code):
315 if hasattr(urllib2.addinfourl, 'getcode'):
316 return urllib2.addinfourl(stream, headers, url, code)
317 ret = urllib2.addinfourl(stream, headers, url)
321 def http_request(self, req):
322 for h in std_headers:
325 req.add_header(h, std_headers[h])
326 if 'Youtubedl-no-compression' in req.headers:
327 if 'Accept-encoding' in req.headers:
328 del req.headers['Accept-encoding']
329 del req.headers['Youtubedl-no-compression']
332 def http_response(self, req, resp):
335 if resp.headers.get('Content-encoding', '') == 'gzip':
336 gz = gzip.GzipFile(fileobj=StringIO.StringIO(resp.read()), mode='r')
337 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
338 resp.msg = old_resp.msg
340 if resp.headers.get('Content-encoding', '') == 'deflate':
341 gz = StringIO.StringIO(self.deflate(resp.read()))
342 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
343 resp.msg = old_resp.msg