2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
42 compat_urllib_parse_urlparse,
43 compat_urllib_request,
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))  # used for isinstance() checks against compiled patterns

# Default headers attached to every outgoing HTTP request.
# NOTE(review): the ``std_headers = {`` opener is not visible in this excerpt.
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate',
'Accept-Language': 'en-us,en;q=0.5',


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    pref = locale.getpreferredencoding()
    # NOTE(review): validation/fallback of ``pref`` and the return statement
    # are not visible in this excerpt.


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """
    # Temp-file arguments: create the temporary next to the target so the
    # final os.rename() stays on one filesystem (atomic on POSIX).
    'prefix': os.path.basename(fn) + '.',
    'dir': os.path.dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
    # Atomically replace the destination with the fully-written temp file.
    os.rename(tf.name, fn)
# find_xpath_attr(node, xpath, key, val): locate the first element matching
# *xpath* whose attribute *key* equals *val*. Two implementations: 2.7+ can use
# an XPath attribute predicate; 2.6 must scan manually.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Restrict key/val so they can be embedded in the XPath literal safely.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
    def find_xpath_attr(node, xpath, key, val):
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')
        # Manual scan: 2.6's find() has no attribute-predicate support.
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'ns:tag' steps of *path* into '{uri}tag' using *ns_map*."""
    components = [c.split(':') for c in path.split('/')]
    # Steps without a namespace prefix are kept unchanged.
    replaced.append(c[0])
    replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_text(node, xpath, name=None, fatal=False):
    # Return the text of the element at *xpath*; when missing and *fatal*,
    # raise with a human-readable name.
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
# Backported fix of the stdlib start-tag regex (see trailing comment).
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix


class BaseHTMLParser(compat_html_parser.HTMLParser):
    # NOTE(review): the enclosing method header for this __init__ call is not
    # visible in this excerpt.
    compat_html_parser.HTMLParser.__init__(self)

    def loads(self, html):
        # Feed *html* through the parser; body elided in this excerpt.


class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # The attribute name/value pair that selects the wanted tag.
        self.attribute = attribute
        self.watch_startpos = False
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Tolerate up to 10 parse errors before/unless the target was found,
        # resuming after the offending line each time.
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:])  # skip one line
        self.error_count += 1

    def handle_starttag(self, tag, attrs):
        self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            # Remember to record the position right after this opening tag.
            self.watch_startpos = True
        if not tag in self.depth: self.depth[tag] = 0

    def handle_endtag(self, tag):
        if tag in self.depth: self.depth[tag] -= 1
        # Depth back to zero for the matched tag => its closing tag position.
        if self.depth[self.result[0]] == 0:
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any token following the interesting start tag pins down its position.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        if self.result is None:
        if len(self.result) != 3:
        # Cut the recorded (line, column) span out of the original document.
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        # NOTE(review): these two truncations belong to different branches of a
        # single/multi-line distinction not visible in this excerpt.
        lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    # Older parsers choke on "</scr'+'ipt>" inside inline scripts; skip over it.
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Return the content of the tag carrying the given id attribute in *html*."""
    # An id lookup is just a special case of the generic attribute lookup.
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    # Parse errors past the matched tag are ignored; we already have the span.
    except compat_html_parser.HTMLParseError:
    return parser.get_result()


class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Capture the content= attribute of the first matching <meta> tag.
        if attrs.get('name') == self.name:
            self.result = attrs.get('content')

    def get_result(self):


def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    parser = MetaParser(name)
    except compat_html_parser.HTMLParseError:
    return parser.get_result()


def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newlines are collapsed; <br> and </p><p> boundaries become real newlines.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip all remaining tags (non-greedy, so nested text survives).
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()

    It returns the tuple (stream, definitive_file_name).
    """
    # '-' means stdout; on Windows force binary mode so bytes pass unmangled.
    if sys.platform == 'win32':
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
    return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
    # Permission errors are not fixable by renaming; re-raise in that case.
    if err.errno in (errno.EACCES,):
    # In case of error, try to remove win32 forbidden chars
    alt_filename = os.path.join(
    re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
    for path_part in os.path.split(filename)
    if alt_filename == filename:
    # An exception here should be caught in the caller
    # NOTE(review): this opens 'filename' but returns 'alt_filename' —
    # looks inconsistent; should this open alt_filename? Verify upstream.
    stream = open(encodeFilename(filename), open_mode)
    return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
    timestamp = email.utils.mktime_tz(timetuple)


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Map a single character to its filename-safe replacement.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
        return '' if restricted else '\''
        return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:

    result = u''.join(map(replace_insane, s))
    # Collapse runs of underscores and trim them from the ends.
    while '__' in result:
    result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference: decimal (#160) or hexadecimal (#x3C).
    mobj = re.match(r'#(x?[0-9]+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith(u'x'):
        # Prefix '0' so int(..., 16) accepts the 'x...' form as '0x...'.
        numstr = u'0%s' % numstr
    return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)


# NOTE(review): the def line of unescapeHTML is not visible in this excerpt;
# the following lines are its body (replace every &name; via the transform).
assert type(s) == compat_str
    r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    encoding = sys.getfilesystemencoding()
    return s.encode(encoding, 'ignore')


def encodeArgument(s):
    # Like encodeFilename, but always uses the subprocess (locale) encoding.
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeOption(optval):
    # Normalize a command-line option value to unicode.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)


def formatSeconds(secs):
    # Render a duration as H:MM:SS or M:SS depending on magnitude.
    return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    return '%d:%02d' % (secs // 60, secs % 60)
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPSHandler appropriate for the running Python version."""
    if sys.version_info < (3, 2):
        # Pre-3.2: no SSLContext support; force the protocol per-connection.
        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                # Try TLSv1 first, fall back to the permissive SSLv23 setting.
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
        # Python 3.2/3.3: build an SSLContext by hand.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        context.load_default_certs()
        except AttributeError:
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always "expected" (not youtube-dl bugs).
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
            msg += u' (caused by %r)' % cause
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type  youtube-dl -U  to update.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # Pretty-print the stored traceback, or nothing if none was given.
        if self.traceback is None:
        return u''.join(traceback.format_tb(self.traceback))


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    def __init__(self, downloaded, expected):
        # Both values are byte counts (actual vs. Content-Length announced).
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    """
    # Raw deflate streams lack the zlib header; retry without it on failure.
    return zlib.decompress(data, -zlib.MAX_WBITS)
    return zlib.decompress(data)

    def addinfourl_wrapper(stream, headers, url, code):
        # Old addinfourl has no getcode(); set the code attribute manually.
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)

    def http_request(self, req):
        # Add default headers without overriding caller-provided ones.
        for h, v in std_headers.items():
            if h not in req.headers:
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # Transparently decompress gzip/deflate bodies.
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    # HTTPS goes through the same header/compression processing.
    https_request = http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """
    # Match a trailing 'Z' (UTC) or a numeric +HH:MM / -HHMM offset.
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
    timezone = datetime.timedelta()
    date_str = date_str[:-len(m.group(0))]
    if not m.group('sign'):
        timezone = datetime.timedelta()
        sign = 1 if m.group('sign') == '+' else -1
        timezone = datetime.timedelta(
            hours=sign * int(m.group('hours')),
            minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Subtract the offset so the result is UTC, then convert to epoch seconds.
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())


def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Candidate formats, tried in order until one parses.
    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 parsing.
        timetuple = email.utils.parsedate_tz(date_str)
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')


def determine_ext(url, default_ext=u'unknown_video'):
    # Guess the file extension from the URL path (ignoring the query string).
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle file name for *filename*: '<base>.<lang>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str == 'now'or date_str == 'today':
    # Relative form: e.g. 'now-2weeks'.
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        delta = datetime.timedelta(**{unit: time})
    # Absolute form.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())


class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the minimum/maximum representable dates.
        if start is not None:
            self.start = date_from_str(start)
            self.start = datetime.datetime.min.date()
            self.end = date_from_str(end)
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    """Returns a range that only contains the given day"""
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())


# NOTE(review): the def line of platform_name is not visible in this excerpt.
"""  Returns the platform name as a compat_str """
res = platform.platform()
if isinstance(res, bytes):
    res = res.decode(preferredencoding())

assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    if fileno not in WIN_OUTPUT_IDS:

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is not a real character-mode console.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

    # Write in chunks; non-BMP characters are written as surrogate pairs (2).
    count = min(next_nonbmp_pos(s), 1024)

        h, s, count if count else 2, ctypes.byref(written), None)
        raise OSError('Failed to write string')
    if not count:  # We just wrote a non-BMP character
        assert written.value == 2
    assert written.value > 0
    s = s[written.value:]


def write_string(s, out=None, encoding=None):
    """Write unicode *s* to *out*, working around Windows console quirks."""
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
def bytes_to_intlist(bs):
    # bytes iterate as ints on Python 3, as 1-char strings on Python 2.
    if isinstance(bs[0], int):  # Python 3
    return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if isinstance(chr(0), bytes):  # Python 2
        return ''.join([chr(x) for x in xs])


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes

    class OVERLAPPED(ctypes.Structure):
        # Layout of the Win32 OVERLAPPED structure used by Lock/UnlockFileEx.
        ('Internal', ctypes.wintypes.LPVOID),
        ('InternalHigh', ctypes.wintypes.LPVOID),
        ('Offset', ctypes.wintypes.DWORD),
        ('OffsetHigh', ctypes.wintypes.DWORD),
        ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: maximum byte range.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

    # POSIX: flock() does both shared and exclusive locking.
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)


class locked_file(object):
    """Context manager wrapping a file with a whole-file advisory lock."""
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Any write mode needs an exclusive lock; reads share.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        _unlock_file(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
def shell_quote(args):
    """Quote each argument for safe display as a shell command line."""
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return u' '.join(quoted_args)


def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Data travels as JSON inside the URL fragment.
    sdata = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return url + u'#' + sdata


def unsmuggle_url(smug_url, default=None):
    # Inverse of smuggle_url; plain URLs pass through with *default* data.
    if not '#__youtubedl_smuggle' in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    data = json.loads(jsond)


def format_bytes(bytes):
    """Format a byte count with binary suffixes (KiB, MiB, ...)."""
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)


def get_term_width():
    # Prefer the COLUMNS environment variable; fall back to querying stty.
    columns = compat_getenv('COLUMNS', None)
    sp = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = sp.communicate()
    return int(out.split()[1])


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    u'January', u'February', u'March', u'April', u'May', u'June',
    u'July', u'August', u'September', u'October', u'November', u'December']
    return ENGLISH_NAMES.index(name) + 1


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Negative lookahead keeps already-escaped entities untouched.
    r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',


def setproctitle(title):
    """Best-effort: set the process title via prctl(PR_SET_NAME) on Linux."""
    assert isinstance(title, compat_str)
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 == PR_SET_NAME
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    # Strip *start* from the beginning of *s* when present.
    if s.startswith(start):
        return s[len(start):]


def remove_end(s, end):
    # Strip *end* from the end of *s* when present.
    return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* ('' for URLs with no path)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of GET.
    def get_method(self):


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    # Optionally dereference an attribute first, then scale-convert to int.
    v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)


def str_or_none(v, default=None):
    # None-safe conversion to the compat string type.
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    # Drop thousands separators before conversion.
    int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to float, multiplied by invscale/scale; *default* when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    # Accept forms like '1:23:45', '3h 2m 1s', '90 secs', '12.5s'.
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
    res = int(m.group('secs'))
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    # Fractional seconds promote the result to float.
    res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: 'a/b.mp4' + 'temp' -> 'a/b.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (stem, ext, real_ext)
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # Running it is the only portable existence check; output is discarded.
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()


def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([0-9._-a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out, err = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    # Only the first output line is scanned for a version string.
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)


class PagedList(object):
    # Abstract base; subclasses implement getslice(start, end).
    # This is only useful for tests
    return len(self.getslice())
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily via *pagefunc* as they are needed."""
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute indices covered by this page: [firstid, nextfirstid).
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:

            page_results = list(self._pagefunc(pagenum))

            # Trim the page to the requested [start, end) window.
            start % self._pagesize
            if firstid <= start < nextfirstid
            ((end - 1) % self._pagesize) + 1
            if (end is not None and firstid <= end <= nextfirstid)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
class InAdvancePagedList(PagedList):
    """PagedList whose total page count is known up front."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        start_page = start // self._pagesize
        # Last page to fetch (exclusive), clamped to the known page count.
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            # Skip leading elements only on the first fetched page.
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                    page = page[:only_more]
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in *s*."""
    unicode_escape = codecs.getdecoder('unicode_escape')
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Characters listed as safe are left unescaped.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Escape each URL component separately, keeping the overall structure.
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)


# Probe whether struct accepts unicode format strings.
struct.pack(u'!I', 0)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)

    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Per-line cleanup for a batch file of URLs.
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # NOTE(review): this is the UTF-8 BOM *bytes* as characters — presumably
    # to handle mis-decoded input; verify against callers.
    BOM_UTF8 = u'\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    # Comment lines are skipped.
    if url.startswith(('#', ';', ']')):

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    # urlencode returns text; POST bodies must be bytes.
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')


# Element.iter() is missing before Python 2.7; emulate with findall.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
# NOTE(review): the enclosing ``def parse_xml(s):`` line is not visible in
# this excerpt; the following is its body.
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    def doctype(self, name, pubid, system):
        pass  # Ignore doctypes

parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# Fix up XML parser in Python 2.x
if sys.version_info < (3, 0):
    for n in etree_iter(tree):
        if n.text is not None:
            if not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')


def parse_age_limit(s):
    # Accept either a bare age like '16' / '16+' or a US rating symbol.
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper ``callback(...);`` and return the inner payload."""
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(wrapper, r'\1', code)
def js_to_json(code):
    """Convert a JavaScript object literal to (approximate) JSON."""
    # Keywords pass through unchanged; quoted strings get re-escaped.
    if v in ('true', 'false', 'null'):
    if v.startswith('"'):
    if v.startswith("'"):
        v = re.sub(r"\\\\|\\'|\"", lambda m: {
    # Match double-quoted strings, single-quoted strings, or bare identifiers.
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets (invalid in JSON).
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)


def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    return quality_ids.index(qid)


DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'


def limit_length(s, length):
    """ Add ellipses to overly long strings """
    return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string like '2014.08.05' into a list of ints."""
    parts = v.split('.')
    return list(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    # Compare dotted version strings; *assume_new* decides the answer when
    # the version is missing or unparsable.
    return not assume_new
    return version_tuple(version) < version_tuple(limit)
    # Unparsable version string: fall back to the assumption.
    return not assume_new