2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
60 def preferredencoding():
61 """Get preferred encoding.
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
67 pref = locale.getpreferredencoding()
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
81 # os.path.basename returns a bytes object, but NamedTemporaryFile
82 # will fail if the filename contains non ascii characters unless we
83 # use a unicode object
84 path_basename = lambda f: os.path.basename(fn).decode(encoding)
85 # the same for os.path.dirname
86 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
108 tf = tempfile.NamedTemporaryFile(**args)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
120 os.rename(tf.name, fn)
129 if sys.version_info >= (2, 7):
130 def find_xpath_attr(node, xpath, key, val):
131 """ Find the xpath xpath[@key=val] """
132 assert re.match(r'^[a-zA-Z-]+$', key)
133 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
134 expr = xpath + "[@%s='%s']" % (key, val)
135 return node.find(expr)
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
157 replaced.append(c[0])
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id*.

    Thin convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute() with the attribute name fixed to "id".
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
186 m = re.search(r'''(?xs)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 ''' % (re.escape(attribute), re.escape(value)), html)
198 res = m.group('content')
200 if res.startswith('"') or res.startswith("'"):
203 return unescapeHTML(res)
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
209 html = html.replace('\n', ' ')
210 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
211 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
213 html = re.sub('<.*?>', '', html)
214 # Replace html entities
215 html = unescapeHTML(html)
219 def sanitize_open(filename, open_mode):
220 """Try to open the given filename, and slightly tweak it if this fails.
222 Attempts to open the given filename. If this fails, it tries to change
223 the filename slightly, step by step, until it's either able to open it
224 or it fails and raises a final exception, like the standard open()
227 It returns the tuple (stream, definitive_file_name).
231 if sys.platform == 'win32':
233 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
234 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
235 stream = open(encodeFilename(filename), open_mode)
236 return (stream, filename)
237 except (IOError, OSError) as err:
238 if err.errno in (errno.EACCES,):
241 # In case of error, try to remove win32 forbidden chars
242 alt_filename = os.path.join(
243 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
244 for path_part in os.path.split(filename)
246 if alt_filename == filename:
249 # An exception here should be caught in the caller
250 stream = open(encodeFilename(filename), open_mode)
251 return (stream, alt_filename)
254 def timeconvert(timestr):
255 """Convert RFC 2822 defined time string into system timestamp"""
257 timetuple = email.utils.parsedate_tz(timestr)
258 if timetuple is not None:
259 timestamp = email.utils.mktime_tz(timetuple)
263 def sanitize_filename(s, restricted=False, is_id=False):
264 """Sanitizes a string so it could be used as part of a filename.
265 If restricted is set, use a stricter subset of allowed characters.
266 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
268 def replace_insane(char):
269 if char == '?' or ord(char) < 32 or ord(char) == 127:
272 return '' if restricted else '\''
274 return '_-' if restricted else ' -'
275 elif char in '\\/|*<>':
277 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
279 if restricted and ord(char) > 127:
283 result = ''.join(map(replace_insane, s))
285 while '__' in result:
286 result = result.replace('__', '_')
287 result = result.strip('_')
288 # Common case of "Foreign band name - English song title"
289 if restricted and result.startswith('-_'):
296 def orderedSet(iterable):
297 """ Remove all duplicates from the input iterable """
305 def _htmlentity_transform(entity):
306 """Transforms an HTML entity to a character."""
307 # Known non-numeric HTML entity
308 if entity in compat_html_entities.name2codepoint:
309 return compat_chr(compat_html_entities.name2codepoint[entity])
311 mobj = re.match(r'#(x?[0-9]+)', entity)
313 numstr = mobj.group(1)
314 if numstr.startswith('x'):
316 numstr = '0%s' % numstr
319 return compat_chr(int(numstr, base))
321 # Unknown entity in name, return its literal representation
322 return ('&%s;' % entity)
328 assert type(s) == compat_str
331 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
334 def encodeFilename(s, for_subprocess=False):
336 @param s The name of the file
339 assert type(s) == compat_str
341 # Python 3 has a Unicode API
342 if sys.version_info >= (3, 0):
345 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
346 # Pass '' directly to use Unicode APIs on Windows 2000 and up
347 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
348 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
349 if not for_subprocess:
352 # For subprocess calls, encode with locale encoding
353 # Refer to http://stackoverflow.com/a/9951851/35070
354 encoding = preferredencoding()
356 encoding = sys.getfilesystemencoding()
359 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a command-line argument for use in a subprocess call.

    Text strings are handed to encodeFilename() with for_subprocess=True.
    Anything else is treated as a legacy byte string and decoded as ASCII
    first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings.
    # TODO: once all post processors are fixed, fail here instead:
    #   'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    decoded = s.decode('ascii')
    return encodeFilename(decoded, True)
371 def decodeOption(optval):
374 if isinstance(optval, bytes):
375 optval = optval.decode(preferredencoding())
377 assert isinstance(optval, compat_str)
381 def formatSeconds(secs):
383 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
385 return '%d:%02d' % (secs // 60, secs % 60)
390 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
391 if sys.version_info < (3, 2):
394 class HTTPSConnectionV3(httplib.HTTPSConnection):
395 def __init__(self, *args, **kwargs):
396 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
399 sock = socket.create_connection((self.host, self.port), self.timeout)
400 if getattr(self, '_tunnel_host', False):
404 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
406 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
408 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
409 def https_open(self, req):
410 return self.do_open(HTTPSConnectionV3, req)
411 return HTTPSHandlerV3(**kwargs)
412 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
413 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
414 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
415 if opts_no_check_certificate:
416 context.verify_mode = ssl.CERT_NONE
417 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
419 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
420 context.verify_mode = (ssl.CERT_NONE
421 if opts_no_check_certificate
422 else ssl.CERT_REQUIRED)
423 context.set_default_verify_paths()
425 context.load_default_certs()
426 except AttributeError:
428 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
431 class ExtractorError(Exception):
432 """Error during info extraction."""
434 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
435 """ tb, if given, is the original traceback (so that it can be printed out).
436 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
439 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
441 if video_id is not None:
442 msg = video_id + ': ' + msg
444 msg += ' (caused by %r)' % cause
446 if ytdl_is_updateable():
447 update_cmd = 'type youtube-dl -U to update'
449 update_cmd = 'see https://yt-dl.org/update on how to update'
450 msg += '; please report this issue on https://yt-dl.org/bug .'
451 msg += ' Make sure you are using the latest version; %s.' % update_cmd
452 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
453 super(ExtractorError, self).__init__(msg)
456 self.exc_info = sys.exc_info() # preserve original exception
458 self.video_id = video_id
460 def format_traceback(self):
461 if self.traceback is None:
463 return ''.join(traceback.format_tb(self.traceback))
466 class RegexNotFoundError(ExtractorError):
467 """Error when a regex didn't match"""
471 class DownloadError(Exception):
472 """Download Error exception.
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
479 def __init__(self, msg, exc_info=None):
480 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
481 super(DownloadError, self).__init__(msg)
482 self.exc_info = exc_info
485 class SameFileError(Exception):
486 """Same File exception.
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
494 class PostProcessingError(Exception):
495 """Post Processing exception.
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
501 def __init__(self, msg):
505 class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
510 class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
519 class ContentTooShortError(Exception):
520 """Content Too Short exception.
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
530 def __init__(self, downloaded, expected):
531 self.downloaded = downloaded
532 self.expected = expected
535 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
536 """Handler for HTTP requests and responses.
538 This class, when installed with an OpenerDirector, automatically adds
539 the standard headers to every HTTP request and handles gzipped and
540 deflated responses from web servers. If compression is to be avoided in
541 a particular request, the original request in the program code only has
542 to include the HTTP header "Youtubedl-No-Compression", which will be
543 removed before making the real request.
545 Part of this code was copied from:
547 http://techknack.net/python-urllib2-handlers/
549 Andrew Rowls, the author of that code, agreed to release it to the
556 return zlib.decompress(data, -zlib.MAX_WBITS)
558 return zlib.decompress(data)
561 def addinfourl_wrapper(stream, headers, url, code):
562 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
563 return compat_urllib_request.addinfourl(stream, headers, url, code)
564 ret = compat_urllib_request.addinfourl(stream, headers, url)
568 def http_request(self, req):
569 for h, v in std_headers.items():
570 if h not in req.headers:
572 if 'Youtubedl-no-compression' in req.headers:
573 if 'Accept-encoding' in req.headers:
574 del req.headers['Accept-encoding']
575 del req.headers['Youtubedl-no-compression']
576 if 'Youtubedl-user-agent' in req.headers:
577 if 'User-agent' in req.headers:
578 del req.headers['User-agent']
579 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
580 del req.headers['Youtubedl-user-agent']
582 if sys.version_info < (2, 7) and '#' in req.get_full_url():
583 # Python 2.6 is brain-dead when it comes to fragments
584 req._Request__original = req._Request__original.partition('#')[0]
585 req._Request__r_type = req._Request__r_type.partition('#')[0]
589 def http_response(self, req, resp):
592 if resp.headers.get('Content-encoding', '') == 'gzip':
593 content = resp.read()
594 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
596 uncompressed = io.BytesIO(gz.read())
597 except IOError as original_ioerror:
598 # There may be junk at the end of the file
599 # See http://stackoverflow.com/q/4928560/35070 for details
600 for i in range(1, 1024):
602 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
603 uncompressed = io.BytesIO(gz.read())
608 raise original_ioerror
609 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
610 resp.msg = old_resp.msg
612 if resp.headers.get('Content-encoding', '') == 'deflate':
613 gz = io.BytesIO(self.deflate(resp.read()))
614 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
615 resp.msg = old_resp.msg
618 https_request = http_request
619 https_response = http_response
622 def parse_iso8601(date_str, delimiter='T'):
623 """ Return a UNIX timestamp from the given date """
629 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
632 timezone = datetime.timedelta()
634 date_str = date_str[:-len(m.group(0))]
635 if not m.group('sign'):
636 timezone = datetime.timedelta()
638 sign = 1 if m.group('sign') == '+' else -1
639 timezone = datetime.timedelta(
640 hours=sign * int(m.group('hours')),
641 minutes=sign * int(m.group('minutes')))
642 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
643 dt = datetime.datetime.strptime(date_str, date_format) - timezone
644 return calendar.timegm(dt.timetuple())
647 def unified_strdate(date_str):
648 """Return a string with the date in the format YYYYMMDD"""
655 date_str = date_str.replace(',', ' ')
656 # %z (UTC offset) is only supported in python>=3.2
657 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
658 format_expressions = [
663 '%b %dst %Y %I:%M%p',
664 '%b %dnd %Y %I:%M%p',
665 '%b %dth %Y %I:%M%p',
674 '%Y-%m-%d %H:%M:%S.%f',
677 '%Y-%m-%dT%H:%M:%SZ',
678 '%Y-%m-%dT%H:%M:%S.%fZ',
679 '%Y-%m-%dT%H:%M:%S.%f0Z',
681 '%Y-%m-%dT%H:%M:%S.%f',
684 for expression in format_expressions:
686 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
689 if upload_date is None:
690 timetuple = email.utils.parsedate_tz(date_str)
692 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
696 def determine_ext(url, default_ext='unknown_video'):
699 guess = url.partition('?')[0].rpartition('.')[2]
700 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name that belongs to *filename*.

    The extension of *filename* (if any) is replaced by
    ``.<sub_lang>.<sub_format>``.

    os.path.splitext is used instead of a plain ``rsplit('.')`` so that only
    a real extension on the last path component is removed; a dot inside a
    directory name (e.g. ``my.dir/video``) no longer truncates the path.
    """
    stem = os.path.splitext(filename)[0]
    return stem + '.' + sub_lang + '.' + sub_format
710 def date_from_str(date_str):
712 Return a datetime object from a string in the format YYYYMMDD or
713 (now|today)[+-][0-9](day|week|month|year)(s)?"""
714 today = datetime.date.today()
715 if date_str in ('now', 'today'):
717 if date_str == 'yesterday':
718 return today - datetime.timedelta(days=1)
719 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
720 if match is not None:
721 sign = match.group('sign')
722 time = int(match.group('time'))
725 unit = match.group('unit')
726 # A bad approximation?
734 delta = datetime.timedelta(**{unit: time})
736 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
739 def hyphenate_date(date_str):
741 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
742 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
743 if match is not None:
744 return '-'.join(match.groups())
749 class DateRange(object):
750 """Represents a time interval between two dates"""
752 def __init__(self, start=None, end=None):
753 """start and end must be strings in the format accepted by date"""
754 if start is not None:
755 self.start = date_from_str(start)
757 self.start = datetime.datetime.min.date()
759 self.end = date_from_str(end)
761 self.end = datetime.datetime.max.date()
762 if self.start > self.end:
763 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
767 """Returns a range that only contains the given day"""
770 def __contains__(self, date):
771 """Check if the date is in the range"""
772 if not isinstance(date, datetime.date):
773 date = date_from_str(date)
774 return self.start <= date <= self.end
777 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
781 """ Returns the platform name as a compat_str """
782 res = platform.platform()
783 if isinstance(res, bytes):
784 res = res.decode(preferredencoding())
786 assert isinstance(res, compat_str)
790 def _windows_write_string(s, out):
791 """ Returns True if the string was written using special methods,
792 False if it has yet to be written out."""
793 # Adapted from http://stackoverflow.com/a/3259271/35070
796 import ctypes.wintypes
804 fileno = out.fileno()
805 except AttributeError:
806 # If the output stream doesn't have a fileno, it's virtual
808 if fileno not in WIN_OUTPUT_IDS:
811 GetStdHandle = ctypes.WINFUNCTYPE(
812 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
813 ("GetStdHandle", ctypes.windll.kernel32))
814 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
816 WriteConsoleW = ctypes.WINFUNCTYPE(
817 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
818 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
819 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
820 written = ctypes.wintypes.DWORD(0)
822 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
823 FILE_TYPE_CHAR = 0x0002
824 FILE_TYPE_REMOTE = 0x8000
825 GetConsoleMode = ctypes.WINFUNCTYPE(
826 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
827 ctypes.POINTER(ctypes.wintypes.DWORD))(
828 ("GetConsoleMode", ctypes.windll.kernel32))
829 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
831 def not_a_console(handle):
832 if handle == INVALID_HANDLE_VALUE or handle is None:
834 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
835 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
840 def next_nonbmp_pos(s):
842 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
843 except StopIteration:
847 count = min(next_nonbmp_pos(s), 1024)
850 h, s, count if count else 2, ctypes.byref(written), None)
852 raise OSError('Failed to write string')
853 if not count: # We just wrote a non-BMP character
854 assert written.value == 2
857 assert written.value > 0
858 s = s[written.value:]
862 def write_string(s, out=None, encoding=None):
865 assert type(s) == compat_str
867 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
868 if _windows_write_string(s, out):
871 if ('b' in getattr(out, 'mode', '') or
872 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
873 byt = s.encode(encoding or preferredencoding(), 'ignore')
875 elif hasattr(out, 'buffer'):
876 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
877 byt = s.encode(enc, 'ignore')
878 out.buffer.write(byt)
884 def bytes_to_intlist(bs):
887 if isinstance(bs[0], int): # Python 3
890 return [ord(c) for c in bs]
893 def intlist_to_bytes(xs):
896 return struct_pack('%dB' % len(xs), *xs)
899 # Cross-platform file locking
900 if sys.platform == 'win32':
901 import ctypes.wintypes
904 class OVERLAPPED(ctypes.Structure):
906 ('Internal', ctypes.wintypes.LPVOID),
907 ('InternalHigh', ctypes.wintypes.LPVOID),
908 ('Offset', ctypes.wintypes.DWORD),
909 ('OffsetHigh', ctypes.wintypes.DWORD),
910 ('hEvent', ctypes.wintypes.HANDLE),
913 kernel32 = ctypes.windll.kernel32
914 LockFileEx = kernel32.LockFileEx
915 LockFileEx.argtypes = [
916 ctypes.wintypes.HANDLE, # hFile
917 ctypes.wintypes.DWORD, # dwFlags
918 ctypes.wintypes.DWORD, # dwReserved
919 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
920 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
921 ctypes.POINTER(OVERLAPPED) # Overlapped
923 LockFileEx.restype = ctypes.wintypes.BOOL
924 UnlockFileEx = kernel32.UnlockFileEx
925 UnlockFileEx.argtypes = [
926 ctypes.wintypes.HANDLE, # hFile
927 ctypes.wintypes.DWORD, # dwReserved
928 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
929 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
930 ctypes.POINTER(OVERLAPPED) # Overlapped
932 UnlockFileEx.restype = ctypes.wintypes.BOOL
933 whole_low = 0xffffffff
934 whole_high = 0x7fffffff
936 def _lock_file(f, exclusive):
937 overlapped = OVERLAPPED()
938 overlapped.Offset = 0
939 overlapped.OffsetHigh = 0
940 overlapped.hEvent = 0
941 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
942 handle = msvcrt.get_osfhandle(f.fileno())
943 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
944 whole_low, whole_high, f._lock_file_overlapped_p):
945 raise OSError('Locking file failed: %r' % ctypes.FormatError())
948 assert f._lock_file_overlapped_p
949 handle = msvcrt.get_osfhandle(f.fileno())
950 if not UnlockFileEx(handle, 0,
951 whole_low, whole_high, f._lock_file_overlapped_p):
952 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
957 def _lock_file(f, exclusive):
958 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
961 fcntl.flock(f, fcntl.LOCK_UN)
964 class locked_file(object):
965 def __init__(self, filename, mode, encoding=None):
966 assert mode in ['r', 'a', 'w']
967 self.f = io.open(filename, mode, encoding=encoding)
971 exclusive = self.mode != 'r'
973 _lock_file(self.f, exclusive)
979 def __exit__(self, etype, value, traceback):
988 def write(self, *args):
989 return self.f.write(*args)
991 def read(self, *args):
992 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8'.

    sys.getfilesystemencoding() may report None; in that case 'utf-8' is
    returned so callers always receive a usable codec name.
    """
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
1000 def shell_quote(args):
1002 encoding = get_filesystem_encoding()
1004 if isinstance(a, bytes):
1005 # We may get a filename encoded with 'encodeFilename'
1006 a = a.decode(encoding)
1007 quoted_args.append(pipes.quote(a))
1008 return ' '.join(quoted_args)
1011 def takewhile_inclusive(pred, seq):
1012 """ Like itertools.takewhile, but include the latest evaluated element
1013 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach extra, internal-use data to *url*.

    *data* is serialized with json.dumps, url-encoded under the
    ``__youtubedl_smuggle`` key and appended to the URL after a ``#``.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
1028 def unsmuggle_url(smug_url, default=None):
1029 if '#__youtubedl_smuggle' not in smug_url:
1030 return smug_url, default
1031 url, _, sdata = smug_url.rpartition('#')
1032 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1033 data = json.loads(jsond)
1037 def format_bytes(bytes):
1040 if type(bytes) is str:
1041 bytes = float(bytes)
1045 exponent = int(math.log(bytes, 1024.0))
1046 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1047 converted = float(bytes) / float(1024 ** exponent)
1048 return '%.2f%s' % (converted, suffix)
1051 def parse_filesize(s):
1055 # The lower-case forms are of course incorrect and unofficial,
1056 # but we support those too
1094 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1096 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1100 num_str = m.group('num').replace(',', '.')
1101 mult = _UNIT_TABLE[m.group('unit')]
1102 return int(float(num_str) * mult)
1105 def get_term_width():
1106 columns = compat_getenv('COLUMNS', None)
1111 sp = subprocess.Popen(
1113 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1114 out, err = sp.communicate()
1115 return int(out.split()[1])
1121 def month_by_name(name):
1122 """ Return the number of a month by (locale-independently) English name """
1125 'January', 'February', 'March', 'April', 'May', 'June',
1126 'July', 'August', 'September', 'October', 'November', 'December']
1128 return ENGLISH_NAMES.index(name) + 1
1133 def fix_xml_ampersands(xml_str):
1134 """Replace all the '&' by '&amp;' in XML"""
1136 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1141 def setproctitle(title):
1142 assert isinstance(title, compat_str)
1144 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1147 title_bytes = title.encode('utf-8')
1148 buf = ctypes.create_string_buffer(len(title_bytes))
1149 buf.value = title_bytes
1151 libc.prctl(15, buf, 0, 0, 0)
1152 except AttributeError:
1153 return # Strange libc, just skip this
1156 def remove_start(s, start):
1157 if s.startswith(start):
1158 return s[len(start):]
1162 def remove_end(s, end):
1164 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated segment of the path of *url*.

    Leading and trailing slashes of the path are stripped before splitting,
    so ``http://host/a/b/`` yields ``b``.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
1173 class HEADRequest(compat_urllib_request.Request):
1174 def get_method(self):
1178 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1181 v = getattr(v, get_attr, None)
1184 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to a text string, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1191 def str_to_int(int_str):
1192 """ A more relaxed version of int_or_none """
1195 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a scaled float, or return *default* when *v* is None.

    The result is ``float(v) * invscale / scale``.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
1203 def parse_duration(s):
1212 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1213 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1216 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1217 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1219 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1224 if m.group('only_mins'):
1225 return float_or_none(m.group('only_mins'), invscale=60)
1226 if m.group('only_hours'):
1227 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1229 res += int(m.group('secs'))
1231 res += int(m.group('mins')) * 60
1232 if m.group('hours'):
1233 res += int(m.group('hours')) * 60 * 60
1235 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    ``prepend_extension('abc.ext', 'temp')`` gives ``'abc.temp.ext'``; a name
    without an extension simply gets ``.ext`` appended.
    """
    stem, suffix = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, suffix)
1244 def check_executable(exe, args=[]):
1245 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1246 args can be a list of arguments for a short output (like -version) """
1248 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1254 def get_exe_version(exe, args=['--version'],
1255 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1256 unrecognized='present'):
1257 """ Returns the version of the specified executable,
1258 or False if the executable is not present """
1260 out, err = subprocess.Popen(
1262 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1265 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1266 m = re.search(version_re, firstline)
1273 class PagedList(object):
1275 # This is only useful for tests
1276 return len(self.getslice())
1279 class OnDemandPagedList(PagedList):
1280 def __init__(self, pagefunc, pagesize):
1281 self._pagefunc = pagefunc
1282 self._pagesize = pagesize
1284 def getslice(self, start=0, end=None):
1286 for pagenum in itertools.count(start // self._pagesize):
1287 firstid = pagenum * self._pagesize
1288 nextfirstid = pagenum * self._pagesize + self._pagesize
1289 if start >= nextfirstid:
1292 page_results = list(self._pagefunc(pagenum))
1295 start % self._pagesize
1296 if firstid <= start < nextfirstid
1300 ((end - 1) % self._pagesize) + 1
1301 if (end is not None and firstid <= end <= nextfirstid)
1304 if startv != 0 or endv is not None:
1305 page_results = page_results[startv:endv]
1306 res.extend(page_results)
1308 # A little optimization - if current page is not "full", ie. does
1309 # not contain page_size videos then we can assume that this page
1310 # is the last one - there are no more ids on further pages -
1311 # i.e. no need to query again.
1312 if len(page_results) + startv < self._pagesize:
1315 # If we got the whole page, but the next page is not interesting,
1316 # break out early as well
1317 if end == nextfirstid:
1322 class InAdvancePagedList(PagedList):
1323 def __init__(self, pagefunc, pagecount, pagesize):
1324 self._pagefunc = pagefunc
1325 self._pagecount = pagecount
1326 self._pagesize = pagesize
1328 def getslice(self, start=0, end=None):
1330 start_page = start // self._pagesize
1332 self._pagecount if end is None else (end // self._pagesize + 1))
1333 skip_elems = start - start_page * self._pagesize
1334 only_more = None if end is None else end - start
1335 for pagenum in range(start_page, end_page):
1336 page = list(self._pagefunc(pagenum))
1338 page = page[skip_elems:]
1340 if only_more is not None:
1341 if len(page) < only_more:
1342 only_more -= len(page)
1344 page = page[:only_more]
1351 def uppercase_escape(s):
1352 unicode_escape = codecs.getdecoder('unicode_escape')
1354 r'\\U[0-9a-fA-F]{8}',
1355 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    Characters listed in the safe set passed to quote() are left untouched.
    """
    running_python2 = sys.version_info < (3, 0)
    # On Python 2, text strings are encoded to UTF-8 before quoting.
    if running_python2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1366 def escape_url(url):
1367 """Escape URL as suggested by RFC 3986"""
1368 url_parsed = compat_urllib_parse_urlparse(url)
1369 return url_parsed._replace(
1370 path=escape_rfc3986(url_parsed.path),
1371 params=escape_rfc3986(url_parsed.params),
1372 query=escape_rfc3986(url_parsed.query),
1373 fragment=escape_rfc3986(url_parsed.fragment)
1377 struct.pack('!I', 0)
1379 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1380 def struct_pack(spec, *args):
1381 if isinstance(spec, compat_str):
1382 spec = spec.encode('ascii')
1383 return struct.pack(spec, *args)
1385 def struct_unpack(spec, *args):
1386 if isinstance(spec, compat_str):
1387 spec = spec.encode('ascii')
1388 return struct.unpack(spec, *args)
1390 struct_pack = struct.pack
1391 struct_unpack = struct.unpack
1394 def read_batch_urls(batch_fd):
1396 if not isinstance(url, compat_str):
1397 url = url.decode('utf-8', 'replace')
1398 BOM_UTF8 = '\xef\xbb\xbf'
1399 if url.startswith(BOM_UTF8):
1400 url = url[len(BOM_UTF8):]
1402 if url.startswith(('#', ';', ']')):
1406 with contextlib.closing(batch_fd) as fd:
1407 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """Url-encode the given data and return it as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode().
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1415 etree_iter = xml.etree.ElementTree.Element.iter
1416 except AttributeError: # Python <=2.6
1417 etree_iter = lambda n: n.findall('.//*')
1421 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1422 def doctype(self, name, pubid, system):
1423 pass # Ignore doctypes
1425 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1426 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1427 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1428 # Fix up XML parser in Python 2.x
1429 if sys.version_info < (3, 0):
1430 for n in etree_iter(tree):
1431 if n.text is not None:
1432 if not isinstance(n.text, compat_str):
1433 n.text = n.text.decode('utf-8')
1446 def parse_age_limit(s):
1449 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1450 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1453 def strip_jsonp(code):
1455 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1458 def js_to_json(code):
1461 if v in ('true', 'false', 'null'):
1463 if v.startswith('"'):
1465 if v.startswith("'"):
1467 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1474 res = re.sub(r'''(?x)
1475 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1476 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1477 [a-zA-Z_][a-zA-Z_0-9]*
1479 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1483 def qualities(quality_ids):
1484 """ Get a numeric quality value out of a list of possible values """
1487 return quality_ids.index(qid)
1493 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1496 def limit_length(s, length):
1497 """ Add ellipses to overly long strings """
1502 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a version string on '-' and '.' into a tuple of ints.

    Raises ValueError if any component is not a valid integer.
    """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
1510 def is_outdated_version(version, limit, assume_new=True):
1512 return not assume_new
1514 return version_tuple(version) < version_tuple(limit)
1516 return not assume_new
def ytdl_is_updateable():
    """Return whether youtube-dl can be updated with -U.

    True when this module was loaded through a zipimporter or when the
    interpreter has a ``sys.frozen`` attribute.
    """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
def args_to_str(args):
    """Return a short, shell-quoted string representation of a command.

    Each argument is passed through shlex_quote() and the results are joined
    with single spaces.
    """
    quoted = map(shlex_quote, args)
    return ' '.join(quoted)