2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    # NOTE(review): interior lines are elided in this excerpt; only the
    # locale lookup is visible here (the fallback tweaks are not shown).
    pref = locale.getpreferredencoding()
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """
    # NOTE(review): several interior lines are elided in this excerpt
    # (the else-branch, the args dict literal, json.dump and cleanup).
    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # NOTE(review): these lambdas ignore their argument `f` and close
        # over `fn` instead — works only because they are called with fn.
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    # (else-branch of the version check — plain os.path helpers)
    path_basename = os.path.basename
    path_dirname = os.path.dirname
    # Fragment of the NamedTemporaryFile kwargs dict (opening elided):
    'prefix': path_basename(fn) + '.',
    'dir': path_dirname(fn),
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
    tf = tempfile.NamedTemporaryFile(**args)
    if sys.platform == 'win32':
        # Need to remove existing file on Windows, else os.rename raises
        # WindowsError or FileExistsError.
    os.rename(tf.name, fn)
129 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Find the first element matching xpath[@key='val']."""
    # Restrict key/val to simple characters so the generated XPath
    # predicate cannot be broken by quotes or brackets.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
def find_xpath_attr(node, xpath, key, val):
    # Python 2.6 fallback: scan manually instead of using an XPath predicate.
    # NOTE(review): the return statements are on elided lines.
    # Here comes the crazy part: In 2.6, if the xpath is a unicode,
    # .//node does not match if a node is a direct child of . !
    if isinstance(xpath, unicode):
        xpath = xpath.encode('ascii')

    for f in node.findall(xpath):
        if f.attrib.get(key) == val:


# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    # Expand 'ns:tag' path steps into '{uri}tag' form using ns_map.
    # NOTE(review): the loop over components (defining c, ns, tag and
    # `replaced = []`) is elided in this excerpt.
    components = [c.split(':') for c in path.split('/')]
    replaced.append(c[0])
    replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_text(node, xpath, name=None, fatal=False):
    # Return the text content of the element at xpath.
    # NOTE(review): `n = node.find(xpath)` and the fatal-gating appear on
    # elided lines — confirm against the full file.
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    if n is None or n.text is None:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document."""
    # Thin convenience wrapper: an element id is just the "id" attribute.
    return get_element_by_attribute("id", id, html)
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    # NOTE(review): the regex body is partially elided (the tag-open,
    # named 'content' group and tag-close alternatives are missing here).
    m = re.search(r'''(?xs)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
    ''' % (re.escape(attribute), re.escape(value)), html)
    # (elided: `if not m: return None`)
    res = m.group('content')
    if res.startswith('"') or res.startswith("'"):
        # (elided: strip the surrounding quotes before unescaping)
    return unescapeHTML(res)


def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newlines are layout in HTML; <br> and </p><p> are real breaks.
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip remaining tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    # (elided: final strip/return)
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    # NOTE(review): the try: line and several guards are elided in this
    # excerpt; code kept as shown.
    if sys.platform == 'win32':
        # '-' means stdout; put it into binary mode on Windows.
        msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
    stream = open(encodeFilename(filename), open_mode)
    return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            # (elided: re-raise on access errors)
        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        if alt_filename == filename:
            # (elided: re-raise — nothing left to tweak)
            # An exception here should be caught in the caller
            stream = open(encodeFilename(filename), open_mode)
            return (stream, alt_filename)
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns the POSIX timestamp as an int/float, or None when the string
    cannot be parsed as an RFC 2822 date.
    """
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    # NOTE(review): many branch headers/returns of replace_insane and the
    # trailing cleanup are elided in this excerpt; code kept as shown.
    def replace_insane(char):
        # Control chars and '?' are never allowed in a filename.
        if char == '?' or ord(char) < 32 or ord(char) == 127:
        return '' if restricted else '\''
        return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
        if restricted and ord(char) > 127:
    result = ''.join(map(replace_insane, s))
    # Collapse runs of replacement underscores.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, preserving order """
    # O(n^2) membership test, but inputs here are small (format/URL lists).
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity (&#160; or &#x3f;).
    # NOTE(review): `x` is followed by [0-9]+ only — hex entities containing
    # letters (e.g. &#xFF;) will not match this pattern; confirm intended.
    mobj = re.match(r'#(x?[0-9]+)', entity)
    numstr = mobj.group(1)
    if numstr.startswith('x'):
        # (elided: base = 16; the '0x' prefix form for int())
        numstr = '0%s' % numstr
    # `base` is assigned on elided lines (16 for hex, 10 otherwise).
    return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)


# Fragment of unescapeHTML (its def line and guards are elided in this
# excerpt); it maps every &entity; through _htmlentity_transform.
    assert type(s) == compat_str
    r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    # NOTE(review): several branch returns are elided in this excerpt.
    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    # (elided else-branch: non-Windows platforms)
    encoding = sys.getfilesystemencoding()
    return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a subprocess argument the same way a filename would be
    encoded for subprocess use (locale encoding on Python 2)."""
    if isinstance(s, compat_str):
        arg = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        arg = s.decode('ascii')
    return encodeFilename(arg, True)
def decodeOption(optval):
    # Decode a command-line option value to text using the locale encoding.
    # NOTE(review): the None-guard and final return are on elided lines.
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS' text."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    # Build an HTTPS handler appropriate for the running Python version.
    # NOTE(review): several lines (try/except scaffolding, connect() def,
    # else-branches) are elided in this excerpt; code kept as shown.
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
        if opts_no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    # (create_default_context present but HTTPSHandler has no context=)

    if sys.version_info < (3, 2):
        # Pre-3.2: roll our own connection class to control the SSL version.
        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            # (connect() def line elided in this excerpt)
            sock = socket.create_connection((self.host, self.port), self.timeout)
            if getattr(self, '_tunnel_host', False):
                # (elided: set up the tunnel before wrapping)
            # Try TLSv1 first; the SSLv23 wrap below is the fallback path.
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
            self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)

    # Python 3.2/3.3: construct the SSLContext by hand.
    context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
    context.verify_mode = (ssl.CERT_NONE
                           if opts_no_check_certificate
                           else ssl.CERT_REQUIRED)
    context.set_default_verify_paths()
    return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # NOTE(review): guard lines (e.g. the expected/cause conditionals and
        # the if/else around update_cmd) are elided in this excerpt.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
        if video_id is not None:
            msg = video_id + ': ' + msg
        msg += ' (caused by %r)' % cause
        if ytdl_is_updateable():
            update_cmd = 'type youtube-dl -U to update'
            update_cmd = 'see https://yt-dl.org/update on how to update'
        msg += '; please report this issue on https://yt-dl.org/bug .'
        msg += ' Make sure you are using the latest version; %s.' % update_cmd
        msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.exc_info = sys.exc_info()  # preserve original exception
        self.video_id = video_id

    def format_traceback(self):
        # NOTE(review): the return for the None case is on an elided line.
        if self.traceback is None:
        return ''.join(traceback.format_tb(self.traceback))
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    # NOTE(review): the body (self.msg assignment) is on an elided line.
    def __init__(self, msg):


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counts are byte totals reported by the downloader.
        self.downloaded = downloaded
        self.expected = expected
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """
    # NOTE(review): decorators, def lines for deflate(), try/except
    # scaffolding and some guards are elided in this excerpt.

    # Fragment of deflate(data): raw-deflate first, zlib-wrapped fallback.
        return zlib.decompress(data, -zlib.MAX_WBITS)
        return zlib.decompress(data)

    # addinfourl grew getcode() in newer Pythons; emulate it otherwise.
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        # (elided: ret.code = code; return ret)

    def http_request(self, req):
        # Add any std header the caller did not set explicitly.
        for h, v in std_headers.items():
            if h not in req.headers:
        # Internal pseudo-headers are stripped before the real request.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

    def http_response(self, req, resp):
        # NOTE(review): `old_resp = resp` and try/break scaffolding are on
        # elided lines; code kept as shown.
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                    uncompressed = io.BytesIO(gz.read())
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg

    # HTTPS traffic gets the same treatment.
    https_request = http_request
    https_response = http_response
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """
    # NOTE(review): the None-guard, the re.search call wrapping this
    # pattern, and the if/else scaffolding are elided in this excerpt.
    r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
    timezone = datetime.timedelta()
    date_str = date_str[:-len(m.group(0))]
    if not m.group('sign'):
        # 'Z' suffix: already UTC.
        timezone = datetime.timedelta()
    sign = 1 if m.group('sign') == '+' else -1
    timezone = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    # NOTE(review): the None-guard, most of the format-expression list and
    # the try/except around strptime are elided in this excerpt.
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    format_expressions = [
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d %H:%M:%S.%f',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S.%f',
    # day_first toggles between day-first and month-first patterns.
    format_expressions.extend([
    format_expressions.extend([
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 parsing.
        timetuple = email.utils.parsedate_tz(date_str)
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL's path component.

    Returns default_ext when the URL is missing or the candidate after the
    last dot is not purely alphanumeric (i.e. not a plausible extension).
    """
    if url is None:
        return default_ext
    # Drop the query string, then take everything after the last dot.
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    else:
        return default_ext
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    # NOTE(review): several lines (the 'now'/'today' return, sign handling,
    # week/month/year unit conversion and the relative-date return) are
    # elided in this excerpt.
    today = datetime.date.today()
    if date_str in ('now', 'today'):
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        unit = match.group('unit')
        # A bad aproximation?
        delta = datetime.timedelta(**{unit: time})
    # Absolute date fallback.
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings not matching YYYYMMDD are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # NOTE(review): the else: lines are elided in this excerpt.
        if start is not None:
            self.start = date_from_str(start)
        # (else) open-ended start
        self.start = datetime.datetime.min.date()
        self.end = date_from_str(end)
        # (else) open-ended end
        self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    # (classmethod decorator/def elided) day(cls, day):
        """Returns a range that only contains the given day"""

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    # (def __str__ elided)
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


# Fragment of platform_name() — its def line is elided in this excerpt.
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # NOTE(review): try/except scaffolding, the WIN_OUTPUT_IDS mapping and
    # the write loop's control flow are partially elided in this excerpt.

    import ctypes.wintypes

    fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
    if fileno not in WIN_OUTPUT_IDS:

    # Resolve the real console handle for this stream.
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A redirected/piped handle must not go through WriteConsoleW.
        if handle == INVALID_HANDLE_VALUE or handle is None:
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    def next_nonbmp_pos(s):
        # WriteConsoleW counts UTF-16 units; stop before astral characters.
        return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:

    # Write loop (while s: is on an elided line).
    count = min(next_nonbmp_pos(s), 1024)
    h, s, count if count else 2, ctypes.byref(written), None)
    raise OSError('Failed to write string')
    if not count:  # We just wrote a non-BMP character
        assert written.value == 2
    assert written.value > 0
    s = s[written.value:]
def write_string(s, out=None, encoding=None):
    # Write text to `out` (default stderr; set on an elided line), handling
    # Windows consoles and byte streams.
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            # (elided: return — already written via WriteConsoleW)

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    # (elided: plain out.write(s) fallback and flush)


def bytes_to_intlist(bs):
    # Convert a byte string to a list of ints, portable across Py2/Py3.
    # NOTE(review): the empty-input guard is on an elided line.
    if isinstance(bs[0], int):  # Python 3
    return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    # Inverse of bytes_to_intlist; struct_pack handles the Py2/Py3 split.
    return struct_pack('%dB' % len(xs), *xs)
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    # (elided: import msvcrt)

    class OVERLAPPED(ctypes.Structure):
        # (elided: _fields_ = [)
        ('Internal', ctypes.wintypes.LPVOID),
        ('InternalHigh', ctypes.wintypes.LPVOID),
        ('Offset', ctypes.wintypes.DWORD),
        ('OffsetHigh', ctypes.wintypes.DWORD),
        ('hEvent', ctypes.wintypes.HANDLE),

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file (maximum lockable range).
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 = shared lock.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    # (def _unlock_file(f): elided)
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

# (else-branch: POSIX implementation via fcntl)
    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    # (def _unlock_file(f): elided)
        fcntl.flock(f, fcntl.LOCK_UN)


class locked_file(object):
    # Context-manager wrapper around an io.open() file with advisory locking.
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        # (elided: self.mode = mode)

    # (def __enter__ elided)
        # Read-only opens take a shared lock; writers lock exclusively.
        exclusive = self.mode != 'r'
        _lock_file(self.f, exclusive)

    def __exit__(self, etype, value, traceback):
        # (elided: unlock and close)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    reported = sys.getfilesystemencoding()
    if reported is None:
        return 'utf-8'
    return reported
def shell_quote(args):
    # Quote a list of arguments for display as a shell command line.
    # NOTE(review): `quoted_args = []` and the for-line are elided here.
    encoding = get_filesystem_encoding()
    if isinstance(a, bytes):
        # We may get a filename encoded with 'encodeFilename'
        a = a.decode(encoding)
    quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    # (body elided in this excerpt)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Data rides along in the fragment as a JSON-encoded query parameter.
    sdata = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    # Inverse of smuggle_url; returns (url, data) — final return elided.
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
def format_bytes(bytes):
    # Human-readable byte count ('1.00MiB'); None/zero guards elided.
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


def parse_filesize(s):
    # Parse '123.4MiB'-style sizes to an int byte count.
    # NOTE(review): the None-guard and the _UNIT_TABLE dict (plus the
    # re.match and its failure branch) are elided in this excerpt.
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)


def get_term_width():
    # Terminal width from $COLUMNS, falling back to `stty size`.
    # NOTE(review): the COLUMNS return, try/except and the stty argv list
    # are elided in this excerpt.
    columns = compat_getenv('COLUMNS', None)
    sp = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = sp.communicate()
    return int(out.split()[1])
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name.

    Returns 1-12, or None when the name is not an English month name.
    """
    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    try:
        return ENGLISH_NAMES.index(name) + 1
    except ValueError:
        return None
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # NOTE(review): the re.sub call wrapping this pattern (and the
    # replacement string) is elided; the lookahead spares already-escaped
    # entities.
    r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',


def setproctitle(title):
    # Set the process title via prctl(PR_SET_NAME) so `ps` shows youtube-dl.
    # NOTE(review): try/except around the LoadLibrary call is elided.
    assert isinstance(title, compat_str)
    libc = ctypes.cdll.LoadLibrary("libc.so.6")
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    # 15 = PR_SET_NAME — hedge: confirm against prctl(2).
    libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
def remove_start(s, start):
    """Return s with the prefix `start` removed (unchanged if absent)."""
    if s.startswith(start):
        return s[len(start):]
    return s
def remove_end(s, end):
    """Return s with the suffix `end` removed (unchanged if absent)."""
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
def url_basename(url):
    """Return the last non-empty path segment of `url` ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip('/').split('/')
    return segments[-1]
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues HEAD instead of GET."""

    def get_method(self):
        # urllib picks the HTTP verb via get_method(); force HEAD here.
        return 'HEAD'
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale; `default` when v is None.

    When get_attr is given, the named attribute of v is converted instead.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return v converted to compat_str, or `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: strips thousands separators
    ("," and "."), a leading "+", then converts to int. None passes through. """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
def parse_duration(s):
    # Parse a human duration ('1h 30m 5s', '90:05', '3.5 mins') to seconds.
    # NOTE(review): the None-guard, the re.match wrapping this pattern and
    # the `res = 0` / group guards are elided in this excerpt.
        (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
        (?P<only_hours>[0-9.]+)\s*(?:hours?)|
        (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
        (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    res += int(m.group('secs'))
    res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert `ext` just before the real extension: name.<ext>.<real_ext>."""
    root, real_ext = os.path.splitext(filename)
    return ''.join((root, '.', ext, real_ext))
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE(review): try/except OSError and the returns are elided in this
    # excerpt. Beware the mutable default `args=[]` (never mutated here).
    subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()


def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    # (elided: try/except OSError returning False)
    out, _ = subprocess.Popen(
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)


def detect_exe_version(output, version_re=None, unrecognized='present'):
    # Extract a version string from tool output; returns elided.
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
class PagedList(object):
    # Abstract base: subclasses implement getslice(start, end).
    # (def __len__ elided)
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) yields the entries of one page.
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): `res = []`, the if/else assignments for startv/endv
        # and the break/return lines are elided in this excerpt.
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                # (elided: continue — page entirely before the slice)

            page_results = list(self._pagefunc(pagenum))

            # startv: offset of `start` within this page (0 otherwise).
            start % self._pagesize
            if firstid <= start < nextfirstid
            # endv: offset of `end` within this page (None otherwise).
            ((end - 1) % self._pagesize) + 1
            if (end is not None and firstid <= end <= nextfirstid)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:


class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        # Total page count is known up front, unlike OnDemandPagedList.
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # NOTE(review): `res = []`, the end_page min() wrapper and the
        # append/return lines are elided in this excerpt.
        start_page = start // self._pagesize
        self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            page = page[skip_elems:]
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                page = page[:only_more]
def uppercase_escape(s):
    # Decode \UXXXXXXXX escapes left in JSON-ish strings by some sites.
    # NOTE(review): the re.sub call's final argument/close is elided.
    unicode_escape = codecs.getdecoder('unicode_escape')
    r'\\U[0-9a-fA-F]{8}',
    lambda m: unicode_escape(m.group(0))[0],


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Safe-characters list keeps all RFC 3986 reserved/sub-delims intact.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    # NOTE(review): the .geturl() call closing this expression is elided.
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)


# Py2.6 compat: struct may require a bytes format string.
# (elided: try:)
    struct.pack('!I', 0)
# (elided: except TypeError:)
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
# (elided: else:)
    struct_pack = struct.pack
    struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read a batch file of URLs, skipping BOMs, comments and blank lines.
    # NOTE(review): the inner fixup() def line and its strip/return lines
    # are elided in this excerpt.
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # UTF-8 BOM as misdecoded text (each byte became one char).
    BOM_UTF8 = '\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    if url.startswith(('#', ';', ']')):
        # (elided: return False — treat as a comment line)

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


def urlencode_postdata(*args, **kargs):
    # urlencode then ASCII-encode, as POST bodies must be bytes.
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')


# Element.iter() appeared in 2.7; fall back to findall for 2.6.
# (elided: try:)
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')


# Fragment of parse_xml(s) — its def line is elided in this excerpt.
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None:
                if not isinstance(n.text, compat_str):
                    n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    # Parse '18', '18+' or a US rating label into an age number.
    # NOTE(review): the None-guard and the US_RATINGS table are elided.
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)


def strip_jsonp(code):
    # Strip a JSONP wrapper callback(...) down to the inner JSON.
    # (elided: the `return re.sub(` line opening this call)
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)


def js_to_json(code):
    # Best-effort conversion of a JavaScript object literal to JSON.
    # NOTE(review): the inner fix_kv(m) def line, the quote-normalisation
    # replacement dict and the returns are elided in this excerpt.
    if v in ('true', 'false', 'null'):
    if v.startswith('"'):
    if v.startswith("'"):
        v = re.sub(r"\\\\|\\'|\"", lambda m: {
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
    # Drop trailing commas before closing brackets.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids
    (higher = better); unknown ids map to -1 so they sort below all known
    qualities.
    """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q
1510 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings.

    Returns s unchanged when it fits in `length` characters, otherwise a
    truncated copy ending in '...'. None passes through.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    When either string is missing or unparsable, fall back to assume_new:
    a "new" assumption means "not outdated".
    """
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric version components — cannot compare reliably.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    # Self-update only works for the zipped distribution or a frozen exe.
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
def args_to_str(args):
    """Get a short shell-style string representation for a subprocess command."""
    return ' '.join(map(shlex_quote, args))