2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
60 def preferredencoding():
61 """Get preferred encoding.
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
67 pref = locale.getpreferredencoding()
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
81 # os.path.basename returns a bytes object, but NamedTemporaryFile
82 # will fail if the filename contains non ascii characters unless we
83 # use a unicode object
84 path_basename = lambda f: os.path.basename(fn).decode(encoding)
85 # the same for os.path.dirname
86 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
108 tf = tempfile.NamedTemporaryFile(**args)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
120 os.rename(tf.name, fn)
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath[@key=val], or None."""
        # Only simple attribute names/values are supported; the asserts guard
        # against input that would break the hand-built XPath expression.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = "%s[@%s='%s']" % (xpath, key, val)
        return node.find(expr)
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
157 replaced.append(c[0])
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
169 if n is None or n.text is None:
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the inner content of the tag whose id attribute equals *id*."""
    # Thin convenience wrapper around the generic attribute-based lookup.
    return get_element_by_attribute("id", id, html)
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
186 m = re.search(r'''(?xs)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 ''' % (re.escape(attribute), re.escape(value)), html)
198 res = m.group('content')
200 if res.startswith('"') or res.startswith("'"):
203 return unescapeHTML(res)
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
209 if html is None: # Convenience for sanitizing descriptions etc.
213 html = html.replace('\n', ' ')
214 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
215 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
217 html = re.sub('<.*?>', '', html)
218 # Replace html entities
219 html = unescapeHTML(html)
223 def sanitize_open(filename, open_mode):
224 """Try to open the given filename, and slightly tweak it if this fails.
226 Attempts to open the given filename. If this fails, it tries to change
227 the filename slightly, step by step, until it's either able to open it
228 or it fails and raises a final exception, like the standard open()
231 It returns the tuple (stream, definitive_file_name).
235 if sys.platform == 'win32':
237 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
239 stream = open(encodeFilename(filename), open_mode)
240 return (stream, filename)
241 except (IOError, OSError) as err:
242 if err.errno in (errno.EACCES,):
245 # In case of error, try to remove win32 forbidden chars
246 alt_filename = os.path.join(
247 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
248 for path_part in os.path.split(filename)
250 if alt_filename == filename:
253 # An exception here should be caught in the caller
254 stream = open(encodeFilename(filename), open_mode)
255 return (stream, alt_filename)
258 def timeconvert(timestr):
259 """Convert RFC 2822 defined time string into system timestamp"""
261 timetuple = email.utils.parsedate_tz(timestr)
262 if timetuple is not None:
263 timestamp = email.utils.mktime_tz(timetuple)
267 def sanitize_filename(s, restricted=False, is_id=False):
268 """Sanitizes a string so it could be used as part of a filename.
269 If restricted is set, use a stricter subset of allowed characters.
270 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
272 def replace_insane(char):
273 if char == '?' or ord(char) < 32 or ord(char) == 127:
276 return '' if restricted else '\''
278 return '_-' if restricted else ' -'
279 elif char in '\\/|*<>':
281 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
283 if restricted and ord(char) > 127:
287 result = ''.join(map(replace_insane, s))
289 while '__' in result:
290 result = result.replace('__', '_')
291 result = result.strip('_')
292 # Common case of "Foreign band name - English song title"
293 if restricted and result.startswith('-_'):
300 def orderedSet(iterable):
301 """ Remove all duplicates from the input iterable """
309 def _htmlentity_transform(entity):
310 """Transforms an HTML entity to a character."""
311 # Known non-numeric HTML entity
312 if entity in compat_html_entities.name2codepoint:
313 return compat_chr(compat_html_entities.name2codepoint[entity])
315 mobj = re.match(r'#(x?[0-9]+)', entity)
317 numstr = mobj.group(1)
318 if numstr.startswith('x'):
320 numstr = '0%s' % numstr
323 return compat_chr(int(numstr, base))
325 # Unknown entity in name, return its literal representation
326 return ('&%s;' % entity)
332 assert type(s) == compat_str
335 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
338 def encodeFilename(s, for_subprocess=False):
340 @param s The name of the file
343 assert type(s) == compat_str
345 # Python 3 has a Unicode API
346 if sys.version_info >= (3, 0):
349 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
350 # Pass '' directly to use Unicode APIs on Windows 2000 and up
351 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
352 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
353 if not for_subprocess:
356 # For subprocess calls, encode with locale encoding
357 # Refer to http://stackoverflow.com/a/9951851/35070
358 encoding = preferredencoding()
360 encoding = sys.getfilesystemencoding()
363 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a value for safe use as a subprocess argument."""
    if not isinstance(s, compat_str):
        # Legacy callers still hand us byte strings; decode them so the value
        # can be re-encoded consistently below.
        # TODO: turn this into a hard error once all post processors pass text:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
375 def decodeOption(optval):
378 if isinstance(optval, bytes):
379 optval = optval.decode(preferredencoding())
381 assert isinstance(optval, compat_str)
385 def formatSeconds(secs):
387 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
389 return '%d:%02d' % (secs // 60, secs % 60)
394 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
395 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
396 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
397 if opts_no_check_certificate:
398 context.verify_mode = ssl.CERT_NONE
400 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
403 # (create_default_context present but HTTPSHandler has no context=)
406 if sys.version_info < (3, 2):
409 class HTTPSConnectionV3(httplib.HTTPSConnection):
410 def __init__(self, *args, **kwargs):
411 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
414 sock = socket.create_connection((self.host, self.port), self.timeout)
415 if getattr(self, '_tunnel_host', False):
419 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
421 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
423 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
424 def https_open(self, req):
425 return self.do_open(HTTPSConnectionV3, req)
426 return HTTPSHandlerV3(**kwargs)
428 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
429 context.verify_mode = (ssl.CERT_NONE
430 if opts_no_check_certificate
431 else ssl.CERT_REQUIRED)
432 context.set_default_verify_paths()
433 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
436 class ExtractorError(Exception):
437 """Error during info extraction."""
439 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
440 """ tb, if given, is the original traceback (so that it can be printed out).
441 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
444 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
446 if video_id is not None:
447 msg = video_id + ': ' + msg
449 msg += ' (caused by %r)' % cause
451 if ytdl_is_updateable():
452 update_cmd = 'type youtube-dl -U to update'
454 update_cmd = 'see https://yt-dl.org/update on how to update'
455 msg += '; please report this issue on https://yt-dl.org/bug .'
456 msg += ' Make sure you are using the latest version; %s.' % update_cmd
457 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
458 super(ExtractorError, self).__init__(msg)
461 self.exc_info = sys.exc_info() # preserve original exception
463 self.video_id = video_id
465 def format_traceback(self):
466 if self.traceback is None:
468 return ''.join(traceback.format_tb(self.traceback))
class UnsupportedError(ExtractorError):
    """ExtractorError raised when no extractor can handle the given URL."""

    def __init__(self, url):
        # expected=True: not a bug in youtube-dl, just an unsupported site.
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression search found no match."""
483 class DownloadError(Exception):
484 """Download Error exception.
486 This exception may be thrown by FileDownloader objects if they are not
487 configured to continue on errors. They will contain the appropriate
491 def __init__(self, msg, exc_info=None):
492 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
493 super(DownloadError, self).__init__(msg)
494 self.exc_info = exc_info
497 class SameFileError(Exception):
498 """Same File exception.
500 This exception will be thrown by FileDownloader objects if they detect
501 multiple files would have to be downloaded to the same file on disk.
506 class PostProcessingError(Exception):
507 """Post Processing exception.
509 This exception may be raised by PostProcessor's .run() method to
510 indicate an error in the postprocessing task.
513 def __init__(self, msg):
class MaxDownloadsReached(Exception):
    """ Raised to stop processing once the --max-downloads limit has been reached. """
522 class UnavailableVideoError(Exception):
523 """Unavailable Format exception.
525 This exception will be thrown when a video is requested
526 in a format that is not available for that video.
531 class ContentTooShortError(Exception):
532 """Content Too Short exception.
534 This exception may be raised by FileDownloader objects when a file they
535 download is too small for what the server announced first, indicating
536 the connection was probably interrupted.
542 def __init__(self, downloaded, expected):
543 self.downloaded = downloaded
544 self.expected = expected
547 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
548 """Handler for HTTP requests and responses.
550 This class, when installed with an OpenerDirector, automatically adds
551 the standard headers to every HTTP request and handles gzipped and
552 deflated responses from web servers. If compression is to be avoided in
553 a particular request, the original request in the program code only has
554 to include the HTTP header "Youtubedl-No-Compression", which will be
555 removed before making the real request.
557 Part of this code was copied from:
559 http://techknack.net/python-urllib2-handlers/
561 Andrew Rowls, the author of that code, agreed to release it to the
568 return zlib.decompress(data, -zlib.MAX_WBITS)
570 return zlib.decompress(data)
573 def addinfourl_wrapper(stream, headers, url, code):
574 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
575 return compat_urllib_request.addinfourl(stream, headers, url, code)
576 ret = compat_urllib_request.addinfourl(stream, headers, url)
580 def http_request(self, req):
581 for h, v in std_headers.items():
582 if h not in req.headers:
584 if 'Youtubedl-no-compression' in req.headers:
585 if 'Accept-encoding' in req.headers:
586 del req.headers['Accept-encoding']
587 del req.headers['Youtubedl-no-compression']
588 if 'Youtubedl-user-agent' in req.headers:
589 if 'User-agent' in req.headers:
590 del req.headers['User-agent']
591 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
592 del req.headers['Youtubedl-user-agent']
594 if sys.version_info < (2, 7) and '#' in req.get_full_url():
595 # Python 2.6 is brain-dead when it comes to fragments
596 req._Request__original = req._Request__original.partition('#')[0]
597 req._Request__r_type = req._Request__r_type.partition('#')[0]
601 def http_response(self, req, resp):
604 if resp.headers.get('Content-encoding', '') == 'gzip':
605 content = resp.read()
606 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
608 uncompressed = io.BytesIO(gz.read())
609 except IOError as original_ioerror:
610 # There may be junk add the end of the file
611 # See http://stackoverflow.com/q/4928560/35070 for details
612 for i in range(1, 1024):
614 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
615 uncompressed = io.BytesIO(gz.read())
620 raise original_ioerror
621 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
622 resp.msg = old_resp.msg
624 if resp.headers.get('Content-encoding', '') == 'deflate':
625 gz = io.BytesIO(self.deflate(resp.read()))
626 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
627 resp.msg = old_resp.msg
630 https_request = http_request
631 https_response = http_response
634 def parse_iso8601(date_str, delimiter='T'):
635 """ Return a UNIX timestamp from the given date """
641 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
644 timezone = datetime.timedelta()
646 date_str = date_str[:-len(m.group(0))]
647 if not m.group('sign'):
648 timezone = datetime.timedelta()
650 sign = 1 if m.group('sign') == '+' else -1
651 timezone = datetime.timedelta(
652 hours=sign * int(m.group('hours')),
653 minutes=sign * int(m.group('minutes')))
654 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
655 dt = datetime.datetime.strptime(date_str, date_format) - timezone
656 return calendar.timegm(dt.timetuple())
659 def unified_strdate(date_str, day_first=True):
660 """Return a string with the date in the format YYYYMMDD"""
666 date_str = date_str.replace(',', ' ')
667 # %z (UTC offset) is only supported in python>=3.2
668 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
669 # Remove AM/PM + timezone
670 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
672 format_expressions = [
677 '%b %dst %Y %I:%M%p',
678 '%b %dnd %Y %I:%M%p',
679 '%b %dth %Y %I:%M%p',
684 '%Y-%m-%d %H:%M:%S.%f',
687 '%Y-%m-%dT%H:%M:%SZ',
688 '%Y-%m-%dT%H:%M:%S.%fZ',
689 '%Y-%m-%dT%H:%M:%S.%f0Z',
691 '%Y-%m-%dT%H:%M:%S.%f',
695 format_expressions.extend([
702 format_expressions.extend([
708 for expression in format_expressions:
710 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
713 if upload_date is None:
714 timetuple = email.utils.parsedate_tz(date_str)
716 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
720 def determine_ext(url, default_ext='unknown_video'):
723 guess = url.partition('?')[0].rpartition('.')[2]
724 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: <base>.<lang>.<format> (extension dropped)."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
734 def date_from_str(date_str):
736 Return a datetime object from a string in the format YYYYMMDD or
737 (now|today)[+-][0-9](day|week|month|year)(s)?"""
738 today = datetime.date.today()
739 if date_str in ('now', 'today'):
741 if date_str == 'yesterday':
742 return today - datetime.timedelta(days=1)
743 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
744 if match is not None:
745 sign = match.group('sign')
746 time = int(match.group('time'))
749 unit = match.group('unit')
750 # A bad aproximation?
758 delta = datetime.timedelta(**{unit: time})
760 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
763 def hyphenate_date(date_str):
765 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
766 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
767 if match is not None:
768 return '-'.join(match.groups())
773 class DateRange(object):
774 """Represents a time interval between two dates"""
776 def __init__(self, start=None, end=None):
777 """start and end must be strings in the format accepted by date"""
778 if start is not None:
779 self.start = date_from_str(start)
781 self.start = datetime.datetime.min.date()
783 self.end = date_from_str(end)
785 self.end = datetime.datetime.max.date()
786 if self.start > self.end:
787 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
791 """Returns a range that only contains the given day"""
794 def __contains__(self, date):
795 """Check if the date is in the range"""
796 if not isinstance(date, datetime.date):
797 date = date_from_str(date)
798 return self.start <= date <= self.end
801 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
805 """ Returns the platform name as a compat_str """
806 res = platform.platform()
807 if isinstance(res, bytes):
808 res = res.decode(preferredencoding())
810 assert isinstance(res, compat_str)
814 def _windows_write_string(s, out):
815 """ Returns True if the string was written using special methods,
816 False if it has yet to be written out."""
817 # Adapted from http://stackoverflow.com/a/3259271/35070
820 import ctypes.wintypes
828 fileno = out.fileno()
829 except AttributeError:
830 # If the output stream doesn't have a fileno, it's virtual
832 if fileno not in WIN_OUTPUT_IDS:
835 GetStdHandle = ctypes.WINFUNCTYPE(
836 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
837 (b"GetStdHandle", ctypes.windll.kernel32))
838 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
840 WriteConsoleW = ctypes.WINFUNCTYPE(
841 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
842 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
843 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
844 written = ctypes.wintypes.DWORD(0)
846 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
847 FILE_TYPE_CHAR = 0x0002
848 FILE_TYPE_REMOTE = 0x8000
849 GetConsoleMode = ctypes.WINFUNCTYPE(
850 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
851 ctypes.POINTER(ctypes.wintypes.DWORD))(
852 (b"GetConsoleMode", ctypes.windll.kernel32))
853 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
855 def not_a_console(handle):
856 if handle == INVALID_HANDLE_VALUE or handle is None:
858 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
859 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
864 def next_nonbmp_pos(s):
866 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
867 except StopIteration:
871 count = min(next_nonbmp_pos(s), 1024)
874 h, s, count if count else 2, ctypes.byref(written), None)
876 raise OSError('Failed to write string')
877 if not count: # We just wrote a non-BMP character
878 assert written.value == 2
881 assert written.value > 0
882 s = s[written.value:]
886 def write_string(s, out=None, encoding=None):
889 assert type(s) == compat_str
891 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
892 if _windows_write_string(s, out):
895 if ('b' in getattr(out, 'mode', '') or
896 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
897 byt = s.encode(encoding or preferredencoding(), 'ignore')
899 elif hasattr(out, 'buffer'):
900 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
901 byt = s.encode(enc, 'ignore')
902 out.buffer.write(byt)
908 def bytes_to_intlist(bs):
911 if isinstance(bs[0], int): # Python 3
914 return [ord(c) for c in bs]
917 def intlist_to_bytes(xs):
920 return struct_pack('%dB' % len(xs), *xs)
923 # Cross-platform file locking
924 if sys.platform == 'win32':
925 import ctypes.wintypes
928 class OVERLAPPED(ctypes.Structure):
930 ('Internal', ctypes.wintypes.LPVOID),
931 ('InternalHigh', ctypes.wintypes.LPVOID),
932 ('Offset', ctypes.wintypes.DWORD),
933 ('OffsetHigh', ctypes.wintypes.DWORD),
934 ('hEvent', ctypes.wintypes.HANDLE),
937 kernel32 = ctypes.windll.kernel32
938 LockFileEx = kernel32.LockFileEx
939 LockFileEx.argtypes = [
940 ctypes.wintypes.HANDLE, # hFile
941 ctypes.wintypes.DWORD, # dwFlags
942 ctypes.wintypes.DWORD, # dwReserved
943 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
944 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
945 ctypes.POINTER(OVERLAPPED) # Overlapped
947 LockFileEx.restype = ctypes.wintypes.BOOL
948 UnlockFileEx = kernel32.UnlockFileEx
949 UnlockFileEx.argtypes = [
950 ctypes.wintypes.HANDLE, # hFile
951 ctypes.wintypes.DWORD, # dwReserved
952 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
953 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
954 ctypes.POINTER(OVERLAPPED) # Overlapped
956 UnlockFileEx.restype = ctypes.wintypes.BOOL
957 whole_low = 0xffffffff
958 whole_high = 0x7fffffff
960 def _lock_file(f, exclusive):
961 overlapped = OVERLAPPED()
962 overlapped.Offset = 0
963 overlapped.OffsetHigh = 0
964 overlapped.hEvent = 0
965 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
966 handle = msvcrt.get_osfhandle(f.fileno())
967 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
968 whole_low, whole_high, f._lock_file_overlapped_p):
969 raise OSError('Locking file failed: %r' % ctypes.FormatError())
972 assert f._lock_file_overlapped_p
973 handle = msvcrt.get_osfhandle(f.fileno())
974 if not UnlockFileEx(handle, 0,
975 whole_low, whole_high, f._lock_file_overlapped_p):
976 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
981 def _lock_file(f, exclusive):
982 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
985 fcntl.flock(f, fcntl.LOCK_UN)
988 class locked_file(object):
989 def __init__(self, filename, mode, encoding=None):
990 assert mode in ['r', 'a', 'w']
991 self.f = io.open(filename, mode, encoding=encoding)
995 exclusive = self.mode != 'r'
997 _lock_file(self.f, exclusive)
1003 def __exit__(self, etype, value, traceback):
1005 _unlock_file(self.f)
1012 def write(self, *args):
1013 return self.f.write(*args)
1015 def read(self, *args):
1016 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        # Can happen on some Python 2 setups; pick a sane default.
        enc = 'utf-8'
    return enc
1024 def shell_quote(args):
1026 encoding = get_filesystem_encoding()
1028 if isinstance(a, bytes):
1029 # We may get a filename encoded with 'encodeFilename'
1030 a = a.decode(encoding)
1031 quoted_args.append(pipes.quote(a))
1032 return ' '.join(quoted_args)
1035 def takewhile_inclusive(pred, seq):
1036 """ Like itertools.takewhile, but include the latest evaluated element
1037 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Embed *data* (JSON-encoded) in the fragment of *url* for internal use."""
    fragment = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, fragment)
1052 def unsmuggle_url(smug_url, default=None):
1053 if '#__youtubedl_smuggle' not in smug_url:
1054 return smug_url, default
1055 url, _, sdata = smug_url.rpartition('#')
1056 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1057 data = json.loads(jsond)
1061 def format_bytes(bytes):
1064 if type(bytes) is str:
1065 bytes = float(bytes)
1069 exponent = int(math.log(bytes, 1024.0))
1070 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1071 converted = float(bytes) / float(1024 ** exponent)
1072 return '%.2f%s' % (converted, suffix)
1075 def parse_filesize(s):
1079 # The lower-case forms are of course incorrect and inofficial,
1080 # but we support those too
1118 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1120 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1124 num_str = m.group('num').replace(',', '.')
1125 mult = _UNIT_TABLE[m.group('unit')]
1126 return int(float(num_str) * mult)
1129 def get_term_width():
1130 columns = compat_getenv('COLUMNS', None)
1135 sp = subprocess.Popen(
1137 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1138 out, err = sp.communicate()
1139 return int(out.split()[1])
1145 def month_by_name(name):
1146 """ Return the number of a month by (locale-independently) English name """
1149 'January', 'February', 'March', 'April', 'May', 'June',
1150 'July', 'August', 'September', 'October', 'November', 'December']
1152 return ENGLISH_NAMES.index(name) + 1
1157 def fix_xml_ampersands(xml_str):
1158 """Replace all the '&' by '&' in XML"""
1160 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1165 def setproctitle(title):
1166 assert isinstance(title, compat_str)
1168 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1171 title_bytes = title.encode('utf-8')
1172 buf = ctypes.create_string_buffer(len(title_bytes))
1173 buf.value = title_bytes
1175 libc.prctl(15, buf, 0, 0, 0)
1176 except AttributeError:
1177 return # Strange libc, just skip this
1180 def remove_start(s, start):
1181 if s.startswith(start):
1182 return s[len(start):]
1186 def remove_end(s, end):
1188 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* (query and fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1197 class HEADRequest(compat_urllib_request.Request):
1198 def get_method(self):
1202 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1205 v = getattr(v, get_attr, None)
1208 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to compat_str, returning *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1215 def str_to_int(int_str):
1216 """ A more relaxed version of int_or_none """
1219 int_str = re.sub(r'[,\.\+]', '', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float scaled by invscale/scale; return *default* for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1227 def parse_duration(s):
1236 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1237 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1240 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1241 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1243 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1248 if m.group('only_mins'):
1249 return float_or_none(m.group('only_mins'), invscale=60)
1250 if m.group('only_hours'):
1251 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1253 res += int(m.group('secs'))
1255 res += int(m.group('mins')) * 60
1256 if m.group('hours'):
1257 res += int(m.group('hours')) * 60 * 60
1259 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: foo.mp4 -> foo.<ext>.mp4."""
    name, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (name, ext, real_ext)
1268 def check_executable(exe, args=[]):
1269 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1270 args can be a list of arguments for a short output (like -version) """
1272 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1278 def get_exe_version(exe, args=['--version'],
1279 version_re=None, unrecognized='present'):
1280 """ Returns the version of the specified executable,
1281 or False if the executable is not present """
1283 out, _ = subprocess.Popen(
1285 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1288 if isinstance(out, bytes): # Python 2.x
1289 out = out.decode('ascii', 'ignore')
1290 return detect_exe_version(out, version_re, unrecognized)
1293 def detect_exe_version(output, version_re=None, unrecognized='present'):
1294 assert isinstance(output, compat_str)
1295 if version_re is None:
1296 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1297 m = re.search(version_re, output)
1304 class PagedList(object):
1306 # This is only useful for tests
1307 return len(self.getslice())
1310 class OnDemandPagedList(PagedList):
1311 def __init__(self, pagefunc, pagesize):
1312 self._pagefunc = pagefunc
1313 self._pagesize = pagesize
1315 def getslice(self, start=0, end=None):
1317 for pagenum in itertools.count(start // self._pagesize):
1318 firstid = pagenum * self._pagesize
1319 nextfirstid = pagenum * self._pagesize + self._pagesize
1320 if start >= nextfirstid:
1323 page_results = list(self._pagefunc(pagenum))
1326 start % self._pagesize
1327 if firstid <= start < nextfirstid
1331 ((end - 1) % self._pagesize) + 1
1332 if (end is not None and firstid <= end <= nextfirstid)
1335 if startv != 0 or endv is not None:
1336 page_results = page_results[startv:endv]
1337 res.extend(page_results)
1339 # A little optimization - if current page is not "full", ie. does
1340 # not contain page_size videos then we can assume that this page
1341 # is the last one - there are no more ids on further pages -
1342 # i.e. no need to query again.
1343 if len(page_results) + startv < self._pagesize:
1346 # If we got the whole page, but the next page is not interesting,
1347 # break out early as well
1348 if end == nextfirstid:
1353 class InAdvancePagedList(PagedList):
1354 def __init__(self, pagefunc, pagecount, pagesize):
1355 self._pagefunc = pagefunc
1356 self._pagecount = pagecount
1357 self._pagesize = pagesize
1359 def getslice(self, start=0, end=None):
1361 start_page = start // self._pagesize
1363 self._pagecount if end is None else (end // self._pagesize + 1))
1364 skip_elems = start - start_page * self._pagesize
1365 only_more = None if end is None else end - start
1366 for pagenum in range(start_page, end_page):
1367 page = list(self._pagefunc(pagenum))
1369 page = page[skip_elems:]
1371 if only_more is not None:
1372 if len(page) < only_more:
1373 only_more -= len(page)
1375 page = page[:only_more]
1382 def uppercase_escape(s):
1383 unicode_escape = codecs.getdecoder('unicode_escape')
1385 r'\\U[0-9a-fA-F]{8}',
1386 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # NOTE: the version check must come first — `unicode` only exists on
    # Python 2, so the short-circuit keeps Python 3 from raising NameError.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    # Split first so the structural separators ('/', '?', '#', ...) are
    # preserved and only each component's payload gets percent-escaped.
    # NOTE(review): the closing `).geturl()` of this call appears to be
    # elided from this excerpt.
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
# Feature probe: on Python 2.6 (and some 2.7 builds) struct.pack rejects
# unicode format strings, so wrappers that encode the spec are installed;
# otherwise the native functions are aliased directly.
# NOTE(review): the surrounding `try:` / `except TypeError:` / `else:`
# scaffolding is elided from this excerpt.
struct.pack('!I', 0)
# In Python 2.x, struct requires a bytes argument
def struct_pack(spec, *args):
    # Encode a text format spec to ASCII bytes before delegating.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.pack(spec, *args)

def struct_unpack(spec, *args):
    # Mirror of struct_pack for the unpack direction.
    if isinstance(spec, compat_str):
        spec = spec.encode('ascii')
    return struct.unpack(spec, *args)

# Native struct accepts str specs — alias it with no wrapper overhead.
struct_pack = struct.pack
struct_unpack = struct.unpack
def read_batch_urls(batch_fd):
    # Read a batch file object and return its non-comment, non-empty URLs.
    # NOTE(review): the inner `def fixup(url):` header and its
    # strip/return statements are elided from this excerpt.
    # Normalise raw bytes lines to text (py2 file objects yield bytes).
    if not isinstance(url, compat_str):
        url = url.decode('utf-8', 'replace')
    # Drop a leading UTF-8 byte-order mark.  NOTE(review): with
    # unicode_literals this literal is the three decoded chars
    # U+00EF U+00BB U+00BF, not U+FEFF — upstream later also checked
    # '\ufeff'; verify which form actually reaches this point.
    BOM_UTF8 = '\xef\xbb\xbf'
    if url.startswith(BOM_UTF8):
        url = url[len(BOM_UTF8):]
    # Lines starting with '#', ';' or ']' are comments (body elided).
    if url.startswith(('#', ';', ']')):

    # Close the file even on error; falsy (empty) lines are dropped.
    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the given arguments and return them as ASCII bytes,
    ready to be used as a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
# Element.iter exists on Python >= 2.7; emulate it with findall('.//*')
# on 2.6.  NOTE(review): the opening `try:` is elided from this excerpt.
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
# NOTE(review): these lines are the interior of an XML-parsing helper
# whose `def` header is elided from this excerpt; `s` is presumably its
# text argument — confirm against the full file.
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    def doctype(self, name, pubid, system):
        pass  # Ignore doctypes

parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
# The `parser` keyword for ElementTree.XML is only honoured on >= 2.7.
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# Fix up XML parser in Python 2.x: ensure every text node is unicode.
if sys.version_info < (3, 0):
    for n in etree_iter(tree):
        if n.text is not None:
            if not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
def parse_age_limit(s):
    """Parse an age-limit string into an int.

    Accepts plain ages like '18' or '18+' (one or two digits); anything
    else is looked up in the US_RATINGS table (e.g. 'PG-13').
    Returns None for None input or when nothing matches.
    """
    # Guard: re.match would raise TypeError on None.
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    # Strip a JSONP wrapper `callback( ... );` (plus optional trailing
    # line comments) down to the inner JSON payload.
    # NOTE(review): the `return re.sub(` call head is elided from this
    # excerpt; only the pattern/replacement/subject arguments are visible.
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
def js_to_json(code):
    # Convert JavaScript-flavoured object literals into valid JSON.
    # NOTE(review): the inner `fix_kv(m)` helper's def/return statements,
    # the quote-normalisation mapping body and the final `return res`
    # are elided from this excerpt.
    # Keywords pass through unchanged (body elided).
    if v in ('true', 'false', 'null'):
    # Double-quoted strings are already JSON (body elided).
    if v.startswith('"'):
    # Single-quoted strings need requoting (body elided).
    if v.startswith("'"):
        # Unescape \' and escape bare " via the elided mapping.
        v = re.sub(r"\\\\|\\'|\"", lambda m: {

    # Match string literals (so they are left alone) or bare identifiers
    # (so they get quoted by the elided fix_kv helper).
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
    # Drop trailing commas before a closing bracket.
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    # NOTE(review): the inner `def q(qid):` header (and presumably a
    # `return q`) are elided from this excerpt; the position in
    # quality_ids is the numeric quality rank.
        return quality_ids.index(qid)
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns s unchanged when it fits in `length` characters, otherwise a
    truncation of s ending in '...' whose total length is `length`.
    None passes through unchanged.
    """
    # Guard: len(None) would raise TypeError.
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        # Reserve room for the ellipses inside the length budget.
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s
def version_tuple(v):
    """Split a version string like '2014.12.06-2' on '.' and '-' and
    return the pieces as a tuple of ints (for ordered comparison)."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is older than `limit`.

    When the version is empty/None or cannot be parsed as a version
    string, the answer falls back to `not assume_new` (i.e. with the
    default assume_new=True an unparseable version counts as current).
    """
    # Empty or missing version: use the caller's assumption.
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Non-numeric component — cannot compare reliably.
        return not assume_new
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Self-updating only works for the zip bundle or a frozen (py2exe)
    # build — not for pip/package installs.
    running_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    is_frozen_build = hasattr(sys, 'frozen')
    return running_from_zip or is_frozen_build
def args_to_str(args):
    """Render a subprocess argv list as a single shell-quoted string
    (for display purposes only)."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response handle's Content-Type
    header, e.g. 'video/mp4' -> 'mp4'."""
    try:
        # Probe .headers eagerly: merely defining the lambda would defer
        # the attribute access, so the except branch could never fire on
        # Python 2 handles that lack .headers.
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    return getheader('Content-Type').split("/")[1]
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked

    content_limit: minimum age required by the content (None = unrated).
    age_limit: maximum age the user allows (None = no restriction).
    """
    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    # Block only when the content demands a higher age than allowed.
    return age_limit < content_limit