2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
47 # This is not clearly defined otherwise
48 compiled_regex_type = type(re.compile(''))
51 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
52 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
53 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 'Accept-Encoding': 'gzip, deflate',
55 'Accept-Language': 'en-us,en;q=0.5',
58 def preferredencoding():
59 """Get preferred encoding.
61 Returns the best encoding scheme for the system, based on
62 locale.getpreferredencoding() and some further tweaks.
65 pref = locale.getpreferredencoding()
73 def write_json_file(obj, fn):
74 """ Encode obj as JSON and write it to fn, atomically """
76 if sys.version_info < (3, 0):
77 encoding = get_filesystem_encoding()
78 # os.path.basename returns a bytes object, but NamedTemporaryFile
79 # will fail if the filename contains non ascii characters unless we
80 # use a unicode object
81 path_basename = lambda f: os.path.basename(fn).decode(encoding)
82 # the same for os.path.dirname
83 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
85 path_basename = os.path.basename
86 path_dirname = os.path.dirname
90 'prefix': path_basename(fn) + '.',
91 'dir': path_dirname(fn),
95 # In Python 2.x, json.dump expects a bytestream.
96 # In Python 3.x, it writes to a character stream
97 if sys.version_info < (3, 0):
105 tf = tempfile.NamedTemporaryFile(**args)
110 os.rename(tf.name, fn)
119 if sys.version_info >= (2, 7):
120 def find_xpath_attr(node, xpath, key, val):
121 """ Find the xpath xpath[@key=val] """
122 assert re.match(r'^[a-zA-Z-]+$', key)
123 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
124 expr = xpath + u"[@%s='%s']" % (key, val)
125 return node.find(expr)
127 def find_xpath_attr(node, xpath, key, val):
128 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
129 # .//node does not match if a node is a direct child of . !
130 if isinstance(xpath, unicode):
131 xpath = xpath.encode('ascii')
133 for f in node.findall(xpath):
134 if f.attrib.get(key) == val:
138 # On python2.6 the xml.etree.ElementTree.Element methods don't support
139 # the namespace parameter
140 def xpath_with_ns(path, ns_map):
141 components = [c.split(':') for c in path.split('/')]
145 replaced.append(c[0])
148 replaced.append('{%s}%s' % (ns_map[ns], tag))
149 return '/'.join(replaced)
152 def xpath_text(node, xpath, name=None, fatal=False):
153 if sys.version_info < (2, 7): # Crazy 2.6
154 xpath = xpath.encode('ascii')
159 name = xpath if name is None else name
160 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Look up the tag whose "id" attribute equals *id* in the HTML text *html*.

    Convenience wrapper: the search is delegated to
    get_element_by_attribute() with the attribute name fixed to "id", and
    whatever that helper returns is handed back unchanged.
    """
    element_content = get_element_by_attribute("id", id, html)
    return element_content
171 def get_element_by_attribute(attribute, value, html):
172 """Return the content of the tag with the specified attribute in the passed HTML document"""
174 m = re.search(r'''(?xs)
176 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
178 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
182 ''' % (re.escape(attribute), re.escape(value)), html)
186 res = m.group('content')
188 if res.startswith('"') or res.startswith("'"):
191 return unescapeHTML(res)
194 def clean_html(html):
195 """Clean an HTML snippet into a readable string"""
197 html = html.replace('\n', ' ')
198 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
199 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
201 html = re.sub('<.*?>', '', html)
202 # Replace html entities
203 html = unescapeHTML(html)
207 def sanitize_open(filename, open_mode):
208 """Try to open the given filename, and slightly tweak it if this fails.
210 Attempts to open the given filename. If this fails, it tries to change
211 the filename slightly, step by step, until it's either able to open it
212 or it fails and raises a final exception, like the standard open()
215 It returns the tuple (stream, definitive_file_name).
219 if sys.platform == 'win32':
221 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
222 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
223 stream = open(encodeFilename(filename), open_mode)
224 return (stream, filename)
225 except (IOError, OSError) as err:
226 if err.errno in (errno.EACCES,):
229 # In case of error, try to remove win32 forbidden chars
230 alt_filename = os.path.join(
231 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
232 for path_part in os.path.split(filename)
234 if alt_filename == filename:
237 # An exception here should be caught in the caller
238 stream = open(encodeFilename(filename), open_mode)
239 return (stream, alt_filename)
242 def timeconvert(timestr):
243 """Convert RFC 2822 defined time string into system timestamp"""
245 timetuple = email.utils.parsedate_tz(timestr)
246 if timetuple is not None:
247 timestamp = email.utils.mktime_tz(timetuple)
250 def sanitize_filename(s, restricted=False, is_id=False):
251 """Sanitizes a string so it could be used as part of a filename.
252 If restricted is set, use a stricter subset of allowed characters.
253 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
255 def replace_insane(char):
256 if char == '?' or ord(char) < 32 or ord(char) == 127:
259 return '' if restricted else '\''
261 return '_-' if restricted else ' -'
262 elif char in '\\/|*<>':
264 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
266 if restricted and ord(char) > 127:
270 result = u''.join(map(replace_insane, s))
272 while '__' in result:
273 result = result.replace('__', '_')
274 result = result.strip('_')
275 # Common case of "Foreign band name - English song title"
276 if restricted and result.startswith('-_'):
282 def orderedSet(iterable):
283 """ Remove all duplicates from the input iterable """
291 def _htmlentity_transform(entity):
292 """Transforms an HTML entity to a character."""
293 # Known non-numeric HTML entity
294 if entity in compat_html_entities.name2codepoint:
295 return compat_chr(compat_html_entities.name2codepoint[entity])
297 mobj = re.match(r'#(x?[0-9]+)', entity)
299 numstr = mobj.group(1)
300 if numstr.startswith(u'x'):
302 numstr = u'0%s' % numstr
305 return compat_chr(int(numstr, base))
307 # Unknown entity in name, return its literal representation
308 return (u'&%s;' % entity)
314 assert type(s) == compat_str
317 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
320 def encodeFilename(s, for_subprocess=False):
322 @param s The name of the file
325 assert type(s) == compat_str
327 # Python 3 has a Unicode API
328 if sys.version_info >= (3, 0):
331 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
332 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
333 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
334 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
335 if not for_subprocess:
338 # For subprocess calls, encode with locale encoding
339 # Refer to http://stackoverflow.com/a/9951851/35070
340 encoding = preferredencoding()
342 encoding = sys.getfilesystemencoding()
345 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode a single argument for a subprocess call.

    Text strings (compat_str) are passed directly to encodeFilename() with
    its second argument (for_subprocess) set to True.  Anything else is
    assumed to be a legacy byte string and is decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code still hands over byte strings.
    # TODO: once all post processors are fixed, fail loudly here instead:
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    decoded = s.decode('ascii')
    return encodeFilename(decoded, True)
357 def decodeOption(optval):
360 if isinstance(optval, bytes):
361 optval = optval.decode(preferredencoding())
363 assert isinstance(optval, compat_str)
366 def formatSeconds(secs):
368 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
370 return '%d:%02d' % (secs // 60, secs % 60)
375 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
376 if sys.version_info < (3, 2):
379 class HTTPSConnectionV3(httplib.HTTPSConnection):
380 def __init__(self, *args, **kwargs):
381 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
384 sock = socket.create_connection((self.host, self.port), self.timeout)
385 if getattr(self, '_tunnel_host', False):
389 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
391 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
393 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
394 def https_open(self, req):
395 return self.do_open(HTTPSConnectionV3, req)
396 return HTTPSHandlerV3(**kwargs)
397 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
398 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
399 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
400 if opts_no_check_certificate:
401 context.verify_mode = ssl.CERT_NONE
402 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
404 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
405 context.verify_mode = (ssl.CERT_NONE
406 if opts_no_check_certificate
407 else ssl.CERT_REQUIRED)
408 context.set_default_verify_paths()
410 context.load_default_certs()
411 except AttributeError:
413 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
415 class ExtractorError(Exception):
416 """Error during info extraction."""
417 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
418 """ tb, if given, is the original traceback (so that it can be printed out).
419 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
422 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
424 if video_id is not None:
425 msg = video_id + ': ' + msg
427 msg += u' (caused by %r)' % cause
429 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
430 super(ExtractorError, self).__init__(msg)
433 self.exc_info = sys.exc_info() # preserve original exception
435 self.video_id = video_id
437 def format_traceback(self):
438 if self.traceback is None:
440 return u''.join(traceback.format_tb(self.traceback))
443 class RegexNotFoundError(ExtractorError):
444 """Error when a regex didn't match"""
448 class DownloadError(Exception):
449 """Download Error exception.
451 This exception may be thrown by FileDownloader objects if they are not
452 configured to continue on errors. They will contain the appropriate
455 def __init__(self, msg, exc_info=None):
456 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
457 super(DownloadError, self).__init__(msg)
458 self.exc_info = exc_info
461 class SameFileError(Exception):
462 """Same File exception.
464 This exception will be thrown by FileDownloader objects if they detect
465 multiple files would have to be downloaded to the same file on disk.
470 class PostProcessingError(Exception):
471 """Post Processing exception.
473 This exception may be raised by PostProcessor's .run() method to
474 indicate an error in the postprocessing task.
476 def __init__(self, msg):
479 class MaxDownloadsReached(Exception):
480 """ --max-downloads limit has been reached. """
484 class UnavailableVideoError(Exception):
485 """Unavailable Format exception.
487 This exception will be thrown when a video is requested
488 in a format that is not available for that video.
493 class ContentTooShortError(Exception):
494 """Content Too Short exception.
496 This exception may be raised by FileDownloader objects when a file they
497 download is too small for what the server announced first, indicating
498 the connection was probably interrupted.
504 def __init__(self, downloaded, expected):
505 self.downloaded = downloaded
506 self.expected = expected
508 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
509 """Handler for HTTP requests and responses.
511 This class, when installed with an OpenerDirector, automatically adds
512 the standard headers to every HTTP request and handles gzipped and
513 deflated responses from web servers. If compression is to be avoided in
514 a particular request, the original request in the program code only has
515 to include the HTTP header "Youtubedl-No-Compression", which will be
516 removed before making the real request.
518 Part of this code was copied from:
520 http://techknack.net/python-urllib2-handlers/
522 Andrew Rowls, the author of that code, agreed to release it to the
529 return zlib.decompress(data, -zlib.MAX_WBITS)
531 return zlib.decompress(data)
534 def addinfourl_wrapper(stream, headers, url, code):
535 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
536 return compat_urllib_request.addinfourl(stream, headers, url, code)
537 ret = compat_urllib_request.addinfourl(stream, headers, url)
541 def http_request(self, req):
542 for h, v in std_headers.items():
543 if h not in req.headers:
545 if 'Youtubedl-no-compression' in req.headers:
546 if 'Accept-encoding' in req.headers:
547 del req.headers['Accept-encoding']
548 del req.headers['Youtubedl-no-compression']
549 if 'Youtubedl-user-agent' in req.headers:
550 if 'User-agent' in req.headers:
551 del req.headers['User-agent']
552 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
553 del req.headers['Youtubedl-user-agent']
555 if sys.version_info < (2, 7) and '#' in req.get_full_url():
556 # Python 2.6 is brain-dead when it comes to fragments
557 req._Request__original = req._Request__original.partition('#')[0]
558 req._Request__r_type = req._Request__r_type.partition('#')[0]
562 def http_response(self, req, resp):
565 if resp.headers.get('Content-encoding', '') == 'gzip':
566 content = resp.read()
567 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
569 uncompressed = io.BytesIO(gz.read())
570 except IOError as original_ioerror:
571 # There may be junk at the end of the file
572 # See http://stackoverflow.com/q/4928560/35070 for details
573 for i in range(1, 1024):
575 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
576 uncompressed = io.BytesIO(gz.read())
581 raise original_ioerror
582 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
583 resp.msg = old_resp.msg
585 if resp.headers.get('Content-encoding', '') == 'deflate':
586 gz = io.BytesIO(self.deflate(resp.read()))
587 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
588 resp.msg = old_resp.msg
591 https_request = http_request
592 https_response = http_response
595 def parse_iso8601(date_str, delimiter='T'):
596 """ Return a UNIX timestamp from the given date """
602 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
605 timezone = datetime.timedelta()
607 date_str = date_str[:-len(m.group(0))]
608 if not m.group('sign'):
609 timezone = datetime.timedelta()
611 sign = 1 if m.group('sign') == '+' else -1
612 timezone = datetime.timedelta(
613 hours=sign * int(m.group('hours')),
614 minutes=sign * int(m.group('minutes')))
615 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
616 dt = datetime.datetime.strptime(date_str, date_format) - timezone
617 return calendar.timegm(dt.timetuple())
620 def unified_strdate(date_str):
621 """Return a string with the date in the format YYYYMMDD"""
628 date_str = date_str.replace(',', ' ')
629 # %z (UTC offset) is only supported in python>=3.2
630 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
631 format_expressions = [
636 '%b %dst %Y %I:%M%p',
637 '%b %dnd %Y %I:%M%p',
638 '%b %dth %Y %I:%M%p',
647 '%Y-%m-%d %H:%M:%S.%f',
650 '%Y-%m-%dT%H:%M:%SZ',
651 '%Y-%m-%dT%H:%M:%S.%fZ',
652 '%Y-%m-%dT%H:%M:%S.%f0Z',
654 '%Y-%m-%dT%H:%M:%S.%f',
657 for expression in format_expressions:
659 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
662 if upload_date is None:
663 timetuple = email.utils.parsedate_tz(date_str)
665 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
668 def determine_ext(url, default_ext=u'unknown_video'):
671 guess = url.partition(u'?')[0].rpartition(u'.')[2]
672 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name that belongs to *filename*.

    Everything after the last '.' in *filename* is dropped (the name is kept
    whole when it contains no dot) and '.<sub_lang>.<sub_format>' is
    appended to what remains.
    """
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
680 def date_from_str(date_str):
682 Return a datetime object from a string in the format YYYYMMDD or
683 (now|today)[+-][0-9](day|week|month|year)(s)?"""
684 today = datetime.date.today()
685 if date_str == 'now'or date_str == 'today':
687 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
688 if match is not None:
689 sign = match.group('sign')
690 time = int(match.group('time'))
693 unit = match.group('unit')
702 delta = datetime.timedelta(**{unit: time})
704 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
706 def hyphenate_date(date_str):
708 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
709 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
710 if match is not None:
711 return '-'.join(match.groups())
715 class DateRange(object):
716 """Represents a time interval between two dates"""
717 def __init__(self, start=None, end=None):
718 """start and end must be strings in the format accepted by date"""
719 if start is not None:
720 self.start = date_from_str(start)
722 self.start = datetime.datetime.min.date()
724 self.end = date_from_str(end)
726 self.end = datetime.datetime.max.date()
727 if self.start > self.end:
728 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
731 """Returns a range that only contains the given day"""
733 def __contains__(self, date):
734 """Check if the date is in the range"""
735 if not isinstance(date, datetime.date):
736 date = date_from_str(date)
737 return self.start <= date <= self.end
739 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
743 """ Returns the platform name as a compat_str """
744 res = platform.platform()
745 if isinstance(res, bytes):
746 res = res.decode(preferredencoding())
748 assert isinstance(res, compat_str)
752 def _windows_write_string(s, out):
753 """ Returns True if the string was written using special methods,
754 False if it has yet to be written out."""
755 # Adapted from http://stackoverflow.com/a/3259271/35070
758 import ctypes.wintypes
766 fileno = out.fileno()
767 except AttributeError:
768 # If the output stream doesn't have a fileno, it's virtual
770 if fileno not in WIN_OUTPUT_IDS:
773 GetStdHandle = ctypes.WINFUNCTYPE(
774 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
775 ("GetStdHandle", ctypes.windll.kernel32))
776 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
778 WriteConsoleW = ctypes.WINFUNCTYPE(
779 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
780 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
781 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
782 written = ctypes.wintypes.DWORD(0)
784 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
785 FILE_TYPE_CHAR = 0x0002
786 FILE_TYPE_REMOTE = 0x8000
787 GetConsoleMode = ctypes.WINFUNCTYPE(
788 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
789 ctypes.POINTER(ctypes.wintypes.DWORD))(
790 ("GetConsoleMode", ctypes.windll.kernel32))
791 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
793 def not_a_console(handle):
794 if handle == INVALID_HANDLE_VALUE or handle is None:
796 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
797 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
802 def next_nonbmp_pos(s):
804 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
805 except StopIteration:
809 count = min(next_nonbmp_pos(s), 1024)
812 h, s, count if count else 2, ctypes.byref(written), None)
814 raise OSError('Failed to write string')
815 if not count: # We just wrote a non-BMP character
816 assert written.value == 2
819 assert written.value > 0
820 s = s[written.value:]
824 def write_string(s, out=None, encoding=None):
827 assert type(s) == compat_str
829 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
830 if _windows_write_string(s, out):
833 if ('b' in getattr(out, 'mode', '') or
834 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
835 byt = s.encode(encoding or preferredencoding(), 'ignore')
837 elif hasattr(out, 'buffer'):
838 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
839 byt = s.encode(enc, 'ignore')
840 out.buffer.write(byt)
846 def bytes_to_intlist(bs):
849 if isinstance(bs[0], int): # Python 3
852 return [ord(c) for c in bs]
855 def intlist_to_bytes(xs):
858 return struct_pack('%dB' % len(xs), *xs)
861 # Cross-platform file locking
862 if sys.platform == 'win32':
863 import ctypes.wintypes
866 class OVERLAPPED(ctypes.Structure):
868 ('Internal', ctypes.wintypes.LPVOID),
869 ('InternalHigh', ctypes.wintypes.LPVOID),
870 ('Offset', ctypes.wintypes.DWORD),
871 ('OffsetHigh', ctypes.wintypes.DWORD),
872 ('hEvent', ctypes.wintypes.HANDLE),
875 kernel32 = ctypes.windll.kernel32
876 LockFileEx = kernel32.LockFileEx
877 LockFileEx.argtypes = [
878 ctypes.wintypes.HANDLE, # hFile
879 ctypes.wintypes.DWORD, # dwFlags
880 ctypes.wintypes.DWORD, # dwReserved
881 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
882 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
883 ctypes.POINTER(OVERLAPPED) # Overlapped
885 LockFileEx.restype = ctypes.wintypes.BOOL
886 UnlockFileEx = kernel32.UnlockFileEx
887 UnlockFileEx.argtypes = [
888 ctypes.wintypes.HANDLE, # hFile
889 ctypes.wintypes.DWORD, # dwReserved
890 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
891 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
892 ctypes.POINTER(OVERLAPPED) # Overlapped
894 UnlockFileEx.restype = ctypes.wintypes.BOOL
895 whole_low = 0xffffffff
896 whole_high = 0x7fffffff
898 def _lock_file(f, exclusive):
899 overlapped = OVERLAPPED()
900 overlapped.Offset = 0
901 overlapped.OffsetHigh = 0
902 overlapped.hEvent = 0
903 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
904 handle = msvcrt.get_osfhandle(f.fileno())
905 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
906 whole_low, whole_high, f._lock_file_overlapped_p):
907 raise OSError('Locking file failed: %r' % ctypes.FormatError())
910 assert f._lock_file_overlapped_p
911 handle = msvcrt.get_osfhandle(f.fileno())
912 if not UnlockFileEx(handle, 0,
913 whole_low, whole_high, f._lock_file_overlapped_p):
914 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
919 def _lock_file(f, exclusive):
920 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
923 fcntl.flock(f, fcntl.LOCK_UN)
926 class locked_file(object):
927 def __init__(self, filename, mode, encoding=None):
928 assert mode in ['r', 'a', 'w']
929 self.f = io.open(filename, mode, encoding=encoding)
933 exclusive = self.mode != 'r'
935 _lock_file(self.f, exclusive)
941 def __exit__(self, etype, value, traceback):
950 def write(self, *args):
951 return self.f.write(*args)
953 def read(self, *args):
954 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
962 def shell_quote(args):
964 encoding = get_filesystem_encoding()
966 if isinstance(a, bytes):
967 # We may get a filename encoded with 'encodeFilename'
968 a = a.decode(encoding)
969 quoted_args.append(pipes.quote(a))
970 return u' '.join(quoted_args)
973 def takewhile_inclusive(pred, seq):
974 """ Like itertools.takewhile, but include the latest evaluated element
975 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach *data* to *url* so it can be passed along internally.

    *data* is serialized to JSON, url-encoded under the key
    '__youtubedl_smuggle' and appended to *url* after a '#'.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    encoded_payload = compat_urllib_parse.urlencode(payload)
    return url + u'#' + encoded_payload
990 def unsmuggle_url(smug_url, default=None):
991 if not '#__youtubedl_smuggle' in smug_url:
992 return smug_url, default
993 url, _, sdata = smug_url.rpartition(u'#')
994 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
995 data = json.loads(jsond)
999 def format_bytes(bytes):
1002 if type(bytes) is str:
1003 bytes = float(bytes)
1007 exponent = int(math.log(bytes, 1024.0))
1008 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1009 converted = float(bytes) / float(1024 ** exponent)
1010 return u'%.2f%s' % (converted, suffix)
1013 def get_term_width():
1014 columns = compat_getenv('COLUMNS', None)
1019 sp = subprocess.Popen(
1021 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1022 out, err = sp.communicate()
1023 return int(out.split()[1])
1029 def month_by_name(name):
1030 """ Return the number of a month by (locale-independently) English name """
1033 u'January', u'February', u'March', u'April', u'May', u'June',
1034 u'July', u'August', u'September', u'October', u'November', u'December']
1036 return ENGLISH_NAMES.index(name) + 1
1041 def fix_xml_ampersands(xml_str):
1042 """Replace all the '&' by '&amp;' in XML"""
1044 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1049 def setproctitle(title):
1050 assert isinstance(title, compat_str)
1052 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1055 title_bytes = title.encode('utf-8')
1056 buf = ctypes.create_string_buffer(len(title_bytes))
1057 buf.value = title_bytes
1059 libc.prctl(15, buf, 0, 0, 0)
1060 except AttributeError:
1061 return # Strange libc, just skip this
1064 def remove_start(s, start):
1065 if s.startswith(start):
1066 return s[len(start):]
1070 def remove_end(s, end):
1072 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated segment of the path component of *url*.

    Leading and trailing slashes of the path are stripped before splitting,
    so a trailing '/' does not produce an empty result.
    """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1081 class HEADRequest(compat_urllib_request.Request):
1082 def get_method(self):
1086 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1089 v = getattr(v, get_attr, None)
1092 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* with compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1099 def str_to_int(int_str):
1100 """ A more relaxed version of int_or_none """
1103 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a scaled float, or return *default* when *v* is None.

    The result is float(v) * invscale / scale.  A value that float() cannot
    convert raises whatever float() raises.
    """
    if v is None:
        return default
    # Keep multiply-then-divide order so rounding matches exactly.
    return float(v) * invscale / scale
1111 def parse_duration(s):
1118 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1121 res = int(m.group('secs'))
1123 res += int(m.group('mins')) * 60
1124 if m.group('hours'):
1125 res += int(m.group('hours')) * 60 * 60
1127 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    The name is split with os.path.splitext(); e.g. 'video.mp4' with ext
    'temp' becomes 'video.temp.mp4'.  A name without an extension simply
    gets '.<ext>' appended.
    """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1136 def check_executable(exe, args=[]):
1137 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1138 args can be a list of arguments for a short output (like -version) """
1140 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1146 def get_exe_version(exe, args=['--version'],
1147 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1148 unrecognized=u'present'):
1149 """ Returns the version of the specified executable,
1150 or False if the executable is not present """
1152 out, err = subprocess.Popen(
1154 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1157 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1158 m = re.search(version_re, firstline)
1165 class PagedList(object):
1167 # This is only useful for tests
1168 return len(self.getslice())
1171 class OnDemandPagedList(PagedList):
1172 def __init__(self, pagefunc, pagesize):
1173 self._pagefunc = pagefunc
1174 self._pagesize = pagesize
1176 def getslice(self, start=0, end=None):
1178 for pagenum in itertools.count(start // self._pagesize):
1179 firstid = pagenum * self._pagesize
1180 nextfirstid = pagenum * self._pagesize + self._pagesize
1181 if start >= nextfirstid:
1184 page_results = list(self._pagefunc(pagenum))
1187 start % self._pagesize
1188 if firstid <= start < nextfirstid
1192 ((end - 1) % self._pagesize) + 1
1193 if (end is not None and firstid <= end <= nextfirstid)
1196 if startv != 0 or endv is not None:
1197 page_results = page_results[startv:endv]
1198 res.extend(page_results)
1200 # A little optimization - if current page is not "full", ie. does
1201 # not contain page_size videos then we can assume that this page
1202 # is the last one - there are no more ids on further pages -
1203 # i.e. no need to query again.
1204 if len(page_results) + startv < self._pagesize:
1207 # If we got the whole page, but the next page is not interesting,
1208 # break out early as well
1209 if end == nextfirstid:
1214 class InAdvancePagedList(PagedList):
1215 def __init__(self, pagefunc, pagecount, pagesize):
1216 self._pagefunc = pagefunc
1217 self._pagecount = pagecount
1218 self._pagesize = pagesize
1220 def getslice(self, start=0, end=None):
1222 start_page = start // self._pagesize
1224 self._pagecount if end is None else (end // self._pagesize + 1))
1225 skip_elems = start - start_page * self._pagesize
1226 only_more = None if end is None else end - start
1227 for pagenum in range(start_page, end_page):
1228 page = list(self._pagefunc(pagenum))
1230 page = page[skip_elems:]
1232 if only_more is not None:
1233 if len(page) < only_more:
1234 only_more -= len(page)
1236 page = page[:only_more]
1243 def uppercase_escape(s):
1244 unicode_escape = codecs.getdecoder('unicode_escape')
1246 r'\\U[0-9a-fA-F]{8}',
1247 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Percent-escape non-ASCII characters in *s* as suggested by RFC 3986.

    On Python 2 a unicode string is encoded to UTF-8 bytes before quoting.
    The characters listed in the second argument to quote() are left
    unescaped.
    """
    # The version test must come first: the name `unicode` only exists on
    # Python 2, so short-circuiting keeps Python 3 from evaluating it.
    must_encode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if must_encode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1258 def escape_url(url):
1259 """Escape URL as suggested by RFC 3986"""
1260 url_parsed = compat_urllib_parse_urlparse(url)
1261 return url_parsed._replace(
1262 path=escape_rfc3986(url_parsed.path),
1263 params=escape_rfc3986(url_parsed.params),
1264 query=escape_rfc3986(url_parsed.query),
1265 fragment=escape_rfc3986(url_parsed.fragment)
1269 struct.pack(u'!I', 0)
1271 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1272 def struct_pack(spec, *args):
1273 if isinstance(spec, compat_str):
1274 spec = spec.encode('ascii')
1275 return struct.pack(spec, *args)
1277 def struct_unpack(spec, *args):
1278 if isinstance(spec, compat_str):
1279 spec = spec.encode('ascii')
1280 return struct.unpack(spec, *args)
1282 struct_pack = struct.pack
1283 struct_unpack = struct.unpack
1286 def read_batch_urls(batch_fd):
1288 if not isinstance(url, compat_str):
1289 url = url.decode('utf-8', 'replace')
1290 BOM_UTF8 = u'\xef\xbb\xbf'
1291 if url.startswith(BOM_UTF8):
1292 url = url[len(BOM_UTF8):]
1294 if url.startswith(('#', ';', ']')):
1298 with contextlib.closing(batch_fd) as fd:
1299 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """Url-encode the given data and return it as ASCII-encoded bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode().
    """
    query_string = compat_urllib_parse.urlencode(*args, **kargs)
    return query_string.encode('ascii')
1307 etree_iter = xml.etree.ElementTree.Element.iter
1308 except AttributeError: # Python <=2.6
1309 etree_iter = lambda n: n.findall('.//*')
1313 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1314 def doctype(self, name, pubid, system):
1315 pass # Ignore doctypes
1317 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1318 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1319 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1320 # Fix up XML parser in Python 2.x
1321 if sys.version_info < (3, 0):
1322 for n in etree_iter(tree):
1323 if n.text is not None:
1324 if not isinstance(n.text, compat_str):
1325 n.text = n.text.decode('utf-8')
1338 def parse_age_limit(s):
1341 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1342 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1345 def strip_jsonp(code):
1347 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1350 def js_to_json(code):
1353 if v in ('true', 'false', 'null'):
1355 if v.startswith('"'):
1357 if v.startswith("'"):
1359 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1366 res = re.sub(r'''(?x)
1367 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1368 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1369 [a-zA-Z_][a-zA-Z_0-9]*
1371 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1375 def qualities(quality_ids):
1376 """ Get a numeric quality value out of a list of possible values """
1379 return quality_ids.index(qid)
1385 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1388 def limit_length(s, length):
1389 """ Add ellipses to overly long strings """
1394 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split the dotted version string *v* into its integer components.

    Despite the name, the result is a list of ints (e.g. '2014.11.27' ->
    [2014, 11, 27]).  A component that int() cannot parse raises ValueError.
    """
    return list(map(int, v.split('.')))
1402 def is_outdated_version(version, limit, assume_new=True):
1404 return not assume_new
1406 return version_tuple(version) < version_tuple(limit)
1408 return not assume_new