2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
47 # This is not clearly defined otherwise
48 compiled_regex_type = type(re.compile(''))
51 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
52 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
53 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 'Accept-Encoding': 'gzip, deflate',
55 'Accept-Language': 'en-us,en;q=0.5',
58 def preferredencoding():
59 """Get preferred encoding.
61 Returns the best encoding scheme for the system, based on
62 locale.getpreferredencoding() and some further tweaks.
65 pref = locale.getpreferredencoding()
73 def write_json_file(obj, fn):
74 """ Encode obj as JSON and write it to fn, atomically """
78 'prefix': os.path.basename(fn) + '.',
79 'dir': os.path.dirname(fn),
83 # In Python 2.x, json.dump expects a bytestream.
84 # In Python 3.x, it writes to a character stream
85 if sys.version_info < (3, 0):
93 tf = tempfile.NamedTemporaryFile(**args)
98 os.rename(tf.name, fn)
107 if sys.version_info >= (2, 7):
108 def find_xpath_attr(node, xpath, key, val):
109 """ Find the xpath xpath[@key=val] """
110 assert re.match(r'^[a-zA-Z-]+$', key)
111 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
112 expr = xpath + u"[@%s='%s']" % (key, val)
113 return node.find(expr)
115 def find_xpath_attr(node, xpath, key, val):
116 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
117 # .//node does not match if a node is a direct child of . !
118 if isinstance(xpath, unicode):
119 xpath = xpath.encode('ascii')
121 for f in node.findall(xpath):
122 if f.attrib.get(key) == val:
126 # On python2.6 the xml.etree.ElementTree.Element methods don't support
127 # the namespace parameter
128 def xpath_with_ns(path, ns_map):
129 components = [c.split(':') for c in path.split('/')]
133 replaced.append(c[0])
136 replaced.append('{%s}%s' % (ns_map[ns], tag))
137 return '/'.join(replaced)
140 def xpath_text(node, xpath, name=None, fatal=False):
141 if sys.version_info < (2, 7): # Crazy 2.6
142 xpath = xpath.encode('ascii')
147 name = xpath if name is None else name
148 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag in *html* whose "id" attribute equals *id*.

    Thin convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute() with the attribute name fixed to "id".
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
159 def get_element_by_attribute(attribute, value, html):
160 """Return the content of the tag with the specified attribute in the passed HTML document"""
162 m = re.search(r'''(?xs)
164 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
166 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
170 ''' % (re.escape(attribute), re.escape(value)), html)
174 res = m.group('content')
176 if res.startswith('"') or res.startswith("'"):
179 return unescapeHTML(res)
182 def clean_html(html):
183 """Clean an HTML snippet into a readable string"""
185 html = html.replace('\n', ' ')
186 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
187 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
189 html = re.sub('<.*?>', '', html)
190 # Replace html entities
191 html = unescapeHTML(html)
195 def sanitize_open(filename, open_mode):
196 """Try to open the given filename, and slightly tweak it if this fails.
198 Attempts to open the given filename. If this fails, it tries to change
199 the filename slightly, step by step, until it's either able to open it
200 or it fails and raises a final exception, like the standard open()
203 It returns the tuple (stream, definitive_file_name).
207 if sys.platform == 'win32':
209 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
210 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
211 stream = open(encodeFilename(filename), open_mode)
212 return (stream, filename)
213 except (IOError, OSError) as err:
214 if err.errno in (errno.EACCES,):
217 # In case of error, try to remove win32 forbidden chars
218 alt_filename = os.path.join(
219 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
220 for path_part in os.path.split(filename)
222 if alt_filename == filename:
225 # An exception here should be caught in the caller
226 stream = open(encodeFilename(filename), open_mode)
227 return (stream, alt_filename)
230 def timeconvert(timestr):
231 """Convert RFC 2822 defined time string into system timestamp"""
233 timetuple = email.utils.parsedate_tz(timestr)
234 if timetuple is not None:
235 timestamp = email.utils.mktime_tz(timetuple)
238 def sanitize_filename(s, restricted=False, is_id=False):
239 """Sanitizes a string so it could be used as part of a filename.
240 If restricted is set, use a stricter subset of allowed characters.
241 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
243 def replace_insane(char):
244 if char == '?' or ord(char) < 32 or ord(char) == 127:
247 return '' if restricted else '\''
249 return '_-' if restricted else ' -'
250 elif char in '\\/|*<>':
252 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
254 if restricted and ord(char) > 127:
258 result = u''.join(map(replace_insane, s))
260 while '__' in result:
261 result = result.replace('__', '_')
262 result = result.strip('_')
263 # Common case of "Foreign band name - English song title"
264 if restricted and result.startswith('-_'):
270 def orderedSet(iterable):
271 """ Remove all duplicates from the input iterable """
279 def _htmlentity_transform(entity):
280 """Transforms an HTML entity to a character."""
281 # Known non-numeric HTML entity
282 if entity in compat_html_entities.name2codepoint:
283 return compat_chr(compat_html_entities.name2codepoint[entity])
285 mobj = re.match(r'#(x?[0-9]+)', entity)
287 numstr = mobj.group(1)
288 if numstr.startswith(u'x'):
290 numstr = u'0%s' % numstr
293 return compat_chr(int(numstr, base))
295 # Unknown entity in name, return its literal representation
296 return (u'&%s;' % entity)
302 assert type(s) == compat_str
305 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
308 def encodeFilename(s, for_subprocess=False):
310 @param s The name of the file
313 assert type(s) == compat_str
315 # Python 3 has a Unicode API
316 if sys.version_info >= (3, 0):
319 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
320 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
321 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
322 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
323 if not for_subprocess:
326 # For subprocess calls, encode with locale encoding
327 # Refer to http://stackoverflow.com/a/9951851/35070
328 encoding = preferredencoding()
330 encoding = sys.getfilesystemencoding()
333 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode the argument *s* via encodeFilename() in its subprocess mode.

    Values that are not compat_str (byte strings from legacy code) are first
    decoded as ASCII; the resulting text is then passed to encodeFilename()
    with for_subprocess enabled.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code still hands over byte strings.
        # TODO: once all post processors are fixed, turn this branch into an
        # internal error instead of silently decoding.
        text = s.decode('ascii')
    return encodeFilename(text, for_subprocess=True)
345 def decodeOption(optval):
348 if isinstance(optval, bytes):
349 optval = optval.decode(preferredencoding())
351 assert isinstance(optval, compat_str)
354 def formatSeconds(secs):
356 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
358 return '%d:%02d' % (secs // 60, secs % 60)
363 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
364 if sys.version_info < (3, 2):
367 class HTTPSConnectionV3(httplib.HTTPSConnection):
368 def __init__(self, *args, **kwargs):
369 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
372 sock = socket.create_connection((self.host, self.port), self.timeout)
373 if getattr(self, '_tunnel_host', False):
377 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
379 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
381 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
382 def https_open(self, req):
383 return self.do_open(HTTPSConnectionV3, req)
384 return HTTPSHandlerV3(**kwargs)
385 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
386 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
387 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
388 if opts_no_check_certificate:
389 context.verify_mode = ssl.CERT_NONE
390 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
392 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
393 context.verify_mode = (ssl.CERT_NONE
394 if opts_no_check_certificate
395 else ssl.CERT_REQUIRED)
396 context.set_default_verify_paths()
398 context.load_default_certs()
399 except AttributeError:
401 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
403 class ExtractorError(Exception):
404 """Error during info extraction."""
405 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
406 """ tb, if given, is the original traceback (so that it can be printed out).
407 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
410 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
412 if video_id is not None:
413 msg = video_id + ': ' + msg
415 msg += u' (caused by %r)' % cause
417 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
418 super(ExtractorError, self).__init__(msg)
421 self.exc_info = sys.exc_info() # preserve original exception
423 self.video_id = video_id
425 def format_traceback(self):
426 if self.traceback is None:
428 return u''.join(traceback.format_tb(self.traceback))
431 class RegexNotFoundError(ExtractorError):
432 """Error when a regex didn't match"""
436 class DownloadError(Exception):
437 """Download Error exception.
439 This exception may be thrown by FileDownloader objects if they are not
440 configured to continue on errors. They will contain the appropriate
443 def __init__(self, msg, exc_info=None):
444 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
445 super(DownloadError, self).__init__(msg)
446 self.exc_info = exc_info
449 class SameFileError(Exception):
450 """Same File exception.
452 This exception will be thrown by FileDownloader objects if they detect
453 multiple files would have to be downloaded to the same file on disk.
458 class PostProcessingError(Exception):
459 """Post Processing exception.
461 This exception may be raised by PostProcessor's .run() method to
462 indicate an error in the postprocessing task.
464 def __init__(self, msg):
467 class MaxDownloadsReached(Exception):
468 """ --max-downloads limit has been reached. """
472 class UnavailableVideoError(Exception):
473 """Unavailable Format exception.
475 This exception will be thrown when a video is requested
476 in a format that is not available for that video.
481 class ContentTooShortError(Exception):
482 """Content Too Short exception.
484 This exception may be raised by FileDownloader objects when a file they
485 download is too small for what the server announced first, indicating
486 the connection was probably interrupted.
492 def __init__(self, downloaded, expected):
493 self.downloaded = downloaded
494 self.expected = expected
496 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
497 """Handler for HTTP requests and responses.
499 This class, when installed with an OpenerDirector, automatically adds
500 the standard headers to every HTTP request and handles gzipped and
501 deflated responses from web servers. If compression is to be avoided in
502 a particular request, the original request in the program code only has
503 to include the HTTP header "Youtubedl-No-Compression", which will be
504 removed before making the real request.
506 Part of this code was copied from:
508 http://techknack.net/python-urllib2-handlers/
510 Andrew Rowls, the author of that code, agreed to release it to the
517 return zlib.decompress(data, -zlib.MAX_WBITS)
519 return zlib.decompress(data)
522 def addinfourl_wrapper(stream, headers, url, code):
523 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
524 return compat_urllib_request.addinfourl(stream, headers, url, code)
525 ret = compat_urllib_request.addinfourl(stream, headers, url)
529 def http_request(self, req):
530 for h, v in std_headers.items():
531 if h not in req.headers:
533 if 'Youtubedl-no-compression' in req.headers:
534 if 'Accept-encoding' in req.headers:
535 del req.headers['Accept-encoding']
536 del req.headers['Youtubedl-no-compression']
537 if 'Youtubedl-user-agent' in req.headers:
538 if 'User-agent' in req.headers:
539 del req.headers['User-agent']
540 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
541 del req.headers['Youtubedl-user-agent']
543 if sys.version_info < (2, 7) and '#' in req.get_full_url():
544 # Python 2.6 is brain-dead when it comes to fragments
545 req._Request__original = req._Request__original.partition('#')[0]
546 req._Request__r_type = req._Request__r_type.partition('#')[0]
550 def http_response(self, req, resp):
553 if resp.headers.get('Content-encoding', '') == 'gzip':
554 content = resp.read()
555 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
557 uncompressed = io.BytesIO(gz.read())
558 except IOError as original_ioerror:
559 # There may be junk at the end of the file
560 # See http://stackoverflow.com/q/4928560/35070 for details
561 for i in range(1, 1024):
563 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
564 uncompressed = io.BytesIO(gz.read())
569 raise original_ioerror
570 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
571 resp.msg = old_resp.msg
573 if resp.headers.get('Content-encoding', '') == 'deflate':
574 gz = io.BytesIO(self.deflate(resp.read()))
575 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
576 resp.msg = old_resp.msg
579 https_request = http_request
580 https_response = http_response
583 def parse_iso8601(date_str, delimiter='T'):
584 """ Return a UNIX timestamp from the given date """
590 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
593 timezone = datetime.timedelta()
595 date_str = date_str[:-len(m.group(0))]
596 if not m.group('sign'):
597 timezone = datetime.timedelta()
599 sign = 1 if m.group('sign') == '+' else -1
600 timezone = datetime.timedelta(
601 hours=sign * int(m.group('hours')),
602 minutes=sign * int(m.group('minutes')))
603 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
604 dt = datetime.datetime.strptime(date_str, date_format) - timezone
605 return calendar.timegm(dt.timetuple())
608 def unified_strdate(date_str):
609 """Return a string with the date in the format YYYYMMDD"""
616 date_str = date_str.replace(',', ' ')
617 # %z (UTC offset) is only supported in python>=3.2
618 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
619 format_expressions = [
624 '%b %dst %Y %I:%M%p',
625 '%b %dnd %Y %I:%M%p',
626 '%b %dth %Y %I:%M%p',
635 '%Y-%m-%d %H:%M:%S.%f',
638 '%Y-%m-%dT%H:%M:%SZ',
639 '%Y-%m-%dT%H:%M:%S.%fZ',
640 '%Y-%m-%dT%H:%M:%S.%f0Z',
642 '%Y-%m-%dT%H:%M:%S.%f',
645 for expression in format_expressions:
647 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
650 if upload_date is None:
651 timetuple = email.utils.parsedate_tz(date_str)
653 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
656 def determine_ext(url, default_ext=u'unknown_video'):
659 guess = url.partition(u'?')[0].rpartition(u'.')[2]
660 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename that belongs to a media filename.

    The extension of *filename* (if any) is replaced by
    ``.<sub_lang>.<sub_format>``.

    os.path.splitext() is used so that only an extension of the final path
    component is removed; a dot inside a directory name (for example
    ``dir.v2/video``) no longer truncates the path.
    """
    base = os.path.splitext(filename)[0]
    return base + u'.' + sub_lang + u'.' + sub_format
668 def date_from_str(date_str):
670 Return a datetime object from a string in the format YYYYMMDD or
671 (now|today)[+-][0-9](day|week|month|year)(s)?"""
672 today = datetime.date.today()
673 if date_str == 'now'or date_str == 'today':
675 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
676 if match is not None:
677 sign = match.group('sign')
678 time = int(match.group('time'))
681 unit = match.group('unit')
690 delta = datetime.timedelta(**{unit: time})
692 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
694 def hyphenate_date(date_str):
696 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
697 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
698 if match is not None:
699 return '-'.join(match.groups())
703 class DateRange(object):
704 """Represents a time interval between two dates"""
705 def __init__(self, start=None, end=None):
706 """start and end must be strings in the format accepted by date"""
707 if start is not None:
708 self.start = date_from_str(start)
710 self.start = datetime.datetime.min.date()
712 self.end = date_from_str(end)
714 self.end = datetime.datetime.max.date()
715 if self.start > self.end:
716 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
719 """Returns a range that only contains the given day"""
721 def __contains__(self, date):
722 """Check if the date is in the range"""
723 if not isinstance(date, datetime.date):
724 date = date_from_str(date)
725 return self.start <= date <= self.end
727 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
731 """ Returns the platform name as a compat_str """
732 res = platform.platform()
733 if isinstance(res, bytes):
734 res = res.decode(preferredencoding())
736 assert isinstance(res, compat_str)
740 def _windows_write_string(s, out):
741 """ Returns True if the string was written using special methods,
742 False if it has yet to be written out."""
743 # Adapted from http://stackoverflow.com/a/3259271/35070
746 import ctypes.wintypes
754 fileno = out.fileno()
755 except AttributeError:
756 # If the output stream doesn't have a fileno, it's virtual
758 if fileno not in WIN_OUTPUT_IDS:
761 GetStdHandle = ctypes.WINFUNCTYPE(
762 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
763 ("GetStdHandle", ctypes.windll.kernel32))
764 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
766 WriteConsoleW = ctypes.WINFUNCTYPE(
767 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
768 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
769 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
770 written = ctypes.wintypes.DWORD(0)
772 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
773 FILE_TYPE_CHAR = 0x0002
774 FILE_TYPE_REMOTE = 0x8000
775 GetConsoleMode = ctypes.WINFUNCTYPE(
776 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
777 ctypes.POINTER(ctypes.wintypes.DWORD))(
778 ("GetConsoleMode", ctypes.windll.kernel32))
779 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
781 def not_a_console(handle):
782 if handle == INVALID_HANDLE_VALUE or handle is None:
784 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
785 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
790 def next_nonbmp_pos(s):
792 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
793 except StopIteration:
797 count = min(next_nonbmp_pos(s), 1024)
800 h, s, count if count else 2, ctypes.byref(written), None)
802 raise OSError('Failed to write string')
803 if not count: # We just wrote a non-BMP character
804 assert written.value == 2
807 assert written.value > 0
808 s = s[written.value:]
812 def write_string(s, out=None, encoding=None):
815 assert type(s) == compat_str
817 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
818 if _windows_write_string(s, out):
821 if ('b' in getattr(out, 'mode', '') or
822 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
823 byt = s.encode(encoding or preferredencoding(), 'ignore')
825 elif hasattr(out, 'buffer'):
826 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
827 byt = s.encode(enc, 'ignore')
828 out.buffer.write(byt)
834 def bytes_to_intlist(bs):
837 if isinstance(bs[0], int): # Python 3
840 return [ord(c) for c in bs]
843 def intlist_to_bytes(xs):
846 if isinstance(chr(0), bytes): # Python 2
847 return ''.join([chr(x) for x in xs])
852 # Cross-platform file locking
853 if sys.platform == 'win32':
854 import ctypes.wintypes
857 class OVERLAPPED(ctypes.Structure):
859 ('Internal', ctypes.wintypes.LPVOID),
860 ('InternalHigh', ctypes.wintypes.LPVOID),
861 ('Offset', ctypes.wintypes.DWORD),
862 ('OffsetHigh', ctypes.wintypes.DWORD),
863 ('hEvent', ctypes.wintypes.HANDLE),
866 kernel32 = ctypes.windll.kernel32
867 LockFileEx = kernel32.LockFileEx
868 LockFileEx.argtypes = [
869 ctypes.wintypes.HANDLE, # hFile
870 ctypes.wintypes.DWORD, # dwFlags
871 ctypes.wintypes.DWORD, # dwReserved
872 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
873 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
874 ctypes.POINTER(OVERLAPPED) # Overlapped
876 LockFileEx.restype = ctypes.wintypes.BOOL
877 UnlockFileEx = kernel32.UnlockFileEx
878 UnlockFileEx.argtypes = [
879 ctypes.wintypes.HANDLE, # hFile
880 ctypes.wintypes.DWORD, # dwReserved
881 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
882 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
883 ctypes.POINTER(OVERLAPPED) # Overlapped
885 UnlockFileEx.restype = ctypes.wintypes.BOOL
886 whole_low = 0xffffffff
887 whole_high = 0x7fffffff
889 def _lock_file(f, exclusive):
890 overlapped = OVERLAPPED()
891 overlapped.Offset = 0
892 overlapped.OffsetHigh = 0
893 overlapped.hEvent = 0
894 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
895 handle = msvcrt.get_osfhandle(f.fileno())
896 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
897 whole_low, whole_high, f._lock_file_overlapped_p):
898 raise OSError('Locking file failed: %r' % ctypes.FormatError())
901 assert f._lock_file_overlapped_p
902 handle = msvcrt.get_osfhandle(f.fileno())
903 if not UnlockFileEx(handle, 0,
904 whole_low, whole_high, f._lock_file_overlapped_p):
905 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
910 def _lock_file(f, exclusive):
911 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
914 fcntl.flock(f, fcntl.LOCK_UN)
917 class locked_file(object):
918 def __init__(self, filename, mode, encoding=None):
919 assert mode in ['r', 'a', 'w']
920 self.f = io.open(filename, mode, encoding=encoding)
924 exclusive = self.mode != 'r'
926 _lock_file(self.f, exclusive)
932 def __exit__(self, etype, value, traceback):
941 def write(self, *args):
942 return self.f.write(*args)
944 def read(self, *args):
945 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
953 def shell_quote(args):
955 encoding = get_filesystem_encoding()
957 if isinstance(a, bytes):
958 # We may get a filename encoded with 'encodeFilename'
959 a = a.decode(encoding)
960 quoted_args.append(pipes.quote(a))
961 return u' '.join(quoted_args)
964 def takewhile_inclusive(pred, seq):
965 """ Like itertools.takewhile, but include the latest evaluated element
966 (the first element so that Not pred(e)) """
973 def smuggle_url(url, data):
974 """ Pass additional data in a URL for internal use. """
976 sdata = compat_urllib_parse.urlencode(
977 {u'__youtubedl_smuggle': json.dumps(data)})
978 return url + u'#' + sdata
981 def unsmuggle_url(smug_url, default=None):
982 if not '#__youtubedl_smuggle' in smug_url:
983 return smug_url, default
984 url, _, sdata = smug_url.rpartition(u'#')
985 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
986 data = json.loads(jsond)
990 def format_bytes(bytes):
993 if type(bytes) is str:
998 exponent = int(math.log(bytes, 1024.0))
999 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1000 converted = float(bytes) / float(1024 ** exponent)
1001 return u'%.2f%s' % (converted, suffix)
1004 def get_term_width():
1005 columns = compat_getenv('COLUMNS', None)
1010 sp = subprocess.Popen(
1012 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1013 out, err = sp.communicate()
1014 return int(out.split()[1])
1020 def month_by_name(name):
1021 """ Return the number of a month by (locale-independently) English name """
1024 u'January', u'February', u'March', u'April', u'May', u'June',
1025 u'July', u'August', u'September', u'October', u'November', u'December']
1027 return ENGLISH_NAMES.index(name) + 1
1032 def fix_xml_ampersands(xml_str):
1033 """Replace all the '&' by '&amp;' in XML"""
1035 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1040 def setproctitle(title):
1041 assert isinstance(title, compat_str)
1043 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1046 title_bytes = title.encode('utf-8')
1047 buf = ctypes.create_string_buffer(len(title_bytes))
1048 buf.value = title_bytes
1050 libc.prctl(15, buf, 0, 0, 0)
1051 except AttributeError:
1052 return # Strange libc, just skip this
1055 def remove_start(s, start):
1056 if s.startswith(start):
1057 return s[len(start):]
1061 def remove_end(s, end):
1063 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated segment of the path component of *url*.

    Leading and trailing slashes of the path are ignored, so a URL whose
    path ends in '/' still yields its final non-empty segment.
    """
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
1072 class HEADRequest(compat_urllib_request.Request):
1073 def get_method(self):
1077 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1080 v = getattr(v, get_attr, None)
1083 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* with compat_str; return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1090 def str_to_int(int_str):
1091 """ A more relaxed version of int_or_none """
1094 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float scaled by ``invscale / scale``.

    Returns *default* when *v* is None. Any other value is passed to float(),
    so an unconvertible value raises whatever float() raises.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
1102 def parse_duration(s):
1109 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1112 res = int(m.group('secs'))
1114 res += int(m.group('mins')) * 60
1115 if m.group('hours'):
1116 res += int(m.group('hours')) * 60 * 60
1118 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    The filename is split with os.path.splitext(); the result is
    ``<name>.<ext><original extension>``.
    """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
1127 def check_executable(exe, args=[]):
1128 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1129 args can be a list of arguments for a short output (like -version) """
1131 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1137 def get_exe_version(exe, args=['--version'],
1138 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1139 unrecognized=u'present'):
1140 """ Returns the version of the specified executable,
1141 or False if the executable is not present """
1143 out, err = subprocess.Popen(
1145 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1148 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1149 m = re.search(version_re, firstline)
1156 class PagedList(object):
1158 # This is only useful for tests
1159 return len(self.getslice())
1162 class OnDemandPagedList(PagedList):
1163 def __init__(self, pagefunc, pagesize):
1164 self._pagefunc = pagefunc
1165 self._pagesize = pagesize
1167 def getslice(self, start=0, end=None):
1169 for pagenum in itertools.count(start // self._pagesize):
1170 firstid = pagenum * self._pagesize
1171 nextfirstid = pagenum * self._pagesize + self._pagesize
1172 if start >= nextfirstid:
1175 page_results = list(self._pagefunc(pagenum))
1178 start % self._pagesize
1179 if firstid <= start < nextfirstid
1183 ((end - 1) % self._pagesize) + 1
1184 if (end is not None and firstid <= end <= nextfirstid)
1187 if startv != 0 or endv is not None:
1188 page_results = page_results[startv:endv]
1189 res.extend(page_results)
1191 # A little optimization - if current page is not "full", ie. does
1192 # not contain page_size videos then we can assume that this page
1193 # is the last one - there are no more ids on further pages -
1194 # i.e. no need to query again.
1195 if len(page_results) + startv < self._pagesize:
1198 # If we got the whole page, but the next page is not interesting,
1199 # break out early as well
1200 if end == nextfirstid:
1205 class InAdvancePagedList(PagedList):
1206 def __init__(self, pagefunc, pagecount, pagesize):
1207 self._pagefunc = pagefunc
1208 self._pagecount = pagecount
1209 self._pagesize = pagesize
1211 def getslice(self, start=0, end=None):
1213 start_page = start // self._pagesize
1215 self._pagecount if end is None else (end // self._pagesize + 1))
1216 skip_elems = start - start_page * self._pagesize
1217 only_more = None if end is None else end - start
1218 for pagenum in range(start_page, end_page):
1219 page = list(self._pagefunc(pagenum))
1221 page = page[skip_elems:]
1223 if only_more is not None:
1224 if len(page) < only_more:
1225 only_more -= len(page)
1227 page = page[:only_more]
1234 def uppercase_escape(s):
1235 unicode_escape = codecs.getdecoder('unicode_escape')
1237 r'\\U[0-9a-fA-F]{8}',
1238 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986.

    On Python 2 a unicode value is encoded to UTF-8 before quoting. The
    characters listed in the safe set are left unescaped.
    """
    running_py2 = sys.version_info < (3, 0)
    # `unicode` only exists on Python 2; the short-circuit keeps it from
    # being evaluated on Python 3.
    if running_py2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1249 def escape_url(url):
1250 """Escape URL as suggested by RFC 3986"""
1251 url_parsed = compat_urllib_parse_urlparse(url)
1252 return url_parsed._replace(
1253 path=escape_rfc3986(url_parsed.path),
1254 params=escape_rfc3986(url_parsed.params),
1255 query=escape_rfc3986(url_parsed.query),
1256 fragment=escape_rfc3986(url_parsed.fragment)
1260 struct.pack(u'!I', 0)
1262 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1263 def struct_pack(spec, *args):
1264 if isinstance(spec, compat_str):
1265 spec = spec.encode('ascii')
1266 return struct.pack(spec, *args)
1268 def struct_unpack(spec, *args):
1269 if isinstance(spec, compat_str):
1270 spec = spec.encode('ascii')
1271 return struct.unpack(spec, *args)
1273 struct_pack = struct.pack
1274 struct_unpack = struct.unpack
1277 def read_batch_urls(batch_fd):
1279 if not isinstance(url, compat_str):
1280 url = url.decode('utf-8', 'replace')
1281 BOM_UTF8 = u'\xef\xbb\xbf'
1282 if url.startswith(BOM_UTF8):
1283 url = url[len(BOM_UTF8):]
1285 if url.startswith(('#', ';', ']')):
1289 with contextlib.closing(batch_fd) as fd:
1290 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode().
    """
    query = compat_urllib_parse.urlencode(*args, **kargs)
    return query.encode('ascii')
1298 etree_iter = xml.etree.ElementTree.Element.iter
1299 except AttributeError: # Python <=2.6
1300 etree_iter = lambda n: n.findall('.//*')
1304 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1305 def doctype(self, name, pubid, system):
1306 pass # Ignore doctypes
1308 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1309 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1310 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1311 # Fix up XML parser in Python 2.x
1312 if sys.version_info < (3, 0):
1313 for n in etree_iter(tree):
1314 if n.text is not None:
1315 if not isinstance(n.text, compat_str):
1316 n.text = n.text.decode('utf-8')
1329 def parse_age_limit(s):
1332 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1333 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper and return the wrapped payload.

    A leading ``callback(`` and a trailing ``)`` (optionally followed by
    ``;`` and whitespace) are removed. Callback names may contain letters,
    digits, ``_``, ``.`` and ``$``, so namespaced callbacks such as
    ``jQuery.cb(...)`` are handled as well. Input that does not look like
    JSONP is returned unchanged.
    """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*$', r'\1', code)
1340 def js_to_json(code):
1343 if v in ('true', 'false', 'null'):
1345 if v.startswith('"'):
1347 if v.startswith("'"):
1349 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1356 res = re.sub(r'''(?x)
1357 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1358 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1359 [a-zA-Z_][a-zA-Z_0-9]*
1361 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1365 def qualities(quality_ids):
1366 """ Get a numeric quality value out of a list of possible values """
1369 return quality_ids.index(qid)
1375 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1378 def limit_length(s, length):
1379 """ Add ellipses to overly long strings """
1384 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split the dotted version string *v* into a list of ints.

    Despite the name the result is a list; a non-numeric component raises
    ValueError from int().
    """
    return list(map(int, v.split('.')))
1392 def is_outdated_version(version, limit, assume_new=True):
1394 return not assume_new
1396 return version_tuple(version) < version_tuple(limit)
1398 return not assume_new