2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
47 # This is not clearly defined otherwise
48 compiled_regex_type = type(re.compile(''))
51 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
52 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
53 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
54 'Accept-Encoding': 'gzip, deflate',
55 'Accept-Language': 'en-us,en;q=0.5',
58 def preferredencoding():
59 """Get preferred encoding.
61 Returns the best encoding scheme for the system, based on
62 locale.getpreferredencoding() and some further tweaks.
65 pref = locale.getpreferredencoding()
73 def write_json_file(obj, fn):
74 """ Encode obj as JSON and write it to fn, atomically """
78 'prefix': os.path.basename(fn) + '.',
79 'dir': os.path.dirname(fn),
83 # In Python 2.x, json.dump expects a bytestream.
84 # In Python 3.x, it writes to a character stream
85 if sys.version_info < (3, 0):
93 tf = tempfile.NamedTemporaryFile(**args)
98 os.rename(tf.name, fn)
107 if sys.version_info >= (2, 7):
108 def find_xpath_attr(node, xpath, key, val):
109 """ Find the xpath xpath[@key=val] """
110 assert re.match(r'^[a-zA-Z-]+$', key)
111 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
112 expr = xpath + u"[@%s='%s']" % (key, val)
113 return node.find(expr)
115 def find_xpath_attr(node, xpath, key, val):
116 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
117 # .//node does not match if a node is a direct child of . !
118 if isinstance(xpath, unicode):
119 xpath = xpath.encode('ascii')
121 for f in node.findall(xpath):
122 if f.attrib.get(key) == val:
126 # On python2.6 the xml.etree.ElementTree.Element methods don't support
127 # the namespace parameter
128 def xpath_with_ns(path, ns_map):
129 components = [c.split(':') for c in path.split('/')]
133 replaced.append(c[0])
136 replaced.append('{%s}%s' % (ns_map[ns], tag))
137 return '/'.join(replaced)
140 def xpath_text(node, xpath, name=None, fatal=False):
141 if sys.version_info < (2, 7): # Crazy 2.6
142 xpath = xpath.encode('ascii')
147 name = xpath if name is None else name
148 raise ExtractorError('Could not find XML element %s' % name)
def get_element_by_id(id, html):
    """Return the content of the tag in *html* whose ``id`` attribute equals *id*.

    This is a thin convenience wrapper: the lookup itself is delegated to
    get_element_by_attribute() with the attribute name fixed to "id".
    """
    return get_element_by_attribute(attribute="id", value=id, html=html)
159 def get_element_by_attribute(attribute, value, html):
160 """Return the content of the tag with the specified attribute in the passed HTML document"""
162 m = re.search(r'''(?xs)
164 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
166 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
170 ''' % (re.escape(attribute), re.escape(value)), html)
174 res = m.group('content')
176 if res.startswith('"') or res.startswith("'"):
179 return unescapeHTML(res)
182 def clean_html(html):
183 """Clean an HTML snippet into a readable string"""
185 html = html.replace('\n', ' ')
186 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
187 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
189 html = re.sub('<.*?>', '', html)
190 # Replace html entities
191 html = unescapeHTML(html)
195 def sanitize_open(filename, open_mode):
196 """Try to open the given filename, and slightly tweak it if this fails.
198 Attempts to open the given filename. If this fails, it tries to change
199 the filename slightly, step by step, until it's either able to open it
200 or it fails and raises a final exception, like the standard open()
203 It returns the tuple (stream, definitive_file_name).
207 if sys.platform == 'win32':
209 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
210 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
211 stream = open(encodeFilename(filename), open_mode)
212 return (stream, filename)
213 except (IOError, OSError) as err:
214 if err.errno in (errno.EACCES,):
217 # In case of error, try to remove win32 forbidden chars
218 alt_filename = os.path.join(
219 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
220 for path_part in os.path.split(filename)
222 if alt_filename == filename:
225 # An exception here should be caught in the caller
226 stream = open(encodeFilename(filename), open_mode)
227 return (stream, alt_filename)
230 def timeconvert(timestr):
231 """Convert RFC 2822 defined time string into system timestamp"""
233 timetuple = email.utils.parsedate_tz(timestr)
234 if timetuple is not None:
235 timestamp = email.utils.mktime_tz(timetuple)
238 def sanitize_filename(s, restricted=False, is_id=False):
239 """Sanitizes a string so it could be used as part of a filename.
240 If restricted is set, use a stricter subset of allowed characters.
241 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
243 def replace_insane(char):
244 if char == '?' or ord(char) < 32 or ord(char) == 127:
247 return '' if restricted else '\''
249 return '_-' if restricted else ' -'
250 elif char in '\\/|*<>':
252 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
254 if restricted and ord(char) > 127:
258 result = u''.join(map(replace_insane, s))
260 while '__' in result:
261 result = result.replace('__', '_')
262 result = result.strip('_')
263 # Common case of "Foreign band name - English song title"
264 if restricted and result.startswith('-_'):
270 def orderedSet(iterable):
271 """ Remove all duplicates from the input iterable """
279 def _htmlentity_transform(entity):
280 """Transforms an HTML entity to a character."""
281 # Known non-numeric HTML entity
282 if entity in compat_html_entities.name2codepoint:
283 return compat_chr(compat_html_entities.name2codepoint[entity])
285 mobj = re.match(r'#(x?[0-9]+)', entity)
287 numstr = mobj.group(1)
288 if numstr.startswith(u'x'):
290 numstr = u'0%s' % numstr
293 return compat_chr(int(numstr, base))
295 # Unknown entity in name, return its literal representation
296 return (u'&%s;' % entity)
302 assert type(s) == compat_str
305 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
308 def encodeFilename(s, for_subprocess=False):
310 @param s The name of the file
313 assert type(s) == compat_str
315 # Python 3 has a Unicode API
316 if sys.version_info >= (3, 0):
319 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
320 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
321 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
322 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
323 if not for_subprocess:
326 # For subprocess calls, encode with locale encoding
327 # Refer to http://stackoverflow.com/a/9951851/35070
328 encoding = preferredencoding()
330 encoding = sys.getfilesystemencoding()
333 return s.encode(encoding, 'ignore')
def encodeArgument(s):
    """Encode *s* via encodeFilename() with its subprocess flag set.

    Values that are not compat_str are still accepted for legacy callers;
    they are decoded as ASCII before being handed on.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings.
        # TODO: after fixing all post processors, fail hard here instead:
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
345 def decodeOption(optval):
348 if isinstance(optval, bytes):
349 optval = optval.decode(preferredencoding())
351 assert isinstance(optval, compat_str)
354 def formatSeconds(secs):
356 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
358 return '%d:%02d' % (secs // 60, secs % 60)
363 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
364 if sys.version_info < (3, 2):
367 class HTTPSConnectionV3(httplib.HTTPSConnection):
368 def __init__(self, *args, **kwargs):
369 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
372 sock = socket.create_connection((self.host, self.port), self.timeout)
373 if getattr(self, '_tunnel_host', False):
377 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
379 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
381 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
382 def https_open(self, req):
383 return self.do_open(HTTPSConnectionV3, req)
384 return HTTPSHandlerV3(**kwargs)
385 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
386 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
387 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
388 if opts_no_check_certificate:
389 context.verify_mode = ssl.CERT_NONE
390 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
392 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
393 context.verify_mode = (ssl.CERT_NONE
394 if opts_no_check_certificate
395 else ssl.CERT_REQUIRED)
396 context.set_default_verify_paths()
398 context.load_default_certs()
399 except AttributeError:
401 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
403 class ExtractorError(Exception):
404 """Error during info extraction."""
405 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
406 """ tb, if given, is the original traceback (so that it can be printed out).
407 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
410 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
412 if video_id is not None:
413 msg = video_id + ': ' + msg
415 msg += u' (caused by %r)' % cause
417 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
418 super(ExtractorError, self).__init__(msg)
421 self.exc_info = sys.exc_info() # preserve original exception
423 self.video_id = video_id
425 def format_traceback(self):
426 if self.traceback is None:
428 return u''.join(traceback.format_tb(self.traceback))
431 class RegexNotFoundError(ExtractorError):
432 """Error when a regex didn't match"""
436 class DownloadError(Exception):
437 """Download Error exception.
439 This exception may be thrown by FileDownloader objects if they are not
440 configured to continue on errors. They will contain the appropriate
443 def __init__(self, msg, exc_info=None):
444 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
445 super(DownloadError, self).__init__(msg)
446 self.exc_info = exc_info
449 class SameFileError(Exception):
450 """Same File exception.
452 This exception will be thrown by FileDownloader objects if they detect
453 multiple files would have to be downloaded to the same file on disk.
458 class PostProcessingError(Exception):
459 """Post Processing exception.
461 This exception may be raised by PostProcessor's .run() method to
462 indicate an error in the postprocessing task.
464 def __init__(self, msg):
467 class MaxDownloadsReached(Exception):
468 """ --max-downloads limit has been reached. """
472 class UnavailableVideoError(Exception):
473 """Unavailable Format exception.
475 This exception will be thrown when a video is requested
476 in a format that is not available for that video.
481 class ContentTooShortError(Exception):
482 """Content Too Short exception.
484 This exception may be raised by FileDownloader objects when a file they
485 download is too small for what the server announced first, indicating
486 the connection was probably interrupted.
492 def __init__(self, downloaded, expected):
493 self.downloaded = downloaded
494 self.expected = expected
496 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
497 """Handler for HTTP requests and responses.
499 This class, when installed with an OpenerDirector, automatically adds
500 the standard headers to every HTTP request and handles gzipped and
501 deflated responses from web servers. If compression is to be avoided in
502 a particular request, the original request in the program code only has
503 to include the HTTP header "Youtubedl-No-Compression", which will be
504 removed before making the real request.
506 Part of this code was copied from:
508 http://techknack.net/python-urllib2-handlers/
510 Andrew Rowls, the author of that code, agreed to release it to the
517 return zlib.decompress(data, -zlib.MAX_WBITS)
519 return zlib.decompress(data)
522 def addinfourl_wrapper(stream, headers, url, code):
523 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
524 return compat_urllib_request.addinfourl(stream, headers, url, code)
525 ret = compat_urllib_request.addinfourl(stream, headers, url)
529 def http_request(self, req):
530 for h, v in std_headers.items():
531 if h not in req.headers:
533 if 'Youtubedl-no-compression' in req.headers:
534 if 'Accept-encoding' in req.headers:
535 del req.headers['Accept-encoding']
536 del req.headers['Youtubedl-no-compression']
537 if 'Youtubedl-user-agent' in req.headers:
538 if 'User-agent' in req.headers:
539 del req.headers['User-agent']
540 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
541 del req.headers['Youtubedl-user-agent']
543 if sys.version_info < (2, 7) and '#' in req.get_full_url():
544 # Python 2.6 is brain-dead when it comes to fragments
545 req._Request__original = req._Request__original.partition('#')[0]
546 req._Request__r_type = req._Request__r_type.partition('#')[0]
550 def http_response(self, req, resp):
553 if resp.headers.get('Content-encoding', '') == 'gzip':
554 content = resp.read()
555 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
557 uncompressed = io.BytesIO(gz.read())
558 except IOError as original_ioerror:
# There may be junk at the end of the file
560 # See http://stackoverflow.com/q/4928560/35070 for details
561 for i in range(1, 1024):
563 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
564 uncompressed = io.BytesIO(gz.read())
569 raise original_ioerror
570 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
571 resp.msg = old_resp.msg
573 if resp.headers.get('Content-encoding', '') == 'deflate':
574 gz = io.BytesIO(self.deflate(resp.read()))
575 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
576 resp.msg = old_resp.msg
579 https_request = http_request
580 https_response = http_response
583 def parse_iso8601(date_str, delimiter='T'):
584 """ Return a UNIX timestamp from the given date """
590 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
593 timezone = datetime.timedelta()
595 date_str = date_str[:-len(m.group(0))]
596 if not m.group('sign'):
597 timezone = datetime.timedelta()
599 sign = 1 if m.group('sign') == '+' else -1
600 timezone = datetime.timedelta(
601 hours=sign * int(m.group('hours')),
602 minutes=sign * int(m.group('minutes')))
603 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
604 dt = datetime.datetime.strptime(date_str, date_format) - timezone
605 return calendar.timegm(dt.timetuple())
608 def unified_strdate(date_str):
609 """Return a string with the date in the format YYYYMMDD"""
616 date_str = date_str.replace(',', ' ')
617 # %z (UTC offset) is only supported in python>=3.2
618 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
619 format_expressions = [
624 '%b %dst %Y %I:%M%p',
625 '%b %dnd %Y %I:%M%p',
626 '%b %dth %Y %I:%M%p',
635 '%Y-%m-%d %H:%M:%S.%f',
638 '%Y-%m-%dT%H:%M:%SZ',
639 '%Y-%m-%dT%H:%M:%S.%fZ',
640 '%Y-%m-%dT%H:%M:%S.%f0Z',
642 '%Y-%m-%dT%H:%M:%S.%f',
645 for expression in format_expressions:
647 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
650 if upload_date is None:
651 timetuple = email.utils.parsedate_tz(date_str)
653 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
656 def determine_ext(url, default_ext=u'unknown_video'):
659 guess = url.partition(u'?')[0].rpartition(u'.')[2]
660 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name that belongs to a media file name.

    The extension of *filename* (if any) is replaced by
    ``.<sub_lang>.<sub_format>``.

    The extension is located with os.path.splitext() rather than by
    splitting on the last dot of the whole string, so a dot that occurs in
    a directory component (``some.dir/video``) or a leading dot of the
    base name is never mistaken for the start of the extension.
    """
    base, _ = os.path.splitext(filename)
    return base + u'.' + sub_lang + u'.' + sub_format
668 def date_from_str(date_str):
670 Return a datetime object from a string in the format YYYYMMDD or
671 (now|today)[+-][0-9](day|week|month|year)(s)?"""
672 today = datetime.date.today()
673 if date_str == 'now'or date_str == 'today':
675 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
676 if match is not None:
677 sign = match.group('sign')
678 time = int(match.group('time'))
681 unit = match.group('unit')
690 delta = datetime.timedelta(**{unit: time})
692 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
694 def hyphenate_date(date_str):
696 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
697 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
698 if match is not None:
699 return '-'.join(match.groups())
703 class DateRange(object):
704 """Represents a time interval between two dates"""
705 def __init__(self, start=None, end=None):
706 """start and end must be strings in the format accepted by date"""
707 if start is not None:
708 self.start = date_from_str(start)
710 self.start = datetime.datetime.min.date()
712 self.end = date_from_str(end)
714 self.end = datetime.datetime.max.date()
715 if self.start > self.end:
716 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
719 """Returns a range that only contains the given day"""
721 def __contains__(self, date):
722 """Check if the date is in the range"""
723 if not isinstance(date, datetime.date):
724 date = date_from_str(date)
725 return self.start <= date <= self.end
727 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
731 """ Returns the platform name as a compat_str """
732 res = platform.platform()
733 if isinstance(res, bytes):
734 res = res.decode(preferredencoding())
736 assert isinstance(res, compat_str)
740 def _windows_write_string(s, out):
741 """ Returns True if the string was written using special methods,
742 False if it has yet to be written out."""
743 # Adapted from http://stackoverflow.com/a/3259271/35070
746 import ctypes.wintypes
754 fileno = out.fileno()
755 except AttributeError:
756 # If the output stream doesn't have a fileno, it's virtual
758 if fileno not in WIN_OUTPUT_IDS:
761 GetStdHandle = ctypes.WINFUNCTYPE(
762 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
763 ("GetStdHandle", ctypes.windll.kernel32))
764 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
766 WriteConsoleW = ctypes.WINFUNCTYPE(
767 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
768 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
769 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
770 written = ctypes.wintypes.DWORD(0)
772 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
773 FILE_TYPE_CHAR = 0x0002
774 FILE_TYPE_REMOTE = 0x8000
775 GetConsoleMode = ctypes.WINFUNCTYPE(
776 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
777 ctypes.POINTER(ctypes.wintypes.DWORD))(
778 ("GetConsoleMode", ctypes.windll.kernel32))
779 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
781 def not_a_console(handle):
782 if handle == INVALID_HANDLE_VALUE or handle is None:
784 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
785 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
790 def next_nonbmp_pos(s):
792 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
793 except StopIteration:
797 count = min(next_nonbmp_pos(s), 1024)
800 h, s, count if count else 2, ctypes.byref(written), None)
802 raise OSError('Failed to write string')
803 if not count: # We just wrote a non-BMP character
804 assert written.value == 2
807 assert written.value > 0
808 s = s[written.value:]
812 def write_string(s, out=None, encoding=None):
815 assert type(s) == compat_str
817 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
818 if _windows_write_string(s, out):
821 if ('b' in getattr(out, 'mode', '') or
822 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
823 byt = s.encode(encoding or preferredencoding(), 'ignore')
825 elif hasattr(out, 'buffer'):
826 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
827 byt = s.encode(enc, 'ignore')
828 out.buffer.write(byt)
834 def bytes_to_intlist(bs):
837 if isinstance(bs[0], int): # Python 3
840 return [ord(c) for c in bs]
843 def intlist_to_bytes(xs):
846 return struct_pack('%dB' % len(xs), *xs)
849 # Cross-platform file locking
850 if sys.platform == 'win32':
851 import ctypes.wintypes
854 class OVERLAPPED(ctypes.Structure):
856 ('Internal', ctypes.wintypes.LPVOID),
857 ('InternalHigh', ctypes.wintypes.LPVOID),
858 ('Offset', ctypes.wintypes.DWORD),
859 ('OffsetHigh', ctypes.wintypes.DWORD),
860 ('hEvent', ctypes.wintypes.HANDLE),
863 kernel32 = ctypes.windll.kernel32
864 LockFileEx = kernel32.LockFileEx
865 LockFileEx.argtypes = [
866 ctypes.wintypes.HANDLE, # hFile
867 ctypes.wintypes.DWORD, # dwFlags
868 ctypes.wintypes.DWORD, # dwReserved
869 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
870 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
871 ctypes.POINTER(OVERLAPPED) # Overlapped
873 LockFileEx.restype = ctypes.wintypes.BOOL
874 UnlockFileEx = kernel32.UnlockFileEx
875 UnlockFileEx.argtypes = [
876 ctypes.wintypes.HANDLE, # hFile
877 ctypes.wintypes.DWORD, # dwReserved
878 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
879 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
880 ctypes.POINTER(OVERLAPPED) # Overlapped
882 UnlockFileEx.restype = ctypes.wintypes.BOOL
883 whole_low = 0xffffffff
884 whole_high = 0x7fffffff
886 def _lock_file(f, exclusive):
887 overlapped = OVERLAPPED()
888 overlapped.Offset = 0
889 overlapped.OffsetHigh = 0
890 overlapped.hEvent = 0
891 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
892 handle = msvcrt.get_osfhandle(f.fileno())
893 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
894 whole_low, whole_high, f._lock_file_overlapped_p):
895 raise OSError('Locking file failed: %r' % ctypes.FormatError())
898 assert f._lock_file_overlapped_p
899 handle = msvcrt.get_osfhandle(f.fileno())
900 if not UnlockFileEx(handle, 0,
901 whole_low, whole_high, f._lock_file_overlapped_p):
902 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
907 def _lock_file(f, exclusive):
908 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
911 fcntl.flock(f, fcntl.LOCK_UN)
914 class locked_file(object):
915 def __init__(self, filename, mode, encoding=None):
916 assert mode in ['r', 'a', 'w']
917 self.f = io.open(filename, mode, encoding=encoding)
921 exclusive = self.mode != 'r'
923 _lock_file(self.f, exclusive)
929 def __exit__(self, etype, value, traceback):
938 def write(self, *args):
939 return self.f.write(*args)
941 def read(self, *args):
942 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
950 def shell_quote(args):
952 encoding = get_filesystem_encoding()
954 if isinstance(a, bytes):
955 # We may get a filename encoded with 'encodeFilename'
956 a = a.decode(encoding)
957 quoted_args.append(pipes.quote(a))
958 return u' '.join(quoted_args)
961 def takewhile_inclusive(pred, seq):
962 """ Like itertools.takewhile, but include the latest evaluated element
963 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """Attach *data* to *url* for internal use.

    *data* is serialised with json.dumps(), url-encoded under the key
    ``__youtubedl_smuggle`` and appended to *url* after a ``#``.
    """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
978 def unsmuggle_url(smug_url, default=None):
979 if not '#__youtubedl_smuggle' in smug_url:
980 return smug_url, default
981 url, _, sdata = smug_url.rpartition(u'#')
982 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
983 data = json.loads(jsond)
987 def format_bytes(bytes):
990 if type(bytes) is str:
995 exponent = int(math.log(bytes, 1024.0))
996 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
997 converted = float(bytes) / float(1024 ** exponent)
998 return u'%.2f%s' % (converted, suffix)
1001 def get_term_width():
1002 columns = compat_getenv('COLUMNS', None)
1007 sp = subprocess.Popen(
1009 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1010 out, err = sp.communicate()
1011 return int(out.split()[1])
1017 def month_by_name(name):
1018 """ Return the number of a month by (locale-independently) English name """
1021 u'January', u'February', u'March', u'April', u'May', u'June',
1022 u'July', u'August', u'September', u'October', u'November', u'December']
1024 return ENGLISH_NAMES.index(name) + 1
1029 def fix_xml_ampersands(xml_str):
1030 """Replace all the '&' by '&' in XML"""
1032 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1037 def setproctitle(title):
1038 assert isinstance(title, compat_str)
1040 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1043 title_bytes = title.encode('utf-8')
1044 buf = ctypes.create_string_buffer(len(title_bytes))
1045 buf.value = title_bytes
1047 libc.prctl(15, buf, 0, 0, 0)
1048 except AttributeError:
1049 return # Strange libc, just skip this
1052 def remove_start(s, start):
1053 if s.startswith(start):
1054 return s[len(start):]
1058 def remove_end(s, end):
1060 return s[:-len(end)]
def url_basename(url):
    """Return the last '/'-separated component of the parsed path of *url*.

    Leading and trailing slashes of the path are stripped before splitting.
    """
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
1069 class HEADRequest(compat_urllib_request.Request):
1070 def get_method(self):
1074 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1077 v = getattr(v, get_attr, None)
1080 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Return *v* converted with compat_str, or *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1087 def str_to_int(int_str):
1088 """ A more relaxed version of int_or_none """
1091 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert *v* to a float, scaled by ``invscale / scale``.

    Returns *default* when *v* is None; otherwise ``float(v)`` is multiplied
    by *invscale* and then divided by *scale*.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
1099 def parse_duration(s):
1106 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1109 res = int(m.group('secs'))
1111 res += int(m.group('mins')) * 60
1112 if m.group('hours'):
1113 res += int(m.group('hours')) * 60 * 60
1115 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* in front of the existing extension of *filename*.

    The name is split with os.path.splitext(); the result is
    ``<stem>.<ext><original extension>``.
    """
    stem, suffix = os.path.splitext(filename)
    return u'%s.%s%s' % (stem, ext, suffix)
1124 def check_executable(exe, args=[]):
1125 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1126 args can be a list of arguments for a short output (like -version) """
1128 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1134 def get_exe_version(exe, args=['--version'],
1135 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1136 unrecognized=u'present'):
1137 """ Returns the version of the specified executable,
1138 or False if the executable is not present """
1140 out, err = subprocess.Popen(
1142 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1145 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1146 m = re.search(version_re, firstline)
1153 class PagedList(object):
1155 # This is only useful for tests
1156 return len(self.getslice())
1159 class OnDemandPagedList(PagedList):
1160 def __init__(self, pagefunc, pagesize):
1161 self._pagefunc = pagefunc
1162 self._pagesize = pagesize
1164 def getslice(self, start=0, end=None):
1166 for pagenum in itertools.count(start // self._pagesize):
1167 firstid = pagenum * self._pagesize
1168 nextfirstid = pagenum * self._pagesize + self._pagesize
1169 if start >= nextfirstid:
1172 page_results = list(self._pagefunc(pagenum))
1175 start % self._pagesize
1176 if firstid <= start < nextfirstid
1180 ((end - 1) % self._pagesize) + 1
1181 if (end is not None and firstid <= end <= nextfirstid)
1184 if startv != 0 or endv is not None:
1185 page_results = page_results[startv:endv]
1186 res.extend(page_results)
1188 # A little optimization - if current page is not "full", ie. does
1189 # not contain page_size videos then we can assume that this page
1190 # is the last one - there are no more ids on further pages -
1191 # i.e. no need to query again.
1192 if len(page_results) + startv < self._pagesize:
1195 # If we got the whole page, but the next page is not interesting,
1196 # break out early as well
1197 if end == nextfirstid:
1202 class InAdvancePagedList(PagedList):
1203 def __init__(self, pagefunc, pagecount, pagesize):
1204 self._pagefunc = pagefunc
1205 self._pagecount = pagecount
1206 self._pagesize = pagesize
1208 def getslice(self, start=0, end=None):
1210 start_page = start // self._pagesize
1212 self._pagecount if end is None else (end // self._pagesize + 1))
1213 skip_elems = start - start_page * self._pagesize
1214 only_more = None if end is None else end - start
1215 for pagenum in range(start_page, end_page):
1216 page = list(self._pagefunc(pagenum))
1218 page = page[skip_elems:]
1220 if only_more is not None:
1221 if len(page) < only_more:
1222 only_more -= len(page)
1224 page = page[:only_more]
1231 def uppercase_escape(s):
1232 unicode_escape = codecs.getdecoder('unicode_escape')
1234 r'\\U[0-9a-fA-F]{8}',
1235 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Percent-escape *s* as suggested by RFC 3986.

    On Python 2, unicode input is encoded as UTF-8 first. The characters
    listed in the second argument to quote() are left untouched.
    """
    # The version test comes first so that the Python-2-only name `unicode`
    # is never evaluated on Python 3.
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1246 def escape_url(url):
1247 """Escape URL as suggested by RFC 3986"""
1248 url_parsed = compat_urllib_parse_urlparse(url)
1249 return url_parsed._replace(
1250 path=escape_rfc3986(url_parsed.path),
1251 params=escape_rfc3986(url_parsed.params),
1252 query=escape_rfc3986(url_parsed.query),
1253 fragment=escape_rfc3986(url_parsed.fragment)
1257 struct.pack(u'!I', 0)
1259 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1260 def struct_pack(spec, *args):
1261 if isinstance(spec, compat_str):
1262 spec = spec.encode('ascii')
1263 return struct.pack(spec, *args)
1265 def struct_unpack(spec, *args):
1266 if isinstance(spec, compat_str):
1267 spec = spec.encode('ascii')
1268 return struct.unpack(spec, *args)
1270 struct_pack = struct.pack
1271 struct_unpack = struct.unpack
1274 def read_batch_urls(batch_fd):
1276 if not isinstance(url, compat_str):
1277 url = url.decode('utf-8', 'replace')
1278 BOM_UTF8 = u'\xef\xbb\xbf'
1279 if url.startswith(BOM_UTF8):
1280 url = url[len(BOM_UTF8):]
1282 if url.startswith(('#', ';', ']')):
1286 with contextlib.closing(batch_fd) as fd:
1287 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """Url-encode the given arguments and return the result as ASCII bytes.

    All arguments are forwarded unchanged to compat_urllib_parse.urlencode().
    """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1295 etree_iter = xml.etree.ElementTree.Element.iter
1296 except AttributeError: # Python <=2.6
1297 etree_iter = lambda n: n.findall('.//*')
1301 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1302 def doctype(self, name, pubid, system):
1303 pass # Ignore doctypes
1305 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1306 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1307 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1308 # Fix up XML parser in Python 2.x
1309 if sys.version_info < (3, 0):
1310 for n in etree_iter(tree):
1311 if n.text is not None:
1312 if not isinstance(n.text, compat_str):
1313 n.text = n.text.decode('utf-8')
1326 def parse_age_limit(s):
1329 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1330 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1333 def strip_jsonp(code):
1335 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1338 def js_to_json(code):
1341 if v in ('true', 'false', 'null'):
1343 if v.startswith('"'):
1345 if v.startswith("'"):
1347 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1354 res = re.sub(r'''(?x)
1355 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1356 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1357 [a-zA-Z_][a-zA-Z_0-9]*
1359 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1363 def qualities(quality_ids):
1364 """ Get a numeric quality value out of a list of possible values """
1367 return quality_ids.index(qid)
1373 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1376 def limit_length(s, length):
1377 """ Add ellipses to overly long strings """
1382 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split the dotted version string *v* into a list of ints.

    Raises ValueError if any dot-separated component is not an integer.
    """
    return list(map(int, v.split('.')))
1390 def is_outdated_version(version, limit, assume_new=True):
1392 return not assume_new
1394 return version_tuple(version) < version_tuple(limit)
1396 return not assume_new