2 # -*- coding: utf-8 -*-
28 import xml.etree.ElementTree
40 compat_urllib_parse_urlparse,
41 compat_urllib_request,
46 # This is not clearly defined otherwise
47 compiled_regex_type = type(re.compile(''))
50 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
51 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
52 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
53 'Accept-Encoding': 'gzip, deflate',
54 'Accept-Language': 'en-us,en;q=0.5',
57 def preferredencoding():
58 """Get preferred encoding.
60 Returns the best encoding scheme for the system, based on
61 locale.getpreferredencoding() and some further tweaks.
64 pref = locale.getpreferredencoding()
72 def write_json_file(obj, fn):
73 """ Encode obj as JSON and write it to fn, atomically """
77 'prefix': os.path.basename(fn) + '.',
78 'dir': os.path.dirname(fn),
82 # In Python 2.x, json.dump expects a bytestream.
83 # In Python 3.x, it writes to a character stream
84 if sys.version_info < (3, 0):
92 tf = tempfile.NamedTemporaryFile(**args)
97 os.rename(tf.name, fn)
106 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Locate the first element matching xpath[@key=val] below node."""
    # Only plain attribute names and simple values are supported here,
    # so reject anything the naive string interpolation could not handle.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    query = u"%s[@%s='%s']" % (xpath, key, val)
    return node.find(query)
114 def find_xpath_attr(node, xpath, key, val):
115 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
116 # .//node does not match if a node is a direct child of . !
117 if isinstance(xpath, unicode):
118 xpath = xpath.encode('ascii')
120 for f in node.findall(xpath):
121 if f.attrib.get(key) == val:
125 # On python2.6 the xml.etree.ElementTree.Element methods don't support
126 # the namespace parameter
127 def xpath_with_ns(path, ns_map):
128 components = [c.split(':') for c in path.split('/')]
132 replaced.append(c[0])
135 replaced.append('{%s}%s' % (ns_map[ns], tag))
136 return '/'.join(replaced)
139 def xpath_text(node, xpath, name=None, fatal=False):
140 if sys.version_info < (2, 7): # Crazy 2.6
141 xpath = xpath.encode('ascii')
146 name = xpath if name is None else name
147 raise ExtractorError('Could not find XML element %s' % name)
153 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
154 class BaseHTMLParser(compat_html_parser.HTMLParser):
156 compat_html_parser.HTMLParser.__init__(self)
159 def loads(self, html):
164 class AttrParser(BaseHTMLParser):
165 """Modified HTMLParser that isolates a tag with the specified attribute"""
166 def __init__(self, attribute, value):
167 self.attribute = attribute
172 self.watch_startpos = False
174 BaseHTMLParser.__init__(self)
176 def error(self, message):
177 if self.error_count > 10 or self.started:
178 raise compat_html_parser.HTMLParseError(message, self.getpos())
179 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
180 self.error_count += 1
183 def handle_starttag(self, tag, attrs):
186 self.find_startpos(None)
187 if self.attribute in attrs and attrs[self.attribute] == self.value:
190 self.watch_startpos = True
192 if not tag in self.depth: self.depth[tag] = 0
195 def handle_endtag(self, tag):
197 if tag in self.depth: self.depth[tag] -= 1
198 if self.depth[self.result[0]] == 0:
200 self.result.append(self.getpos())
202 def find_startpos(self, x):
203 """Needed to put the start position of the result (self.result[1])
204 after the opening tag with the requested id"""
205 if self.watch_startpos:
206 self.watch_startpos = False
207 self.result.append(self.getpos())
208 handle_entityref = handle_charref = handle_data = handle_comment = \
209 handle_decl = handle_pi = unknown_decl = find_startpos
211 def get_result(self):
212 if self.result is None:
214 if len(self.result) != 3:
216 lines = self.html.split('\n')
217 lines = lines[self.result[1][0]-1:self.result[2][0]]
218 lines[0] = lines[0][self.result[1][1]:]
220 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
221 lines[-1] = lines[-1][:self.result[2][1]]
222 return '\n'.join(lines).strip()
223 # Hack for https://github.com/rg3/youtube-dl/issues/662
224 if sys.version_info < (2, 7, 3):
225 AttrParser.parse_endtag = (lambda self, i:
226 i + len("</scr'+'ipt>")
227 if self.rawdata[i:].startswith("</scr'+'ipt>")
228 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Find the tag carrying the given id attribute in html and return its content.

    NOTE: the parameter name shadows the builtin id(); it is kept unchanged
    for interface compatibility with existing callers.
    """
    return get_element_by_attribute("id", id, html)
234 def get_element_by_attribute(attribute, value, html):
235 """Return the content of the tag with the specified attribute in the passed HTML document"""
236 parser = AttrParser(attribute, value)
239 except compat_html_parser.HTMLParseError:
241 return parser.get_result()
243 class MetaParser(BaseHTMLParser):
245 Modified HTMLParser that isolates a meta tag with the specified name
248 def __init__(self, name):
249 BaseHTMLParser.__init__(self)
254 def handle_starttag(self, tag, attrs):
258 if attrs.get('name') == self.name:
259 self.result = attrs.get('content')
261 def get_result(self):
264 def get_meta_content(name, html):
266 Return the content attribute from the meta tag with the given name attribute.
268 parser = MetaParser(name)
271 except compat_html_parser.HTMLParseError:
273 return parser.get_result()
276 def clean_html(html):
277 """Clean an HTML snippet into a readable string"""
279 html = html.replace('\n', ' ')
280 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
281 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
283 html = re.sub('<.*?>', '', html)
284 # Replace html entities
285 html = unescapeHTML(html)
289 def sanitize_open(filename, open_mode):
290 """Try to open the given filename, and slightly tweak it if this fails.
292 Attempts to open the given filename. If this fails, it tries to change
293 the filename slightly, step by step, until it's either able to open it
294 or it fails and raises a final exception, like the standard open()
297 It returns the tuple (stream, definitive_file_name).
301 if sys.platform == 'win32':
303 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
304 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
305 stream = open(encodeFilename(filename), open_mode)
306 return (stream, filename)
307 except (IOError, OSError) as err:
308 if err.errno in (errno.EACCES,):
311 # In case of error, try to remove win32 forbidden chars
312 alt_filename = os.path.join(
313 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
314 for path_part in os.path.split(filename)
316 if alt_filename == filename:
319 # An exception here should be caught in the caller
320 stream = open(encodeFilename(filename), open_mode)
321 return (stream, alt_filename)
324 def timeconvert(timestr):
325 """Convert RFC 2822 defined time string into system timestamp"""
327 timetuple = email.utils.parsedate_tz(timestr)
328 if timetuple is not None:
329 timestamp = email.utils.mktime_tz(timetuple)
332 def sanitize_filename(s, restricted=False, is_id=False):
333 """Sanitizes a string so it could be used as part of a filename.
334 If restricted is set, use a stricter subset of allowed characters.
335 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
337 def replace_insane(char):
338 if char == '?' or ord(char) < 32 or ord(char) == 127:
341 return '' if restricted else '\''
343 return '_-' if restricted else ' -'
344 elif char in '\\/|*<>':
346 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
348 if restricted and ord(char) > 127:
352 result = u''.join(map(replace_insane, s))
354 while '__' in result:
355 result = result.replace('__', '_')
356 result = result.strip('_')
357 # Common case of "Foreign band name - English song title"
358 if restricted and result.startswith('-_'):
364 def orderedSet(iterable):
365 """ Remove all duplicates from the input iterable """
373 def _htmlentity_transform(entity):
374 """Transforms an HTML entity to a character."""
375 # Known non-numeric HTML entity
376 if entity in compat_html_entities.name2codepoint:
377 return compat_chr(compat_html_entities.name2codepoint[entity])
379 mobj = re.match(r'#(x?[0-9]+)', entity)
381 numstr = mobj.group(1)
382 if numstr.startswith(u'x'):
384 numstr = u'0%s' % numstr
387 return compat_chr(int(numstr, base))
389 # Unknown entity in name, return its literal representation
390 return (u'&%s;' % entity)
396 assert type(s) == compat_str
399 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
402 def encodeFilename(s, for_subprocess=False):
404 @param s The name of the file
407 assert type(s) == compat_str
409 # Python 3 has a Unicode API
410 if sys.version_info >= (3, 0):
413 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
414 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
415 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
416 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
417 if not for_subprocess:
420 # For subprocess calls, encode with locale encoding
421 # Refer to http://stackoverflow.com/a/9951851/35070
422 encoding = preferredencoding()
424 encoding = sys.getfilesystemencoding()
427 return s.encode(encoding, 'ignore')
430 def encodeArgument(s):
431 if not isinstance(s, compat_str):
432 # Legacy code that uses byte strings
433 # Uncomment the following line after fixing all post processors
434 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
435 s = s.decode('ascii')
436 return encodeFilename(s, True)
439 def decodeOption(optval):
442 if isinstance(optval, bytes):
443 optval = optval.decode(preferredencoding())
445 assert isinstance(optval, compat_str)
448 def formatSeconds(secs):
450 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
452 return '%d:%02d' % (secs // 60, secs % 60)
457 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
458 if sys.version_info < (3, 2):
461 class HTTPSConnectionV3(httplib.HTTPSConnection):
462 def __init__(self, *args, **kwargs):
463 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
466 sock = socket.create_connection((self.host, self.port), self.timeout)
467 if getattr(self, '_tunnel_host', False):
471 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
473 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
475 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
476 def https_open(self, req):
477 return self.do_open(HTTPSConnectionV3, req)
478 return HTTPSHandlerV3(**kwargs)
479 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
480 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
481 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
482 if opts_no_check_certificate:
483 context.verify_mode = ssl.CERT_NONE
484 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
486 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
487 context.verify_mode = (ssl.CERT_NONE
488 if opts_no_check_certificate
489 else ssl.CERT_REQUIRED)
490 context.set_default_verify_paths()
492 context.load_default_certs()
493 except AttributeError:
495 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
497 class ExtractorError(Exception):
498 """Error during info extraction."""
499 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
500 """ tb, if given, is the original traceback (so that it can be printed out).
501 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
504 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
506 if video_id is not None:
507 msg = video_id + ': ' + msg
509 msg += u' (caused by %r)' % cause
511 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
512 super(ExtractorError, self).__init__(msg)
515 self.exc_info = sys.exc_info() # preserve original exception
517 self.video_id = video_id
519 def format_traceback(self):
520 if self.traceback is None:
522 return u''.join(traceback.format_tb(self.traceback))
525 class RegexNotFoundError(ExtractorError):
526 """Error when a regex didn't match"""
530 class DownloadError(Exception):
531 """Download Error exception.
533 This exception may be thrown by FileDownloader objects if they are not
534 configured to continue on errors. They will contain the appropriate
537 def __init__(self, msg, exc_info=None):
538 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
539 super(DownloadError, self).__init__(msg)
540 self.exc_info = exc_info
543 class SameFileError(Exception):
544 """Same File exception.
546 This exception will be thrown by FileDownloader objects if they detect
547 multiple files would have to be downloaded to the same file on disk.
552 class PostProcessingError(Exception):
553 """Post Processing exception.
555 This exception may be raised by PostProcessor's .run() method to
556 indicate an error in the postprocessing task.
558 def __init__(self, msg):
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
586 def __init__(self, downloaded, expected):
587 self.downloaded = downloaded
588 self.expected = expected
590 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
591 """Handler for HTTP requests and responses.
593 This class, when installed with an OpenerDirector, automatically adds
594 the standard headers to every HTTP request and handles gzipped and
595 deflated responses from web servers. If compression is to be avoided in
596 a particular request, the original request in the program code only has
597 to include the HTTP header "Youtubedl-No-Compression", which will be
598 removed before making the real request.
600 Part of this code was copied from:
602 http://techknack.net/python-urllib2-handlers/
604 Andrew Rowls, the author of that code, agreed to release it to the
611 return zlib.decompress(data, -zlib.MAX_WBITS)
613 return zlib.decompress(data)
616 def addinfourl_wrapper(stream, headers, url, code):
617 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
618 return compat_urllib_request.addinfourl(stream, headers, url, code)
619 ret = compat_urllib_request.addinfourl(stream, headers, url)
623 def http_request(self, req):
624 for h, v in std_headers.items():
625 if h not in req.headers:
627 if 'Youtubedl-no-compression' in req.headers:
628 if 'Accept-encoding' in req.headers:
629 del req.headers['Accept-encoding']
630 del req.headers['Youtubedl-no-compression']
631 if 'Youtubedl-user-agent' in req.headers:
632 if 'User-agent' in req.headers:
633 del req.headers['User-agent']
634 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
635 del req.headers['Youtubedl-user-agent']
637 if sys.version_info < (2, 7) and '#' in req.get_full_url():
638 # Python 2.6 is brain-dead when it comes to fragments
639 req._Request__original = req._Request__original.partition('#')[0]
640 req._Request__r_type = req._Request__r_type.partition('#')[0]
644 def http_response(self, req, resp):
647 if resp.headers.get('Content-encoding', '') == 'gzip':
648 content = resp.read()
649 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
651 uncompressed = io.BytesIO(gz.read())
652 except IOError as original_ioerror:
653 # There may be junk add the end of the file
654 # See http://stackoverflow.com/q/4928560/35070 for details
655 for i in range(1, 1024):
657 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
658 uncompressed = io.BytesIO(gz.read())
663 raise original_ioerror
664 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
665 resp.msg = old_resp.msg
667 if resp.headers.get('Content-encoding', '') == 'deflate':
668 gz = io.BytesIO(self.deflate(resp.read()))
669 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
670 resp.msg = old_resp.msg
673 https_request = http_request
674 https_response = http_response
677 def parse_iso8601(date_str, delimiter='T'):
678 """ Return a UNIX timestamp from the given date """
684 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
687 timezone = datetime.timedelta()
689 date_str = date_str[:-len(m.group(0))]
690 if not m.group('sign'):
691 timezone = datetime.timedelta()
693 sign = 1 if m.group('sign') == '+' else -1
694 timezone = datetime.timedelta(
695 hours=sign * int(m.group('hours')),
696 minutes=sign * int(m.group('minutes')))
697 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
698 dt = datetime.datetime.strptime(date_str, date_format) - timezone
699 return calendar.timegm(dt.timetuple())
702 def unified_strdate(date_str):
703 """Return a string with the date in the format YYYYMMDD"""
710 date_str = date_str.replace(',', ' ')
711 # %z (UTC offset) is only supported in python>=3.2
712 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
713 format_expressions = [
718 '%b %dst %Y %I:%M%p',
719 '%b %dnd %Y %I:%M%p',
720 '%b %dth %Y %I:%M%p',
729 '%Y-%m-%d %H:%M:%S.%f',
732 '%Y-%m-%dT%H:%M:%SZ',
733 '%Y-%m-%dT%H:%M:%S.%fZ',
734 '%Y-%m-%dT%H:%M:%S.%f0Z',
736 '%Y-%m-%dT%H:%M:%S.%f',
739 for expression in format_expressions:
741 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
744 if upload_date is None:
745 timetuple = email.utils.parsedate_tz(date_str)
747 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
750 def determine_ext(url, default_ext=u'unknown_video'):
753 guess = url.partition(u'?')[0].rpartition(u'.')[2]
754 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name of the form <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
762 def date_from_str(date_str):
764 Return a datetime object from a string in the format YYYYMMDD or
765 (now|today)[+-][0-9](day|week|month|year)(s)?"""
766 today = datetime.date.today()
767 if date_str == 'now'or date_str == 'today':
769 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
770 if match is not None:
771 sign = match.group('sign')
772 time = int(match.group('time'))
775 unit = match.group('unit')
784 delta = datetime.timedelta(**{unit: time})
786 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
788 def hyphenate_date(date_str):
790 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
791 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
792 if match is not None:
793 return '-'.join(match.groups())
797 class DateRange(object):
798 """Represents a time interval between two dates"""
799 def __init__(self, start=None, end=None):
800 """start and end must be strings in the format accepted by date"""
801 if start is not None:
802 self.start = date_from_str(start)
804 self.start = datetime.datetime.min.date()
806 self.end = date_from_str(end)
808 self.end = datetime.datetime.max.date()
809 if self.start > self.end:
810 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
813 """Returns a range that only contains the given day"""
815 def __contains__(self, date):
816 """Check if the date is in the range"""
817 if not isinstance(date, datetime.date):
818 date = date_from_str(date)
819 return self.start <= date <= self.end
821 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
825 """ Returns the platform name as a compat_str """
826 res = platform.platform()
827 if isinstance(res, bytes):
828 res = res.decode(preferredencoding())
830 assert isinstance(res, compat_str)
834 def _windows_write_string(s, out):
835 """ Returns True if the string was written using special methods,
836 False if it has yet to be written out."""
837 # Adapted from http://stackoverflow.com/a/3259271/35070
840 import ctypes.wintypes
848 fileno = out.fileno()
849 except AttributeError:
850 # If the output stream doesn't have a fileno, it's virtual
852 if fileno not in WIN_OUTPUT_IDS:
855 GetStdHandle = ctypes.WINFUNCTYPE(
856 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
857 ("GetStdHandle", ctypes.windll.kernel32))
858 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
860 WriteConsoleW = ctypes.WINFUNCTYPE(
861 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
862 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
863 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
864 written = ctypes.wintypes.DWORD(0)
866 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
867 FILE_TYPE_CHAR = 0x0002
868 FILE_TYPE_REMOTE = 0x8000
869 GetConsoleMode = ctypes.WINFUNCTYPE(
870 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
871 ctypes.POINTER(ctypes.wintypes.DWORD))(
872 ("GetConsoleMode", ctypes.windll.kernel32))
873 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
875 def not_a_console(handle):
876 if handle == INVALID_HANDLE_VALUE or handle is None:
878 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
879 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
884 def next_nonbmp_pos(s):
886 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
887 except StopIteration:
891 count = min(next_nonbmp_pos(s), 1024)
894 h, s, count if count else 2, ctypes.byref(written), None)
896 raise OSError('Failed to write string')
897 if not count: # We just wrote a non-BMP character
898 assert written.value == 2
901 assert written.value > 0
902 s = s[written.value:]
906 def write_string(s, out=None, encoding=None):
909 assert type(s) == compat_str
911 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
912 if _windows_write_string(s, out):
915 if ('b' in getattr(out, 'mode', '') or
916 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
917 byt = s.encode(encoding or preferredencoding(), 'ignore')
919 elif hasattr(out, 'buffer'):
920 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
921 byt = s.encode(enc, 'ignore')
922 out.buffer.write(byt)
928 def bytes_to_intlist(bs):
931 if isinstance(bs[0], int): # Python 3
934 return [ord(c) for c in bs]
937 def intlist_to_bytes(xs):
940 if isinstance(chr(0), bytes): # Python 2
941 return ''.join([chr(x) for x in xs])
946 # Cross-platform file locking
947 if sys.platform == 'win32':
948 import ctypes.wintypes
951 class OVERLAPPED(ctypes.Structure):
953 ('Internal', ctypes.wintypes.LPVOID),
954 ('InternalHigh', ctypes.wintypes.LPVOID),
955 ('Offset', ctypes.wintypes.DWORD),
956 ('OffsetHigh', ctypes.wintypes.DWORD),
957 ('hEvent', ctypes.wintypes.HANDLE),
960 kernel32 = ctypes.windll.kernel32
961 LockFileEx = kernel32.LockFileEx
962 LockFileEx.argtypes = [
963 ctypes.wintypes.HANDLE, # hFile
964 ctypes.wintypes.DWORD, # dwFlags
965 ctypes.wintypes.DWORD, # dwReserved
966 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
967 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
968 ctypes.POINTER(OVERLAPPED) # Overlapped
970 LockFileEx.restype = ctypes.wintypes.BOOL
971 UnlockFileEx = kernel32.UnlockFileEx
972 UnlockFileEx.argtypes = [
973 ctypes.wintypes.HANDLE, # hFile
974 ctypes.wintypes.DWORD, # dwReserved
975 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
976 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
977 ctypes.POINTER(OVERLAPPED) # Overlapped
979 UnlockFileEx.restype = ctypes.wintypes.BOOL
980 whole_low = 0xffffffff
981 whole_high = 0x7fffffff
983 def _lock_file(f, exclusive):
984 overlapped = OVERLAPPED()
985 overlapped.Offset = 0
986 overlapped.OffsetHigh = 0
987 overlapped.hEvent = 0
988 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
989 handle = msvcrt.get_osfhandle(f.fileno())
990 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
991 whole_low, whole_high, f._lock_file_overlapped_p):
992 raise OSError('Locking file failed: %r' % ctypes.FormatError())
995 assert f._lock_file_overlapped_p
996 handle = msvcrt.get_osfhandle(f.fileno())
997 if not UnlockFileEx(handle, 0,
998 whole_low, whole_high, f._lock_file_overlapped_p):
999 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
def _lock_file(f, exclusive):
    """Acquire an advisory flock() on f: exclusive when requested, shared otherwise."""
    if exclusive:
        fcntl.flock(f, fcntl.LOCK_EX)
    else:
        fcntl.flock(f, fcntl.LOCK_SH)
def _unlock_file(f):
    """Release the advisory flock() held on f."""
    fcntl.flock(f, fcntl.LOCK_UN)
1011 class locked_file(object):
1012 def __init__(self, filename, mode, encoding=None):
1013 assert mode in ['r', 'a', 'w']
1014 self.f = io.open(filename, mode, encoding=encoding)
1017 def __enter__(self):
1018 exclusive = self.mode != 'r'
1020 _lock_file(self.f, exclusive)
1026 def __exit__(self, etype, value, traceback):
1028 _unlock_file(self.f)
1035 def write(self, *args):
1036 return self.f.write(*args)
1038 def read(self, *args):
1039 return self.f.read(*args)
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to utf-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1047 def shell_quote(args):
1049 encoding = get_filesystem_encoding()
1051 if isinstance(a, bytes):
1052 # We may get a filename encoded with 'encodeFilename'
1053 a = a.decode(encoding)
1054 quoted_args.append(pipes.quote(a))
1055 return u' '.join(quoted_args)
1058 def takewhile_inclusive(pred, seq):
1059 """ Like itertools.takewhile, but include the latest evaluated element
1060 (the first element so that Not pred(e)) """
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload is JSON-encoded, then URL-encoded under a reserved key,
    # and appended as the fragment so ordinary URL handling ignores it.
    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'#'.join([url, smuggled])
1075 def unsmuggle_url(smug_url, default=None):
1076 if not '#__youtubedl_smuggle' in smug_url:
1077 return smug_url, default
1078 url, _, sdata = smug_url.rpartition(u'#')
1079 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1080 data = json.loads(jsond)
1084 def format_bytes(bytes):
1087 if type(bytes) is str:
1088 bytes = float(bytes)
1092 exponent = int(math.log(bytes, 1024.0))
1093 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1094 converted = float(bytes) / float(1024 ** exponent)
1095 return u'%.2f%s' % (converted, suffix)
1098 def get_term_width():
1099 columns = compat_getenv('COLUMNS', None)
1104 sp = subprocess.Popen(
1106 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1107 out, err = sp.communicate()
1108 return int(out.split()[1])
1114 def month_by_name(name):
1115 """ Return the number of a month by (locale-independently) English name """
1118 u'January', u'February', u'March', u'April', u'May', u'June',
1119 u'July', u'August', u'September', u'October', u'November', u'December']
1121 return ENGLISH_NAMES.index(name) + 1
1126 def fix_xml_ampersands(xml_str):
1127 """Replace all the '&' by '&' in XML"""
1129 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1134 def setproctitle(title):
1135 assert isinstance(title, compat_str)
1137 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1140 title_bytes = title.encode('utf-8')
1141 buf = ctypes.create_string_buffer(len(title_bytes))
1142 buf.value = title_bytes
1144 libc.prctl(15, buf, 0, 0, 0)
1145 except AttributeError:
1146 return # Strange libc, just skip this
1149 def remove_start(s, start):
1150 if s.startswith(start):
1151 return s[len(start):]
1155 def remove_end(s, end):
1157 return s[:-len(end)]
def url_basename(url):
    """Return the last path segment of url (query string and fragment ignored)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip(u'/').split(u'/')
    return segments[-1]
1166 class HEADRequest(compat_urllib_request.Request):
1167 def get_method(self):
1171 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1174 v = getattr(v, get_attr, None)
1177 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Stringify v via compat_str, or return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1184 def str_to_int(int_str):
1185 """ A more relaxed version of int_or_none """
1188 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale, or return default for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1196 def parse_duration(s):
1203 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1206 res = int(m.group('secs'))
1208 res += int(m.group('mins')) * 60
1209 if m.group('hours'):
1210 res += int(m.group('hours')) * 60 * 60
1212 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert ext before the file's real extension: a.mp4 -> a.<ext>.mp4."""
    base, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (base, ext, real_ext)
1221 def check_executable(exe, args=[]):
1222 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1223 args can be a list of arguments for a short output (like -version) """
1225 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1231 def get_exe_version(exe, args=['--version'],
1232 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1233 unrecognized=u'present'):
1234 """ Returns the version of the specified executable,
1235 or False if the executable is not present """
1237 out, err = subprocess.Popen(
1239 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1242 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1243 m = re.search(version_re, firstline)
1250 class PagedList(object):
1252 # This is only useful for tests
1253 return len(self.getslice())
1256 class OnDemandPagedList(PagedList):
1257 def __init__(self, pagefunc, pagesize):
1258 self._pagefunc = pagefunc
1259 self._pagesize = pagesize
1261 def getslice(self, start=0, end=None):
1263 for pagenum in itertools.count(start // self._pagesize):
1264 firstid = pagenum * self._pagesize
1265 nextfirstid = pagenum * self._pagesize + self._pagesize
1266 if start >= nextfirstid:
1269 page_results = list(self._pagefunc(pagenum))
1272 start % self._pagesize
1273 if firstid <= start < nextfirstid
1277 ((end - 1) % self._pagesize) + 1
1278 if (end is not None and firstid <= end <= nextfirstid)
1281 if startv != 0 or endv is not None:
1282 page_results = page_results[startv:endv]
1283 res.extend(page_results)
1285 # A little optimization - if current page is not "full", ie. does
1286 # not contain page_size videos then we can assume that this page
1287 # is the last one - there are no more ids on further pages -
1288 # i.e. no need to query again.
1289 if len(page_results) + startv < self._pagesize:
1292 # If we got the whole page, but the next page is not interesting,
1293 # break out early as well
1294 if end == nextfirstid:
1299 class InAdvancePagedList(PagedList):
1300 def __init__(self, pagefunc, pagecount, pagesize):
1301 self._pagefunc = pagefunc
1302 self._pagecount = pagecount
1303 self._pagesize = pagesize
1305 def getslice(self, start=0, end=None):
1307 start_page = start // self._pagesize
1309 self._pagecount if end is None else (end // self._pagesize + 1))
1310 skip_elems = start - start_page * self._pagesize
1311 only_more = None if end is None else end - start
1312 for pagenum in range(start_page, end_page):
1313 page = list(self._pagefunc(pagenum))
1315 page = page[skip_elems:]
1317 if only_more is not None:
1318 if len(page) < only_more:
1319 only_more -= len(page)
1321 page = page[:only_more]
1328 def uppercase_escape(s):
1329 unicode_escape = codecs.getdecoder('unicode_escape')
1331 r'\\U[0-9a-fA-F]{8}',
1332 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a byte string, so encode unicode input first.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1343 def escape_url(url):
1344 """Escape URL as suggested by RFC 3986"""
1345 url_parsed = compat_urllib_parse_urlparse(url)
1346 return url_parsed._replace(
1347 path=escape_rfc3986(url_parsed.path),
1348 params=escape_rfc3986(url_parsed.params),
1349 query=escape_rfc3986(url_parsed.query),
1350 fragment=escape_rfc3986(url_parsed.fragment)
1354 struct.pack(u'!I', 0)
1356 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1357 def struct_pack(spec, *args):
1358 if isinstance(spec, compat_str):
1359 spec = spec.encode('ascii')
1360 return struct.pack(spec, *args)
1362 def struct_unpack(spec, *args):
1363 if isinstance(spec, compat_str):
1364 spec = spec.encode('ascii')
1365 return struct.unpack(spec, *args)
1367 struct_pack = struct.pack
1368 struct_unpack = struct.unpack
1371 def read_batch_urls(batch_fd):
1373 if not isinstance(url, compat_str):
1374 url = url.decode('utf-8', 'replace')
1375 BOM_UTF8 = u'\xef\xbb\xbf'
1376 if url.startswith(BOM_UTF8):
1377 url = url[len(BOM_UTF8):]
1379 if url.startswith(('#', ';', ']')):
1383 with contextlib.closing(batch_fd) as fd:
1384 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1392 etree_iter = xml.etree.ElementTree.Element.iter
1393 except AttributeError: # Python <=2.6
1394 etree_iter = lambda n: n.findall('.//*')
1398 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1399 def doctype(self, name, pubid, system):
1400 pass # Ignore doctypes
1402 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1403 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1404 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1405 # Fix up XML parser in Python 2.x
1406 if sys.version_info < (3, 0):
1407 for n in etree_iter(tree):
1408 if n.text is not None:
1409 if not isinstance(n.text, compat_str):
1410 n.text = n.text.decode('utf-8')
1423 def parse_age_limit(s):
1426 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1427 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (identifier + parentheses + optional ';')
    from code and return the inner payload.

    Input that does not look like a JSONP call simply does not match the
    pattern and is returned unchanged.
    """
    # The original pattern ended in '\s*?\s*$': a lazy '\s*?' immediately
    # followed by the same greedy '\s*' is redundant, so it is collapsed
    # into a single '\s*$' with identical matching behavior.
    return re.sub(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*$',
        r'\1', code)
1434 def js_to_json(code):
1437 if v in ('true', 'false', 'null'):
1439 if v.startswith('"'):
1441 if v.startswith("'"):
1443 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1450 res = re.sub(r'''(?x)
1451 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1452 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1453 [a-zA-Z_][a-zA-Z_0-9]*
1455 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1459 def qualities(quality_ids):
1460 """ Get a numeric quality value out of a list of possible values """
1463 return quality_ids.index(qid)
1469 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1472 def limit_length(s, length):
1473 """ Add ellipses to overly long strings """
1478 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into its integer components.

    NOTE: despite the name this returns a list, not a tuple; the return
    type is kept as-is for compatibility with existing comparisons.
    """
    return list(map(int, v.split('.')))
1486 def is_outdated_version(version, limit, assume_new=True):
1488 return not assume_new
1490 return version_tuple(version) < version_tuple(limit)
1492 return not assume_new