2 # -*- coding: utf-8 -*-
4 from __future__ import unicode_literals
30 import xml.etree.ElementTree
42 compat_urllib_parse_urlparse,
43 compat_urllib_request,
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
59 def preferredencoding():
60 """Get preferred encoding.
62 Returns the best encoding scheme for the system, based on
63 locale.getpreferredencoding() and some further tweaks.
66 pref = locale.getpreferredencoding()
74 def write_json_file(obj, fn):
75 """ Encode obj as JSON and write it to fn, atomically """
79 'prefix': os.path.basename(fn) + '.',
80 'dir': os.path.dirname(fn),
84 # In Python 2.x, json.dump expects a bytestream.
85 # In Python 3.x, it writes to a character stream
86 if sys.version_info < (3, 0):
94 tf = tempfile.NamedTemporaryFile(**args)
99 os.rename(tf.name, fn)
108 if sys.version_info >= (2, 7):
def find_xpath_attr(node, xpath, key, val):
    """Return the first element matching xpath[@key=val] under node, or None."""
    # Only plain attribute names and simple attribute values are supported;
    # anything else would need real XPath quoting.
    assert re.match(r'^[a-zA-Z-]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
    return node.find(xpath + u"[@%s='%s']" % (key, val))
116 def find_xpath_attr(node, xpath, key, val):
117 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
118 # .//node does not match if a node is a direct child of . !
119 if isinstance(xpath, unicode):
120 xpath = xpath.encode('ascii')
122 for f in node.findall(xpath):
123 if f.attrib.get(key) == val:
127 # On python2.6 the xml.etree.ElementTree.Element methods don't support
128 # the namespace parameter
129 def xpath_with_ns(path, ns_map):
130 components = [c.split(':') for c in path.split('/')]
134 replaced.append(c[0])
137 replaced.append('{%s}%s' % (ns_map[ns], tag))
138 return '/'.join(replaced)
141 def xpath_text(node, xpath, name=None, fatal=False):
142 if sys.version_info < (2, 7): # Crazy 2.6
143 xpath = xpath.encode('ascii')
148 name = xpath if name is None else name
149 raise ExtractorError('Could not find XML element %s' % name)
155 if sys.version_info < (2, 7):
156 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
158 class BaseHTMLParser(compat_html_parser.HTMLParser):
160 compat_html_parser.HTMLParser.__init__(self)
163 def loads(self, html):
168 class AttrParser(BaseHTMLParser):
169 """Modified HTMLParser that isolates a tag with the specified attribute"""
170 def __init__(self, attribute, value):
171 self.attribute = attribute
176 self.watch_startpos = False
178 BaseHTMLParser.__init__(self)
180 def error(self, message):
181 if self.error_count > 10 or self.started:
182 raise compat_html_parser.HTMLParseError(message, self.getpos())
183 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
184 self.error_count += 1
187 def handle_starttag(self, tag, attrs):
190 self.find_startpos(None)
191 if self.attribute in attrs and attrs[self.attribute] == self.value:
194 self.watch_startpos = True
196 if not tag in self.depth: self.depth[tag] = 0
199 def handle_endtag(self, tag):
201 if tag in self.depth: self.depth[tag] -= 1
202 if self.depth[self.result[0]] == 0:
204 self.result.append(self.getpos())
206 def find_startpos(self, x):
207 """Needed to put the start position of the result (self.result[1])
208 after the opening tag with the requested id"""
209 if self.watch_startpos:
210 self.watch_startpos = False
211 self.result.append(self.getpos())
212 handle_entityref = handle_charref = handle_data = handle_comment = \
213 handle_decl = handle_pi = unknown_decl = find_startpos
215 def get_result(self):
216 if self.result is None:
218 if len(self.result) != 3:
220 lines = self.html.split('\n')
221 lines = lines[self.result[1][0]-1:self.result[2][0]]
222 lines[0] = lines[0][self.result[1][1]:]
224 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
225 lines[-1] = lines[-1][:self.result[2][1]]
226 return '\n'.join(lines).strip()
227 # Hack for https://github.com/rg3/youtube-dl/issues/662
228 if sys.version_info < (2, 7, 3):
229 AttrParser.parse_endtag = (lambda self, i:
230 i + len("</scr'+'ipt>")
231 if self.rawdata[i:].startswith("</scr'+'ipt>")
232 else compat_html_parser.HTMLParser.parse_endtag(self, i))
def get_element_by_id(id, html):
    """Shortcut for get_element_by_attribute() keyed on the id attribute."""
    return get_element_by_attribute("id", id, html)
238 def get_element_by_attribute(attribute, value, html):
239 """Return the content of the tag with the specified attribute in the passed HTML document"""
240 parser = AttrParser(attribute, value)
243 except compat_html_parser.HTMLParseError:
245 return parser.get_result()
247 class MetaParser(BaseHTMLParser):
249 Modified HTMLParser that isolates a meta tag with the specified name
252 def __init__(self, name):
253 BaseHTMLParser.__init__(self)
258 def handle_starttag(self, tag, attrs):
262 if attrs.get('name') == self.name:
263 self.result = attrs.get('content')
265 def get_result(self):
268 def get_meta_content(name, html):
270 Return the content attribute from the meta tag with the given name attribute.
272 parser = MetaParser(name)
275 except compat_html_parser.HTMLParseError:
277 return parser.get_result()
280 def clean_html(html):
281 """Clean an HTML snippet into a readable string"""
283 html = html.replace('\n', ' ')
284 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
285 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
287 html = re.sub('<.*?>', '', html)
288 # Replace html entities
289 html = unescapeHTML(html)
293 def sanitize_open(filename, open_mode):
294 """Try to open the given filename, and slightly tweak it if this fails.
296 Attempts to open the given filename. If this fails, it tries to change
297 the filename slightly, step by step, until it's either able to open it
298 or it fails and raises a final exception, like the standard open()
301 It returns the tuple (stream, definitive_file_name).
305 if sys.platform == 'win32':
307 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
308 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
309 stream = open(encodeFilename(filename), open_mode)
310 return (stream, filename)
311 except (IOError, OSError) as err:
312 if err.errno in (errno.EACCES,):
315 # In case of error, try to remove win32 forbidden chars
316 alt_filename = os.path.join(
317 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
318 for path_part in os.path.split(filename)
320 if alt_filename == filename:
323 # An exception here should be caught in the caller
324 stream = open(encodeFilename(filename), open_mode)
325 return (stream, alt_filename)
328 def timeconvert(timestr):
329 """Convert RFC 2822 defined time string into system timestamp"""
331 timetuple = email.utils.parsedate_tz(timestr)
332 if timetuple is not None:
333 timestamp = email.utils.mktime_tz(timetuple)
336 def sanitize_filename(s, restricted=False, is_id=False):
337 """Sanitizes a string so it could be used as part of a filename.
338 If restricted is set, use a stricter subset of allowed characters.
339 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
341 def replace_insane(char):
342 if char == '?' or ord(char) < 32 or ord(char) == 127:
345 return '' if restricted else '\''
347 return '_-' if restricted else ' -'
348 elif char in '\\/|*<>':
350 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
352 if restricted and ord(char) > 127:
356 result = u''.join(map(replace_insane, s))
358 while '__' in result:
359 result = result.replace('__', '_')
360 result = result.strip('_')
361 # Common case of "Foreign band name - English song title"
362 if restricted and result.startswith('-_'):
368 def orderedSet(iterable):
369 """ Remove all duplicates from the input iterable """
377 def _htmlentity_transform(entity):
378 """Transforms an HTML entity to a character."""
379 # Known non-numeric HTML entity
380 if entity in compat_html_entities.name2codepoint:
381 return compat_chr(compat_html_entities.name2codepoint[entity])
383 mobj = re.match(r'#(x?[0-9]+)', entity)
385 numstr = mobj.group(1)
386 if numstr.startswith(u'x'):
388 numstr = u'0%s' % numstr
391 return compat_chr(int(numstr, base))
393 # Unknown entity in name, return its literal representation
394 return (u'&%s;' % entity)
400 assert type(s) == compat_str
403 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
406 def encodeFilename(s, for_subprocess=False):
408 @param s The name of the file
411 assert type(s) == compat_str
413 # Python 3 has a Unicode API
414 if sys.version_info >= (3, 0):
417 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
418 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
419 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
420 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
421 if not for_subprocess:
424 # For subprocess calls, encode with locale encoding
425 # Refer to http://stackoverflow.com/a/9951851/35070
426 encoding = preferredencoding()
428 encoding = sys.getfilesystemencoding()
431 return s.encode(encoding, 'ignore')
434 def encodeArgument(s):
435 if not isinstance(s, compat_str):
436 # Legacy code that uses byte strings
437 # Uncomment the following line after fixing all post processors
438 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
439 s = s.decode('ascii')
440 return encodeFilename(s, True)
443 def decodeOption(optval):
446 if isinstance(optval, bytes):
447 optval = optval.decode(preferredencoding())
449 assert isinstance(optval, compat_str)
452 def formatSeconds(secs):
454 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
456 return '%d:%02d' % (secs // 60, secs % 60)
461 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
462 if sys.version_info < (3, 2):
465 class HTTPSConnectionV3(httplib.HTTPSConnection):
466 def __init__(self, *args, **kwargs):
467 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
470 sock = socket.create_connection((self.host, self.port), self.timeout)
471 if getattr(self, '_tunnel_host', False):
475 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
477 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
479 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
480 def https_open(self, req):
481 return self.do_open(HTTPSConnectionV3, req)
482 return HTTPSHandlerV3(**kwargs)
483 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
484 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
485 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
486 if opts_no_check_certificate:
487 context.verify_mode = ssl.CERT_NONE
488 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
490 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
491 context.verify_mode = (ssl.CERT_NONE
492 if opts_no_check_certificate
493 else ssl.CERT_REQUIRED)
494 context.set_default_verify_paths()
496 context.load_default_certs()
497 except AttributeError:
499 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
501 class ExtractorError(Exception):
502 """Error during info extraction."""
503 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
504 """ tb, if given, is the original traceback (so that it can be printed out).
505 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
508 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
510 if video_id is not None:
511 msg = video_id + ': ' + msg
513 msg += u' (caused by %r)' % cause
515 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
516 super(ExtractorError, self).__init__(msg)
519 self.exc_info = sys.exc_info() # preserve original exception
521 self.video_id = video_id
523 def format_traceback(self):
524 if self.traceback is None:
526 return u''.join(traceback.format_tb(self.traceback))
529 class RegexNotFoundError(ExtractorError):
530 """Error when a regex didn't match"""
534 class DownloadError(Exception):
535 """Download Error exception.
537 This exception may be thrown by FileDownloader objects if they are not
538 configured to continue on errors. They will contain the appropriate
541 def __init__(self, msg, exc_info=None):
542 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
543 super(DownloadError, self).__init__(msg)
544 self.exc_info = exc_info
547 class SameFileError(Exception):
548 """Same File exception.
550 This exception will be thrown by FileDownloader objects if they detect
551 multiple files would have to be downloaded to the same file on disk.
556 class PostProcessingError(Exception):
557 """Post Processing exception.
559 This exception may be raised by PostProcessor's .run() method to
560 indicate an error in the postprocessing task.
562 def __init__(self, msg):
565 class MaxDownloadsReached(Exception):
566 """ --max-downloads limit has been reached. """
570 class UnavailableVideoError(Exception):
571 """Unavailable Format exception.
573 This exception will be thrown when a video is requested
574 in a format that is not available for that video.
579 class ContentTooShortError(Exception):
580 """Content Too Short exception.
582 This exception may be raised by FileDownloader objects when a file they
583 download is too small for what the server announced first, indicating
584 the connection was probably interrupted.
590 def __init__(self, downloaded, expected):
591 self.downloaded = downloaded
592 self.expected = expected
594 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
595 """Handler for HTTP requests and responses.
597 This class, when installed with an OpenerDirector, automatically adds
598 the standard headers to every HTTP request and handles gzipped and
599 deflated responses from web servers. If compression is to be avoided in
600 a particular request, the original request in the program code only has
601 to include the HTTP header "Youtubedl-No-Compression", which will be
602 removed before making the real request.
604 Part of this code was copied from:
606 http://techknack.net/python-urllib2-handlers/
608 Andrew Rowls, the author of that code, agreed to release it to the
615 return zlib.decompress(data, -zlib.MAX_WBITS)
617 return zlib.decompress(data)
620 def addinfourl_wrapper(stream, headers, url, code):
621 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
622 return compat_urllib_request.addinfourl(stream, headers, url, code)
623 ret = compat_urllib_request.addinfourl(stream, headers, url)
627 def http_request(self, req):
628 for h, v in std_headers.items():
629 if h not in req.headers:
631 if 'Youtubedl-no-compression' in req.headers:
632 if 'Accept-encoding' in req.headers:
633 del req.headers['Accept-encoding']
634 del req.headers['Youtubedl-no-compression']
635 if 'Youtubedl-user-agent' in req.headers:
636 if 'User-agent' in req.headers:
637 del req.headers['User-agent']
638 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
639 del req.headers['Youtubedl-user-agent']
641 if sys.version_info < (2, 7) and '#' in req.get_full_url():
642 # Python 2.6 is brain-dead when it comes to fragments
643 req._Request__original = req._Request__original.partition('#')[0]
644 req._Request__r_type = req._Request__r_type.partition('#')[0]
648 def http_response(self, req, resp):
651 if resp.headers.get('Content-encoding', '') == 'gzip':
652 content = resp.read()
653 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
655 uncompressed = io.BytesIO(gz.read())
656 except IOError as original_ioerror:
657 # There may be junk at the end of the file
658 # See http://stackoverflow.com/q/4928560/35070 for details
659 for i in range(1, 1024):
661 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
662 uncompressed = io.BytesIO(gz.read())
667 raise original_ioerror
668 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
669 resp.msg = old_resp.msg
671 if resp.headers.get('Content-encoding', '') == 'deflate':
672 gz = io.BytesIO(self.deflate(resp.read()))
673 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
674 resp.msg = old_resp.msg
677 https_request = http_request
678 https_response = http_response
681 def parse_iso8601(date_str, delimiter='T'):
682 """ Return a UNIX timestamp from the given date """
688 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
691 timezone = datetime.timedelta()
693 date_str = date_str[:-len(m.group(0))]
694 if not m.group('sign'):
695 timezone = datetime.timedelta()
697 sign = 1 if m.group('sign') == '+' else -1
698 timezone = datetime.timedelta(
699 hours=sign * int(m.group('hours')),
700 minutes=sign * int(m.group('minutes')))
701 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
702 dt = datetime.datetime.strptime(date_str, date_format) - timezone
703 return calendar.timegm(dt.timetuple())
706 def unified_strdate(date_str):
707 """Return a string with the date in the format YYYYMMDD"""
714 date_str = date_str.replace(',', ' ')
715 # %z (UTC offset) is only supported in python>=3.2
716 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
717 format_expressions = [
722 '%b %dst %Y %I:%M%p',
723 '%b %dnd %Y %I:%M%p',
724 '%b %dth %Y %I:%M%p',
733 '%Y-%m-%d %H:%M:%S.%f',
736 '%Y-%m-%dT%H:%M:%SZ',
737 '%Y-%m-%dT%H:%M:%S.%fZ',
738 '%Y-%m-%dT%H:%M:%S.%f0Z',
740 '%Y-%m-%dT%H:%M:%S.%f',
743 for expression in format_expressions:
745 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
748 if upload_date is None:
749 timetuple = email.utils.parsedate_tz(date_str)
751 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
754 def determine_ext(url, default_ext=u'unknown_video'):
757 guess = url.partition(u'?')[0].rpartition(u'.')[2]
758 if re.match(r'^[A-Za-z0-9]+$', guess):
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle file name: <base>.<lang>.<format> next to *filename*."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join([base, sub_lang, sub_format])
766 def date_from_str(date_str):
768 Return a datetime object from a string in the format YYYYMMDD or
769 (now|today)[+-][0-9](day|week|month|year)(s)?"""
770 today = datetime.date.today()
771 if date_str == 'now'or date_str == 'today':
773 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
774 if match is not None:
775 sign = match.group('sign')
776 time = int(match.group('time'))
779 unit = match.group('unit')
788 delta = datetime.timedelta(**{unit: time})
790 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
792 def hyphenate_date(date_str):
794 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
795 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
796 if match is not None:
797 return '-'.join(match.groups())
801 class DateRange(object):
802 """Represents a time interval between two dates"""
803 def __init__(self, start=None, end=None):
804 """start and end must be strings in the format accepted by date"""
805 if start is not None:
806 self.start = date_from_str(start)
808 self.start = datetime.datetime.min.date()
810 self.end = date_from_str(end)
812 self.end = datetime.datetime.max.date()
813 if self.start > self.end:
814 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
817 """Returns a range that only contains the given day"""
819 def __contains__(self, date):
820 """Check if the date is in the range"""
821 if not isinstance(date, datetime.date):
822 date = date_from_str(date)
823 return self.start <= date <= self.end
825 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
829 """ Returns the platform name as a compat_str """
830 res = platform.platform()
831 if isinstance(res, bytes):
832 res = res.decode(preferredencoding())
834 assert isinstance(res, compat_str)
838 def _windows_write_string(s, out):
839 """ Returns True if the string was written using special methods,
840 False if it has yet to be written out."""
841 # Adapted from http://stackoverflow.com/a/3259271/35070
844 import ctypes.wintypes
852 fileno = out.fileno()
853 except AttributeError:
854 # If the output stream doesn't have a fileno, it's virtual
856 if fileno not in WIN_OUTPUT_IDS:
859 GetStdHandle = ctypes.WINFUNCTYPE(
860 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
861 ("GetStdHandle", ctypes.windll.kernel32))
862 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
864 WriteConsoleW = ctypes.WINFUNCTYPE(
865 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
866 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
867 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
868 written = ctypes.wintypes.DWORD(0)
870 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
871 FILE_TYPE_CHAR = 0x0002
872 FILE_TYPE_REMOTE = 0x8000
873 GetConsoleMode = ctypes.WINFUNCTYPE(
874 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
875 ctypes.POINTER(ctypes.wintypes.DWORD))(
876 ("GetConsoleMode", ctypes.windll.kernel32))
877 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
879 def not_a_console(handle):
880 if handle == INVALID_HANDLE_VALUE or handle is None:
882 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
883 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
888 def next_nonbmp_pos(s):
890 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
891 except StopIteration:
895 count = min(next_nonbmp_pos(s), 1024)
898 h, s, count if count else 2, ctypes.byref(written), None)
900 raise OSError('Failed to write string')
901 if not count: # We just wrote a non-BMP character
902 assert written.value == 2
905 assert written.value > 0
906 s = s[written.value:]
910 def write_string(s, out=None, encoding=None):
913 assert type(s) == compat_str
915 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
916 if _windows_write_string(s, out):
919 if ('b' in getattr(out, 'mode', '') or
920 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
921 byt = s.encode(encoding or preferredencoding(), 'ignore')
923 elif hasattr(out, 'buffer'):
924 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
925 byt = s.encode(enc, 'ignore')
926 out.buffer.write(byt)
932 def bytes_to_intlist(bs):
935 if isinstance(bs[0], int): # Python 3
938 return [ord(c) for c in bs]
941 def intlist_to_bytes(xs):
944 if isinstance(chr(0), bytes): # Python 2
945 return ''.join([chr(x) for x in xs])
950 # Cross-platform file locking
951 if sys.platform == 'win32':
952 import ctypes.wintypes
955 class OVERLAPPED(ctypes.Structure):
957 ('Internal', ctypes.wintypes.LPVOID),
958 ('InternalHigh', ctypes.wintypes.LPVOID),
959 ('Offset', ctypes.wintypes.DWORD),
960 ('OffsetHigh', ctypes.wintypes.DWORD),
961 ('hEvent', ctypes.wintypes.HANDLE),
964 kernel32 = ctypes.windll.kernel32
965 LockFileEx = kernel32.LockFileEx
966 LockFileEx.argtypes = [
967 ctypes.wintypes.HANDLE, # hFile
968 ctypes.wintypes.DWORD, # dwFlags
969 ctypes.wintypes.DWORD, # dwReserved
970 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
971 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
972 ctypes.POINTER(OVERLAPPED) # Overlapped
974 LockFileEx.restype = ctypes.wintypes.BOOL
975 UnlockFileEx = kernel32.UnlockFileEx
976 UnlockFileEx.argtypes = [
977 ctypes.wintypes.HANDLE, # hFile
978 ctypes.wintypes.DWORD, # dwReserved
979 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
980 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
981 ctypes.POINTER(OVERLAPPED) # Overlapped
983 UnlockFileEx.restype = ctypes.wintypes.BOOL
984 whole_low = 0xffffffff
985 whole_high = 0x7fffffff
987 def _lock_file(f, exclusive):
988 overlapped = OVERLAPPED()
989 overlapped.Offset = 0
990 overlapped.OffsetHigh = 0
991 overlapped.hEvent = 0
992 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
993 handle = msvcrt.get_osfhandle(f.fileno())
994 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
995 whole_low, whole_high, f._lock_file_overlapped_p):
996 raise OSError('Locking file failed: %r' % ctypes.FormatError())
999 assert f._lock_file_overlapped_p
1000 handle = msvcrt.get_osfhandle(f.fileno())
1001 if not UnlockFileEx(handle, 0,
1002 whole_low, whole_high, f._lock_file_overlapped_p):
1003 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1008 def _lock_file(f, exclusive):
1009 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1011 def _unlock_file(f):
1012 fcntl.flock(f, fcntl.LOCK_UN)
1015 class locked_file(object):
1016 def __init__(self, filename, mode, encoding=None):
1017 assert mode in ['r', 'a', 'w']
1018 self.f = io.open(filename, mode, encoding=encoding)
1021 def __enter__(self):
1022 exclusive = self.mode != 'r'
1024 _lock_file(self.f, exclusive)
1030 def __exit__(self, etype, value, traceback):
1032 _unlock_file(self.f)
1039 def write(self, *args):
1040 return self.f.write(*args)
1042 def read(self, *args):
1043 return self.f.read(*args)
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), falling back to 'utf-8' when unset."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1051 def shell_quote(args):
1053 encoding = get_filesystem_encoding()
1055 if isinstance(a, bytes):
1056 # We may get a filename encoded with 'encodeFilename'
1057 a = a.decode(encoding)
1058 quoted_args.append(pipes.quote(a))
1059 return u' '.join(quoted_args)
1062 def takewhile_inclusive(pred, seq):
1063 """ Like itertools.takewhile, but include the latest evaluated element
1064 (the first element so that Not pred(e)) """
1071 def smuggle_url(url, data):
1072 """ Pass additional data in a URL for internal use. """
1074 sdata = compat_urllib_parse.urlencode(
1075 {u'__youtubedl_smuggle': json.dumps(data)})
1076 return url + u'#' + sdata
1079 def unsmuggle_url(smug_url, default=None):
1080 if not '#__youtubedl_smuggle' in smug_url:
1081 return smug_url, default
1082 url, _, sdata = smug_url.rpartition(u'#')
1083 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1084 data = json.loads(jsond)
1088 def format_bytes(bytes):
1091 if type(bytes) is str:
1092 bytes = float(bytes)
1096 exponent = int(math.log(bytes, 1024.0))
1097 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1098 converted = float(bytes) / float(1024 ** exponent)
1099 return u'%.2f%s' % (converted, suffix)
1102 def get_term_width():
1103 columns = compat_getenv('COLUMNS', None)
1108 sp = subprocess.Popen(
1110 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1111 out, err = sp.communicate()
1112 return int(out.split()[1])
1118 def month_by_name(name):
1119 """ Return the number of a month by (locale-independently) English name """
1122 u'January', u'February', u'March', u'April', u'May', u'June',
1123 u'July', u'August', u'September', u'October', u'November', u'December']
1125 return ENGLISH_NAMES.index(name) + 1
1130 def fix_xml_ampersands(xml_str):
1131 """Replace all the '&' by '&amp;' in XML"""
1133 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1138 def setproctitle(title):
1139 assert isinstance(title, compat_str)
1141 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1144 title_bytes = title.encode('utf-8')
1145 buf = ctypes.create_string_buffer(len(title_bytes))
1146 buf.value = title_bytes
1148 libc.prctl(15, buf, 0, 0, 0)
1149 except AttributeError:
1150 return # Strange libc, just skip this
1153 def remove_start(s, start):
1154 if s.startswith(start):
1155 return s[len(start):]
1159 def remove_end(s, end):
1161 return s[:-len(end)]
def url_basename(url):
    """Return the last path component of *url* ('' when the path is empty)."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip(u'/').split(u'/')
    return components[-1]
1170 class HEADRequest(compat_urllib_request.Request):
1171 def get_method(self):
1175 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1178 v = getattr(v, get_attr, None)
1181 return default if v is None else (int(v) * invscale // scale)
def str_or_none(v, default=None):
    """Convert *v* to compat_str, or return *default* when *v* is None."""
    if v is None:
        return default
    return compat_str(v)
1188 def str_to_int(int_str):
1189 """ A more relaxed version of int_or_none """
1192 int_str = re.sub(r'[,\.\+]', u'', int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float scaled by invscale/scale; return *default* for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1200 def parse_duration(s):
1207 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1210 res = int(m.group('secs'))
1212 res += int(m.group('mins')) * 60
1213 if m.group('hours'):
1214 res += int(m.group('hours')) * 60 * 60
1216 res += float(m.group('ms'))
def prepend_extension(filename, ext):
    """Insert *ext* before the real extension: video.mp4 -> video.<ext>.mp4."""
    root, real_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(root, ext, real_ext)
1225 def check_executable(exe, args=[]):
1226 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1227 args can be a list of arguments for a short output (like -version) """
1229 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1235 def get_exe_version(exe, args=['--version'],
1236 version_re=r'version\s+([0-9._-a-zA-Z]+)',
1237 unrecognized=u'present'):
1238 """ Returns the version of the specified executable,
1239 or False if the executable is not present """
1241 out, err = subprocess.Popen(
1243 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1246 firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
1247 m = re.search(version_re, firstline)
1254 class PagedList(object):
1256 # This is only useful for tests
1257 return len(self.getslice())
1260 class OnDemandPagedList(PagedList):
1261 def __init__(self, pagefunc, pagesize):
1262 self._pagefunc = pagefunc
1263 self._pagesize = pagesize
1265 def getslice(self, start=0, end=None):
1267 for pagenum in itertools.count(start // self._pagesize):
1268 firstid = pagenum * self._pagesize
1269 nextfirstid = pagenum * self._pagesize + self._pagesize
1270 if start >= nextfirstid:
1273 page_results = list(self._pagefunc(pagenum))
1276 start % self._pagesize
1277 if firstid <= start < nextfirstid
1281 ((end - 1) % self._pagesize) + 1
1282 if (end is not None and firstid <= end <= nextfirstid)
1285 if startv != 0 or endv is not None:
1286 page_results = page_results[startv:endv]
1287 res.extend(page_results)
1289 # A little optimization - if current page is not "full", ie. does
1290 # not contain page_size videos then we can assume that this page
1291 # is the last one - there are no more ids on further pages -
1292 # i.e. no need to query again.
1293 if len(page_results) + startv < self._pagesize:
1296 # If we got the whole page, but the next page is not interesting,
1297 # break out early as well
1298 if end == nextfirstid:
1303 class InAdvancePagedList(PagedList):
1304 def __init__(self, pagefunc, pagecount, pagesize):
1305 self._pagefunc = pagefunc
1306 self._pagecount = pagecount
1307 self._pagesize = pagesize
1309 def getslice(self, start=0, end=None):
1311 start_page = start // self._pagesize
1313 self._pagecount if end is None else (end // self._pagesize + 1))
1314 skip_elems = start - start_page * self._pagesize
1315 only_more = None if end is None else end - start
1316 for pagenum in range(start_page, end_page):
1317 page = list(self._pagefunc(pagenum))
1319 page = page[skip_elems:]
1321 if only_more is not None:
1322 if len(page) < only_more:
1323 only_more -= len(page)
1325 page = page[:only_more]
1332 def uppercase_escape(s):
1333 unicode_escape = codecs.getdecoder('unicode_escape')
1335 r'\\U[0-9a-fA-F]{8}',
1336 lambda m: unicode_escape(m.group(0))[0],
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986."""
    # On Python 2, quote() cannot handle a unicode argument directly,
    # so pre-encode it; the version check short-circuits on Python 3.
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1347 def escape_url(url):
1348 """Escape URL as suggested by RFC 3986"""
1349 url_parsed = compat_urllib_parse_urlparse(url)
1350 return url_parsed._replace(
1351 path=escape_rfc3986(url_parsed.path),
1352 params=escape_rfc3986(url_parsed.params),
1353 query=escape_rfc3986(url_parsed.query),
1354 fragment=escape_rfc3986(url_parsed.fragment)
1358 struct.pack(u'!I', 0)
1360 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1361 def struct_pack(spec, *args):
1362 if isinstance(spec, compat_str):
1363 spec = spec.encode('ascii')
1364 return struct.pack(spec, *args)
1366 def struct_unpack(spec, *args):
1367 if isinstance(spec, compat_str):
1368 spec = spec.encode('ascii')
1369 return struct.unpack(spec, *args)
1371 struct_pack = struct.pack
1372 struct_unpack = struct.unpack
1375 def read_batch_urls(batch_fd):
1377 if not isinstance(url, compat_str):
1378 url = url.decode('utf-8', 'replace')
1379 BOM_UTF8 = u'\xef\xbb\xbf'
1380 if url.startswith(BOM_UTF8):
1381 url = url[len(BOM_UTF8):]
1383 if url.startswith(('#', ';', ']')):
1387 with contextlib.closing(batch_fd) as fd:
1388 return [url for url in map(fixup, fd) if url]
def urlencode_postdata(*args, **kargs):
    """urlencode the arguments and return ASCII bytes suitable for a POST body."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1396 etree_iter = xml.etree.ElementTree.Element.iter
1397 except AttributeError: # Python <=2.6
1398 etree_iter = lambda n: n.findall('.//*')
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
    """TreeBuilder that silently drops DOCTYPE declarations."""
    def doctype(self, name, pubid, system):
        # Doctypes are irrelevant here; swallow them instead of erroring.
        pass
1406 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1407 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1408 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1409 # Fix up XML parser in Python 2.x
1410 if sys.version_info < (3, 0):
1411 for n in etree_iter(tree):
1412 if n.text is not None:
1413 if not isinstance(n.text, compat_str):
1414 n.text = n.text.decode('utf-8')
1427 def parse_age_limit(s):
1430 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1431 return int(m.group('age')) if m else US_RATINGS.get(s, None)
def strip_jsonp(code):
    """Strip a JSONP wrapper (callback name, parentheses, trailing ';') from code.

    Input that does not look like a JSONP call is returned unchanged.
    """
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(wrapper, r'\1', code)
1438 def js_to_json(code):
1441 if v in ('true', 'false', 'null'):
1443 if v.startswith('"'):
1445 if v.startswith("'"):
1447 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1454 res = re.sub(r'''(?x)
1455 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1456 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1457 [a-zA-Z_][a-zA-Z_0-9]*
1459 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1463 def qualities(quality_ids):
1464 """ Get a numeric quality value out of a list of possible values """
1467 return quality_ids.index(qid)
1473 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1476 def limit_length(s, length):
1477 """ Add ellipses to overly long strings """
1482 return s[:length - len(ELLIPSES)] + ELLIPSES
def version_tuple(v):
    """Split a dotted version string into a list of integer components."""
    return list(map(int, v.split('.')))
1490 def is_outdated_version(version, limit, assume_new=True):
1492 return not assume_new
1494 return version_tuple(version) < version_tuple(limit)
1496 return not assume_new