From 32973ea2749f8efd05b543dd774763513013b38b Mon Sep 17 00:00:00 2001 From: Matt Corallo Date: Mon, 2 Oct 2023 16:43:48 +0000 Subject: [PATCH] [Java] Properly convert strings to/from Modified UTF-8 Fixes #136 --- java_strings.py | 132 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 120 insertions(+), 12 deletions(-) diff --git a/java_strings.py b/java_strings.py index c3e4cb32..946e1650 100644 --- a/java_strings.py +++ b/java_strings.py @@ -463,25 +463,133 @@ typedef jlongArray int64_tArray; typedef jbyteArray int8_tArray; typedef jshortArray int16_tArray; -static inline jstring str_ref_to_java(JNIEnv *env, const char* chars, size_t len) { - // Sadly we need to create a temporary because Java can't accept a char* without a 0-terminator - char* conv_buf = MALLOC(len + 1, "str conv buf"); - memcpy(conv_buf, chars, len); - conv_buf[len] = 0; - jstring ret = (*env)->NewStringUTF(env, conv_buf); - FREE(conv_buf); +static inline jstring str_ref_to_java(JNIEnv *env, const unsigned char* chars, size_t len) { + // Java uses "Modified UTF-8" rather than UTF-8. This requires special + // handling for codepoints above 0xFFFF, which get converted from four + // bytes to six. We don't know upfront how many codepoints in the string + // are above 0xFFFF, so we just allocate an extra 33% up front and waste a + // bit of space. + unsigned char* java_chars = MALLOC(len * 3 / 2 + 1, "str conv buf"); + unsigned char* next_java_char = java_chars; + const unsigned char* next_in_char = chars; + const unsigned char* end = chars + len; + #define COPY_CHAR_TO_JAVA do { *next_java_char = *next_in_char; next_java_char++; next_in_char++; } while (0) + + while (next_in_char < end) { + if (!*next_in_char) break; + if (!(*next_in_char & 0b10000000)) { + COPY_CHAR_TO_JAVA; + } else if ((*next_in_char & 0b11100000) == 0b11000000) { + if (next_in_char + 2 > end) { CHECK(false); break; } // bad string + COPY_CHAR_TO_JAVA; + COPY_CHAR_TO_JAVA; + } else if ((*next_in_char & 0b11110000) == 0b11100000) { + if (next_in_char + 3 > end) { CHECK(false); break; } // bad string + COPY_CHAR_TO_JAVA; + COPY_CHAR_TO_JAVA; + COPY_CHAR_TO_JAVA; + } else if ((*next_in_char & 0b11111000) == 0b11110000) { + if (next_in_char + 4 > end) { CHECK(false); break; } // bad string + uint32_t codepoint = 0; + codepoint |= (((uint32_t)*(next_in_char )) & 0b00000111) << 18; + codepoint |= (((uint32_t)*(next_in_char + 1)) & 0b00111111) << 12; + codepoint |= (((uint32_t)*(next_in_char + 2)) & 0b00111111) << 6; + codepoint |= (((uint32_t)*(next_in_char + 3)) & 0b00111111) << 0; + codepoint -= 0x10000; + *next_java_char = 0b11101101; + next_java_char++; + *next_java_char = 0b10100000 | ((codepoint >> 16) & 0b00001111); + next_java_char++; + *next_java_char = 0b10000000 | ((codepoint >> 10) & 0b00111111); + next_java_char++; + *next_java_char = 0b11101101; + next_java_char++; + *next_java_char = 0b10110000 | ((codepoint >> 6) & 0b00001111); + next_java_char++; + *next_java_char = 0b10000000 | ((codepoint >> 0) & 0b00111111); + next_java_char++; + next_in_char += 4; + } else { + // Bad string + CHECK(false); + break; + } + } + *next_java_char = 0; + jstring ret = (*env)->NewStringUTF(env, java_chars); + FREE(java_chars); return ret; } static inline LDKStr java_to_owned_str(JNIEnv *env, jstring str) { uint64_t str_len = (*env)->GetStringUTFLength(env, str); - char* newchars = MALLOC(str_len + 1, "String chars"); - const char* jchars = (*env)->GetStringUTFChars(env, str, NULL); - memcpy(newchars, jchars, str_len); - newchars[str_len] = 0; + // Java uses "Modified UTF-8" rather than UTF-8. This requires special + // handling for codepoints above 0xFFFF, which we implement below. + unsigned char* newchars = MALLOC(str_len, "String chars"); + unsigned char* next_newchar = newchars; + uint64_t utf8_len = 0; + + const unsigned char* jchars = (*env)->GetStringUTFChars(env, str, NULL); + const unsigned char* next_char = jchars; + const unsigned char* end = jchars + str_len; + + #define COPY_CHAR_FROM_JAVA do { *next_newchar = *next_char; next_newchar++; next_char++; utf8_len++; } while (0) + + while (next_char < end) { + if (!(*next_char & 0b10000000)) { + CHECK(*next_char != 0); // Bad Modified UTF-8 string, but we'll just cut here + COPY_CHAR_FROM_JAVA; + } else if ((*next_char & 0b11100000) == 0b11000000) { + if (next_char + 2 > end) { CHECK(false); break; } // bad string + uint16_t codepoint = 0; + codepoint |= (((uint16_t)(*next_char & 0x1f)) << 6); + codepoint |= *(next_char + 1) & 0x3f; + if (codepoint == 0) { + // We should really never get null codepoints, but java allows them. + // Just skip it. + next_char += 2; + } else { + COPY_CHAR_FROM_JAVA; + COPY_CHAR_FROM_JAVA; + } + } else if ((*next_char & 0b11110000) == 0b11100000) { + if (next_char + 3 > end) { CHECK(false); break; } // bad string + if (*next_char == 0b11101101 && (*(next_char + 1) & 0b11110000) == 0b10100000) { + // Surrogate code unit shoul indicate we have a codepoint above + // 0xFFFF, which is where Modified UTF-8 and UTF-8 diverge. + if (next_char + 6 > end) { CHECK(false); break; } // bad string + CHECK(*(next_char + 3) == 0b11101101); + CHECK((*(next_char + 4) & 0b11110000) == 0b10110000); + // Calculate the codepoint per https://docs.oracle.com/javase/1.5.0/docs/guide/jni/spec/types.html#wp16542 + uint32_t codepoint = 0x10000; + codepoint += ((((uint32_t)*(next_char + 1)) & 0x0f) << 16); + codepoint += ((((uint32_t)*(next_char + 2)) & 0x3f) << 10); + codepoint += ((((uint32_t)*(next_char + 4)) & 0x0f) << 6); + codepoint += (((uint32_t)*(next_char + 5)) & 0x3f); + *next_newchar = 0b11110000 | ((codepoint >> 18) & 0b111); + next_newchar++; + *next_newchar = 0b10000000 | ((codepoint >> 12) & 0b111111); + next_newchar++; + *next_newchar = 0b10000000 | ((codepoint >> 6) & 0b111111); + next_newchar++; + *next_newchar = 0b10000000 | ( codepoint & 0b111111); + next_newchar++; + next_char += 6; + utf8_len += 4; + } else { + COPY_CHAR_FROM_JAVA; + COPY_CHAR_FROM_JAVA; + COPY_CHAR_FROM_JAVA; + } + } else { + // Bad string + CHECK(false); + break; + } + } (*env)->ReleaseStringUTFChars(env, str, jchars); LDKStr res = { .chars = newchars, - .len = str_len, + .len = utf8_len, .chars_is_owned = true }; return res; -- 2.39.5