+ // Java uses "Modified UTF-8" rather than UTF-8. This requires special
+ // handling for codepoints above 0xFFFF, which we implement below.
+ unsigned char* newchars = MALLOC(str_len, "String chars");
+ unsigned char* next_newchar = newchars;
+ uint64_t utf8_len = 0;
+
+ const unsigned char* jchars = (*env)->GetStringUTFChars(env, str, NULL);
+ const unsigned char* next_char = jchars;
+ const unsigned char* end = jchars + str_len;
+
+ #define COPY_CHAR_FROM_JAVA do { *next_newchar = *next_char; next_newchar++; next_char++; utf8_len++; } while (0)
+
+ while (next_char < end) {
+ if (!(*next_char & 0b10000000)) {
+ CHECK(*next_char != 0); // Bad Modified UTF-8 string, but we'll just cut here
+ COPY_CHAR_FROM_JAVA;
+ } else if ((*next_char & 0b11100000) == 0b11000000) {
+ if (next_char + 2 > end) { CHECK(false); break; } // bad string
+ uint16_t codepoint = 0;
+ codepoint |= (((uint16_t)(*next_char & 0x1f)) << 6);
+ codepoint |= *(next_char + 1) & 0x3f;
+ if (codepoint == 0) {
+ // We should really never get null codepoints, but Java allows them
+ // (Modified UTF-8 encodes U+0000 as the two bytes 0xC0 0x80). Just skip them.
+ next_char += 2;
+ } else {
+ COPY_CHAR_FROM_JAVA;
+ COPY_CHAR_FROM_JAVA;
+ }
+ } else if ((*next_char & 0b11110000) == 0b11100000) {
+ if (next_char + 3 > end) { CHECK(false); break; } // bad string
+ if (*next_char == 0b11101101 && (*(next_char + 1) & 0b11110000) == 0b10100000) {
+ // A high-surrogate code unit (U+D800..U+DBFF) should indicate we have a
+ // codepoint above 0xFFFF, which is where Modified UTF-8 and UTF-8 diverge;
+ // it is expected to be followed by a 3-byte low surrogate.
+ if (next_char + 6 > end) { CHECK(false); break; } // bad string
+ CHECK(*(next_char + 3) == 0b11101101);
+ CHECK((*(next_char + 4) & 0b11110000) == 0b10110000);
+ // Calculate the codepoint per https://docs.oracle.com/javase/1.5.0/docs/guide/jni/spec/types.html#wp16542
+ uint32_t codepoint = 0x10000;
+ codepoint += ((((uint32_t)*(next_char + 1)) & 0x0f) << 16);
+ codepoint += ((((uint32_t)*(next_char + 2)) & 0x3f) << 10);
+ codepoint += ((((uint32_t)*(next_char + 4)) & 0x0f) << 6);
+ codepoint += (((uint32_t)*(next_char + 5)) & 0x3f);
+ *next_newchar = 0b11110000 | ((codepoint >> 18) & 0b111);
+ next_newchar++;
+ *next_newchar = 0b10000000 | ((codepoint >> 12) & 0b111111);
+ next_newchar++;
+ *next_newchar = 0b10000000 | ((codepoint >> 6) & 0b111111);
+ next_newchar++;
+ *next_newchar = 0b10000000 | ( codepoint & 0b111111);
+ next_newchar++;
+ next_char += 6;
+ utf8_len += 4;
+ } else {
+ COPY_CHAR_FROM_JAVA;
+ COPY_CHAR_FROM_JAVA;
+ COPY_CHAR_FROM_JAVA;
+ }
+ } else {
+ // Bad string
+ CHECK(false);
+ break;
+ }
+ }