mirror of
https://github.com/protobufjs/protobuf.js.git
synced 2025-12-08 20:58:55 +00:00
* fix utf8 -> utf16 decoding bug on surrogate pairs. This fixes https://github.com/protobufjs/protobuf.js/issues/1473 — the custom utf8 -> utf16 decoder appears to be subtly flawed. From my reading, it appears the chunking mechanism doesn't account for surrogate pairs at the end of a chunk, causing variable-size chunks. A larger chunk followed by a smaller chunk leaves behind garbage that'll be included in the latter chunk. It looks like the chunking mechanism was added to prevent stack overflows when calling `fromCharCode` with too many args. From some benchmarking, it appears putting utf16 code units in an array and spreading that into `fromCharCode` wasn't helping performance much anyway. I simplified it significantly. Here's a repro of the existing encoding bug in a fuzzing suite: https://repl.it/@turbio/oh-no-our-strings#decoder.js
* fix lint
* add test case for surrogate pair bug
Co-authored-by: Alexander Fenster <fenster@google.com>
97 lines
2.9 KiB
JavaScript
97 lines
2.9 KiB
JavaScript
"use strict";
|
|
|
|
/**
|
|
* A minimal UTF8 implementation for number arrays.
|
|
* @memberof util
|
|
* @namespace
|
|
*/
|
|
var utf8 = exports;
|
|
|
|
/**
 * Computes how many bytes a string occupies once encoded as UTF8.
 *
 * A high surrogate immediately followed by a low surrogate counts as one
 * four-byte sequence; any other UTF-16 code unit of 0x800 or above counts as
 * three bytes.
 * @param {string} string String
 * @returns {number} Byte length
 */
utf8.length = function utf8_length(string) {
    var total = 0;
    var count = string.length;
    var index = 0;
    while (index < count) {
        var unit = string.charCodeAt(index++);
        if (unit < 0x80) {
            total += 1;
        } else if (unit < 0x800) {
            total += 2;
        } else if ((unit & 0xFC00) === 0xD800 && (string.charCodeAt(index) & 0xFC00) === 0xDC00) {
            // `index` already points at the low surrogate; consume it as well
            index++;
            total += 4;
        } else {
            total += 3;
        }
    }
    return total;
};
|
|
|
|
/**
 * Reads the UTF8 bytes in `buffer[start, end)` as a string.
 *
 * Bytes at or beyond `end` are never inspected. Ill-formed input (a stray
 * continuation byte, an invalid lead byte, an overlong form, an encoded
 * surrogate, a value above U+10FFFF, or a sequence cut short by `end`) is
 * replaced with U+FFFD, and decoding resumes at the first byte that was not
 * consumed, so no input byte is silently dropped or decoded twice.
 * @param {Uint8Array} buffer Source buffer
 * @param {number} start Source start
 * @param {number} end Source end
 * @returns {string} String read
 */
utf8.read = function utf8_read(buffer, start, end) {
    if (end - start < 1) {
        return "";
    }

    var str = "",
        i = start,
        lead,   // first byte of the current sequence
        next,   // candidate continuation byte, or -1 when `end` is reached
        needed, // continuation bytes still expected
        cp,     // code point accumulated so far
        lower,  // smallest value allowed for the next continuation byte
        upper;  // largest value allowed for the next continuation byte

    while (i < end) {
        lead = buffer[i++];
        if (lead <= 0x7F) {
            str += String.fromCharCode(lead);
            continue;
        }

        lower = 0x80;
        upper = 0xBF;
        if (lead >= 0xC2 && lead <= 0xDF) {
            // 0xC0 and 0xC1 could only start overlong forms, hence 0xC2
            needed = 1;
            cp = lead & 0x1F;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            needed = 2;
            cp = lead & 0x0F;
            if (lead === 0xE0) {
                lower = 0xA0; // anything lower would be an overlong form
            } else if (lead === 0xED) {
                upper = 0x9F; // anything higher would encode a surrogate
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            needed = 3;
            cp = lead & 0x07;
            if (lead === 0xF0) {
                lower = 0x90; // anything lower would be an overlong form
            } else if (lead === 0xF4) {
                upper = 0x8F; // anything higher would exceed U+10FFFF
            }
        } else {
            // stray continuation byte or a byte that can never start a sequence
            str += "\uFFFD";
            continue;
        }

        for (; needed > 0; --needed) {
            next = i < end ? buffer[i] : -1;
            if (!(next >= lower && next <= upper)) {
                break; // leave the offending byte for the next iteration
            }
            cp = (cp << 6) | (next & 0x3F);
            // only the first continuation byte has a restricted range
            lower = 0x80;
            upper = 0xBF;
            ++i;
        }

        if (needed > 0) {
            str += "\uFFFD";
        } else if (cp > 0xFFFF) {
            cp -= 0x10000;
            str += String.fromCharCode(0xD800 + (cp >> 10), 0xDC00 + (cp & 0x3FF));
        } else {
            str += String.fromCharCode(cp);
        }
    }

    return str;
};
|
|
|
|
/**
 * Writes a string as UTF8 bytes.
 *
 * A high surrogate immediately followed by a low surrogate is written as one
 * four-byte sequence. An unpaired surrogate cannot be represented in
 * well-formed UTF8, so it is written as U+FFFD (`EF BF BD`) instead; that is
 * still three bytes, the same size an unpaired surrogate is counted as by
 * the three-byte rule for code units of 0x800 and above.
 * @param {string} string Source string
 * @param {Uint8Array} buffer Destination buffer
 * @param {number} offset Destination offset
 * @returns {number} Bytes written
 */
utf8.write = function utf8_write(string, buffer, offset) {
    var start = offset,
        c1, // character 1
        c2; // character 2
    for (var i = 0; i < string.length; ++i) {
        c1 = string.charCodeAt(i);
        if (c1 < 128) {
            buffer[offset++] = c1;
        } else if (c1 < 2048) {
            buffer[offset++] = c1 >> 6 | 192;
            buffer[offset++] = c1 & 63 | 128;
        } else if ((c1 & 0xFC00) === 0xD800 && ((c2 = string.charCodeAt(i + 1)) & 0xFC00) === 0xDC00) {
            // combine the surrogate pair into a single supplementary code point
            c1 = 0x10000 + ((c1 & 0x03FF) << 10) + (c2 & 0x03FF);
            ++i;
            buffer[offset++] = c1 >> 18 | 240;
            buffer[offset++] = c1 >> 12 & 63 | 128;
            buffer[offset++] = c1 >> 6 & 63 | 128;
            buffer[offset++] = c1 & 63 | 128;
        } else {
            if ((c1 & 0xF800) === 0xD800) {
                // unpaired surrogate: emit the replacement character rather than ill-formed UTF8
                c1 = 0xFFFD;
            }
            buffer[offset++] = c1 >> 12 | 224;
            buffer[offset++] = c1 >> 6 & 63 | 128;
            buffer[offset++] = c1 & 63 | 128;
        }
    }
    return offset - start;
};
|