更新于 2024-01-06
ES6可以很好地兼容4字节的UTF-8,否则四字节的UTF-8字符(码点大于U+FFFF)会被拆成两个UTF-16代理项(surrogate),并被当作两个独立的字符分别编码。
如果原文中不包含4字节UTF-8字符(例如emoji表情),ES6和ES5下效果一样。
程序优先使用ES6的codePoint来处理。
// Usage example: encode a string that mixes ASCII, CJK text and an emoji
// into UTF-8 bytes, then decode those bytes back into a string.
var words = 'hello world!你好,世界!😁';
// getBytes returns the UTF-8 encoding as a plain array of byte values.
var bytes = Utf8Encoding.getBytes(words);
console.log(bytes)
// getString decodes the byte array back into a string.
console.log(Utf8Encoding.getString(bytes))
//可以看到,最后的一个表情被编码为四个字节:240, 159, 152, 129。
[104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 33, 228, 189, 160, 229, 165,
189, 239, 188, 140, 228, 184, 150, 231, 149, 140, 239, 188, 129, 240, 159, 152, 129]
hello world!你好,世界!😁
/**
 * Minimal UTF-8 encoder/decoder that works on ES5 and later engines.
 *
 * - getBytes(str)   -> array of byte values (0-255) holding the UTF-8 encoding.
 * - getString(bytes) -> string decoded from an array-like of UTF-8 byte values.
 *
 * Surrogate pairs are combined by hand, so code points above U+FFFF are always
 * written as a single 4-byte sequence without relying on ES6 string iteration
 * (and without building code at runtime via `new Function`).
 */
var Utf8Encoding = (function () {
  // Upper bound on the number of UTF-16 code units handed to one
  // String.fromCharCode.apply call; passing a huge array to apply can exceed
  // the engine's argument limit and throw.
  var CHUNK_SIZE = 0x2000;

  /**
   * Appends the UTF-8 encoding of one code point to allBytes.
   * @param {number} code - Code point to encode.
   * @param {number[]} allBytes - Output array, mutated in place.
   */
  function codepoint2bytes(code, allBytes) {
    if (code >= 0x10000) {
      // 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      allBytes.push(
        0xF0 | ((code & 0x1C0000) >>> 18),
        0x80 | ((code & 0x3F000) >>> 12),
        0x80 | ((code & 0xFC0) >>> 6),
        0x80 | (code & 0x3F));
      return;
    }
    if (code >= 0x800) {
      // 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
      allBytes.push(
        0xE0 | ((code & 0xF000) >>> 12),
        0x80 | ((code & 0xFC0) >>> 6),
        0x80 | (code & 0x3F));
      return;
    }
    if (code >= 0x80) {
      // 2-byte form: 110xxxxx 10xxxxxx
      allBytes.push(0xC0 | ((code & 0x7C0) >>> 6), 0x80 | (code & 0x3F));
      return;
    }
    // Single-byte (ASCII) form.
    allBytes.push(code);
  }

  /**
   * Encodes a string as UTF-8.
   * @param {string} c - Text to encode.
   * @returns {number[]} The UTF-8 byte values.
   */
  function unicode2utf8bytes(c) {
    var allBytes = [];
    for (var i = 0, l = c.length; i < l; i++) {
      var code = c.charCodeAt(i);
      // A high surrogate followed by a low surrogate forms one code point
      // above U+FFFF; merge the pair so it is written as 4 bytes.
      if (code >= 0xD800 && code <= 0xDBFF && i + 1 < l) {
        var low = c.charCodeAt(i + 1);
        if (low >= 0xDC00 && low <= 0xDFFF) {
          code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
          i++;
        }
      }
      // An unpaired surrogate falls through and is encoded as a 3-byte
      // sequence, matching what iterating the string by code point produced.
      codepoint2bytes(code, allBytes);
    }
    return allBytes;
  }

  /**
   * Decodes UTF-8 byte values into a string.
   *
   * The lead byte alone selects the sequence length; continuation bytes are
   * masked but not validated, and a byte in 0x80-0xBF found in lead position
   * is passed through as a code point.
   *
   * @param {ArrayLike<number>} c - UTF-8 byte values.
   * @returns {string} The decoded text.
   * @throws {RangeError} If a decoded value is not in 0..0x10FFFF.
   */
  function utf8bytes2unicode(c) {
    var result = '';
    var units = [];
    for (var i = 0, l = c.length; i < l; i++) {
      var code = c[i];
      if (code >= 0xF0) {
        code = ((code & 7) << 18) | ((c[i + 1] & 0x3f) << 12) | ((c[i + 2] & 0x3f) << 6) | (c[i + 3] & 0x3f);
        i += 3;
      } else if (code >= 0xE0) {
        code = ((code & 0xf) << 12) | ((c[i + 1] & 0x3f) << 6) | (c[i + 2] & 0x3f);
        i += 2;
      } else if (code >= 0xC0) {
        code = ((code & 0x1f) << 6) | (c[i + 1] & 0x3f);
        i++;
      }
      // Written as a negated range test so NaN/undefined are rejected too.
      if (!(code >= 0 && code <= 0x10FFFF)) {
        throw new RangeError('Invalid code point ' + code);
      }
      if (code > 0xFFFF) {
        // Split into a UTF-16 surrogate pair so String.fromCharCode suffices.
        code -= 0x10000;
        units.push(0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF));
      } else {
        units.push(code);
      }
      // Flush in bounded chunks to keep the apply() argument count small.
      if (units.length >= CHUNK_SIZE) {
        result += String.fromCharCode.apply(null, units);
        units.length = 0;
      }
    }
    return result + String.fromCharCode.apply(null, units);
  }

  return {
    getString: utf8bytes2unicode,
    getBytes: unicode2utf8bytes
  };
})();