Javascript:UTF-8和Unicode互转

更新于 2024-01-06

ES6可以很好地兼容4字节的UTF-8;否则(ES5下)一个4字节的UTF-8字符会被当作两个UTF-16代码单元(代理对)分别处理。
如果原文中不包含4字节UTF-8字符(例如emoji表情),ES6和ES5下效果一样。

程序优先使用ES6的codePoint来处理。

示例

// Encode a sample string (ASCII, CJK and an emoji) into an array of UTF-8 byte values.
var words = 'hello world!你好,世界!😁';
var bytes = Utf8Encoding.getBytes(words);
console.log(bytes);

// Decode the byte array back into a string and print it.
console.log(Utf8Encoding.getString(bytes));

输出

//可以看到,最后的一个表情被编码为四个字节:240, 159, 152, 129。
[104, 101, 108, 108, 111, 32, 119, 111, 114, 108, 100, 33, 228, 189, 160, 229, 165, 
189, 44, 228, 184, 150, 231, 149, 140, 33, 240, 159, 152, 129]

hello world!你好,世界!😁

源代码

/**
 * Converts between JavaScript strings and arrays of UTF-8 byte values.
 *
 * Written in ES5-compatible syntax. Surrogate pairs are combined and split by
 * hand, so characters outside the Basic Multilingual Plane (for example
 * emoji) are handled as single 4-byte sequences in every environment, without
 * relying on String.fromCodePoint / codePointAt or on dynamic code generation.
 *
 * Exposes:
 *   getBytes(str)    -> Array of numbers (UTF-8 byte values)
 *   getString(bytes) -> string
 */
var Utf8Encoding = (function () {

  // Number of UTF-16 code units handed to String.fromCharCode per call.
  // Function.prototype.apply spreads the array into arguments, and engines
  // throw a RangeError when the argument count gets too large, so the decoded
  // units are converted in slices.
  var CHUNK_SIZE = 0x2000;

  /**
   * Appends the UTF-8 encoding of one code point to allBytes.
   *
   * @param {number} code - The code point (or lone surrogate value) to encode.
   * @param {number[]} allBytes - Output array; 1 to 4 byte values are pushed.
   */
  var codepoint2bytes = function (code, allBytes) {
    if (code >= 0x10000) {
      // 4-byte form: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      allBytes.push(
        0xF0 | ((code & 0x1C0000) >>> 18),
        0x80 | ((code & 0x3F000) >>> 12),
        0x80 | ((code & 0xFC0) >>> 6),
        0x80 | (code & 0x3F));
      return;
    }
    if (code >= 0x800) {
      // 3-byte form: 1110xxxx 10xxxxxx 10xxxxxx
      allBytes.push(
        0xE0 | ((code & 0xF000) >>> 12),
        0x80 | ((code & 0xFC0) >>> 6),
        0x80 | (code & 0x3F));
      return;
    }
    if (code >= 0x80) {
      // 2-byte form: 110xxxxx 10xxxxxx
      allBytes.push(0xC0 | ((code & 0x7C0) >>> 6), 0x80 | (code & 0x3F));
      return;
    }
    // Single byte, pushed unchanged.
    allBytes.push(code);
  };

  /**
   * Encodes a string as UTF-8.
   *
   * A high surrogate immediately followed by a low surrogate is combined into
   * one code point and emitted as a 4-byte sequence. An unpaired surrogate is
   * passed to the encoder as-is and therefore comes out as a 3-byte sequence.
   *
   * @param {string} c - The string to encode.
   * @returns {number[]} The UTF-8 byte values; empty for an empty string.
   */
  var unicode2utf8bytes = function (c) {
    var allBytes = [];
    for (var i = 0, l = c.length; i < l; i++) {
      var code = c.charCodeAt(i);
      if (code >= 0xD800 && code <= 0xDBFF && i + 1 < l) {
        var low = c.charCodeAt(i + 1);
        if (low >= 0xDC00 && low <= 0xDFFF) {
          // Merge the surrogate pair into the supplementary code point.
          code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
          i++;
        }
      }
      codepoint2bytes(code, allBytes);
    }
    return allBytes;
  };

  /**
   * Decodes UTF-8 byte values into a string.
   *
   * The sequence length is chosen from the lead byte only; continuation bytes
   * are masked with 0x3F but not validated, and a missing trailing byte
   * contributes zero bits. A decoded value above 0x10FFFF (which only
   * malformed input can produce) becomes U+FFFD instead of raising an error.
   *
   * @param {ArrayLike<number>} c - Indexable collection of byte values.
   * @returns {string} The decoded string; empty for empty input.
   */
  var utf8bytes2unicode = function (c) {
    var units = [];
    for (var i = 0, l = c.length; i < l; i++) {
      var code = c[i];
      var point;
      if (code >= 0xF0) {
        point = ((code & 7) << 18) | ((c[i + 1] & 0x3f) << 12) | ((c[i + 2] & 0x3f) << 6) | (c[i + 3] & 0x3f);
        i += 3;
      } else if (code >= 0xE0) {
        point = ((code & 0xf) << 12) | ((c[i + 1] & 0x3f) << 6) | (c[i + 2] & 0x3f);
        i += 2;
      } else if (code >= 0xC0) {
        point = ((code & 0x1f) << 6) | (c[i + 1] & 0x3f);
        i++;
      } else {
        point = code;
      }

      if (point > 0x10FFFF) {
        // Not a valid code point; substitute the replacement character.
        units.push(0xFFFD);
      } else if (point > 0xFFFF) {
        // Split into a surrogate pair so String.fromCharCode can represent it.
        point -= 0x10000;
        units.push(0xD800 | (point >>> 10), 0xDC00 | (point & 0x3FF));
      } else {
        units.push(point);
      }
    }

    // A surrogate pair split across two slices is rejoined by concatenation.
    var result = '';
    for (var start = 0; start < units.length; start += CHUNK_SIZE) {
      result += String.fromCharCode.apply(null, units.slice(start, start + CHUNK_SIZE));
    }
    return result;
  };

  return {
    getString: utf8bytes2unicode,
    getBytes: unicode2utf8bytes
  };
})();