当前位置: 首页>>WEB开发>>正文


BMP之外的JavaScript字符串

自然而然 WEB开发 , , , , 去评论

问题描述

BMP是Basic Multilingual Plane

根据《JavaScript: The Good Parts》(JavaScript语言精粹)一书:

JavaScript was built at a time when Unicode was a 16-bit character set, so all characters in JavaScript are 16 bits wide.

这使我相信JavaScript使用UCS-2(不是UTF-16!),只能处理高达U + FFFF的字符。

进一步调查证实:

> String.fromCharCode(0x20001);

当返回Unicode字符时,fromCharCode方法似乎只使用最低的16位。试图获得U + 20001(CJK统一表意文字20001)代替返回U + 0001。

问:是否可以在JavaScript中处理post-BMP字符?


2011-07-31:《Unicode Support Shootout: The Good, the Bad, & the (Mostly) Ugly》的第十二张幻灯片很好地涵盖了与此相关的问题:

javascript,unicode,utf-16,surrogate-pairs,astral-plane

最佳解决方案

取决于你所说的“支持(support)”是什么意思。您可以使用代理对(surrogate pair)将非UCS-2字符放入JS字符串中,只要浏览器能够显示,它就会显示这些字符。

但是,JS字符串中的每个项目都是单独的UTF-16代码单元。语言层面不支持按完整字符进行处理:所有标准字符串成员(length、split、slice等)处理的都是代码单元而不是字符,因此它们会毫不在意地拆开代理对,或保留无效的代理序列。

如果你想要能够识别代理对(surrogate-aware)的方法,恐怕你得自己动手编写!例如:

/**
 * Returns the number of code points in this string.
 *
 * Every well-formed surrogate pair (high surrogate immediately followed by a
 * low surrogate) counts once; unpaired surrogates count as one each.
 *
 * @returns {number} UTF-16 length minus the number of surrogate pairs.
 */
String.prototype.getCodePointLength = function () {
    // Splitting on N pairs yields N + 1 pieces, so the pair count is pieces - 1.
    var pieces = this.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    var pairCount = pieces.length - 1;
    return this.length - pairCount;
};

/**
 * Builds a string from the given code points, encoding every value at or
 * above 0x10000 as a UTF-16 surrogate pair.
 *
 * Values are not range-checked; anything below 0x10000 (or not comparable as
 * a number) is handed to String.fromCharCode unchanged.
 *
 * @param {...number} codePoints Code points to encode (read from `arguments`).
 * @returns {string} The resulting UTF-16 string.
 */
String.fromCodePoint = function () {
    var units = [];
    for (var k = 0; k < arguments.length; k++) {
        var value = arguments[k];
        // Offset into the supplementary planes; negative for BMP values.
        var astral = value - 0x10000;
        if (astral >= 0) {
            units.push(0xD800 + (astral >> 10), 0xDC00 + (astral & 0x3FF));
        } else {
            units.push(value);
        }
    }
    return String.fromCharCode.apply(null, units);
};

次佳解决方案

我得到了与鲍比相同的结论。如果要使用包含BMP之外的unicode字符的字符串,则必须重新实现JavaScript的String方法。这是因为JavaScript将字符计为每个16位代码值。 BMP之外的符号需要两个代码值来表示。因此,您会遇到一些符号计数为两个字符,一些计数只能一个。

我重新实现了以下方法,使每个Unicode代码点都被视为单个字符:.length、.charCodeAt、.fromCharCode、.charAt、.indexOf、.lastIndexOf、.slice和.split。

你可以在jsfiddle上查看:http://jsfiddle.net/Y89Du/

以下是没有评论的代码。我测试了它,但它可能仍然有错误。欢迎评论。

if (!String.prototype.ucLength) {
    /**
     * Returns the length of this string measured in code points.
     *
     * A high surrogate directly followed by a low surrogate counts as one;
     * every other UTF-16 unit counts as one on its own.
     * Approach taken from
     * http://stackoverflow.com/questions/3744721/javascript-strings-outside-of-the-bmp
     *
     * @returns {number} Number of code points.
     */
    String.prototype.ucLength = function () {
        var surrogatePair = /[\uD800-\uDBFF][\uDC00-\uDFFF]/g;
        // N pairs split the string into N + 1 pieces.
        var pairCount = this.split(surrogatePair).length - 1;
        return this.length - pairCount;
    };
}

if (!String.prototype.codePointAt) {
    /**
     * Returns the code point found at a given code-point position.
     *
     * NOTE: `ucPos` counts code points, not UTF-16 units, which differs from
     * the ES2015 built-in of the same name (that one takes a UTF-16 unit
     * index). Because of the guard above, this function is only installed
     * when the engine has no native codePointAt.
     *
     * @param {number} ucPos Zero-based code-point position; NaN is treated as 0.
     * @returns {?number} The code point, or null when no code point sits at
     *     that position.
     */
    String.prototype.codePointAt = function (ucPos) {
        var wanted = isNaN(ucPos) ? 0 : ucPos;
        var text = String(this);
        var unit = 0;
        for (var seen = 0; unit < text.length; seen++) {
            var lead = text.charCodeAt(unit);
            var trail = text.charCodeAt(unit + 1);
            var isPair = lead >= 0xD800 && lead <= 0xDBFF &&
                trail >= 0xDC00 && trail <= 0xDFFF;
            if (seen == wanted) {
                if (isPair) {
                    // Recombine the surrogate halves into one code point.
                    return ((lead - 0xD800) * 0x400) + (trail - 0xDC00) + 0x10000;
                }
                return lead;
            }
            unit += isPair ? 2 : 1;
        }
        return null;
    };
}

if (!String.fromCodePoint) {
    /**
     * Builds a string from code points, emitting a surrogate pair for every
     * value above 0xFFFF. Only installed when String.fromCodePoint is missing.
     *
     * @param {...number} codePoints Code points to encode (read from `arguments`).
     * @returns {string} The concatenated characters.
     */
    String.fromCodePoint = function () {
        var pieces = [];
        for (var k = 0; k < arguments.length; k += 1) {
            var cp = arguments[k];
            // Offset into the supplementary planes; only used when cp > 0xFFFF.
            var rest = cp - 0x10000;
            if (cp > 0xFFFF) {
                pieces.push(String.fromCharCode(0xD800 + (rest >> 10), 0xDC00 + (rest & 0x3FF)));
            } else {
                pieces.push(String.fromCharCode(cp));
            }
        }
        return pieces.join("");
    };
}

if (!String.prototype.ucCharAt) {
    /**
     * Returns the character (whole code point) at a code-point position.
     *
     * The string is walked directly instead of going through
     * String.prototype.codePointAt: the native ES2015 codePointAt takes a
     * UTF-16 unit index, so delegating to it returns the wrong character for
     * any position after a surrogate pair.
     *
     * @param {number} ucIndex Zero-based code-point position; NaN/undefined
     *     is treated as 0.
     * @returns {string} The character at that position (one or two UTF-16
     *     units), or "" when the position is out of range.
     */
    String.prototype.ucCharAt = function (ucIndex) {
        var str = String(this);
        var target = isNaN(ucIndex) ? 0 : Number(ucIndex);
        var unit = 0;
        for (var seen = 0; unit < str.length; seen++) {
            var high = str.charCodeAt(unit);
            var low = str.charCodeAt(unit + 1);
            var isPair = 0xD800 <= high && high <= 0xDBFF &&
                0xDC00 <= low && low <= 0xDFFF;
            var width = isPair ? 2 : 1;
            if (seen === target) {
                return str.slice(unit, unit + width);
            }
            unit += width;
        }
        // Mirrors String.prototype.charAt for positions outside the string.
        return "";
    };
}

if (!String.prototype.ucIndexOf) {
    /**
     * Code-point-aware counterpart of String.prototype.indexOf.
     *
     * Finds the first occurrence of `searchStr` that starts and ends on a
     * code-point boundary, so half of a surrogate pair is never matched.
     * The string is scanned once; no slice is rebuilt per candidate
     * position.
     *
     * @param {*} searchStr Value to look for; converted with String().
     * @param {number} [ucStart=0] Code-point position to start from; NaN and
     *     negative values are treated as 0.
     * @returns {number} Code-point position of the match, or -1 if none.
     */
    String.prototype.ucIndexOf = function (searchStr, ucStart) {
        var str = String(this);
        var needle = String(searchStr);
        var needleUnits = needle.length;
        var from = Number(ucStart);
        if (isNaN(from) || from < 0) {
            from = 0;
        }

        // True when the UTF-16 units at `at` and `at + 1` form a surrogate pair.
        var isPairAt = function (at) {
            var high = str.charCodeAt(at);
            var low = str.charCodeAt(at + 1);
            return 0xD800 <= high && high <= 0xDBFF && 0xDC00 <= low && low <= 0xDFFF;
        };

        var unit = 0;
        var ucIndex = 0;
        while (unit < str.length) {
            if (ucIndex >= from && str.slice(unit, unit + needleUnits) === needle) {
                // Reject a match whose last unit is the high half of a pair
                // in the haystack: that would split a character.
                if (needleUnits === 0 || !isPairAt(unit + needleUnits - 1)) {
                    return ucIndex;
                }
            }
            unit += isPairAt(unit) ? 2 : 1;
            ucIndex += 1;
        }
        return -1;
    };
}

if (!String.prototype.ucLastIndexOf) {
    /**
     * Code-point-aware counterpart of String.prototype.lastIndexOf.
     *
     * Finds the last occurrence of `searchStr` that begins at or before
     * code-point position `ucStart` and that starts and ends on a code-point
     * boundary. The string is scanned once from the front, remembering the
     * most recent match, instead of rebuilding a slice per candidate.
     *
     * @param {*} searchStr Value to look for; converted with String().
     * @param {number} [ucStart] Highest code-point position a match may start
     *     at. NaN/undefined (or anything past the end) means "last
     *     character"; a negative value yields -1.
     * @returns {number} Code-point position of the match, or -1 if none.
     */
    String.prototype.ucLastIndexOf = function (searchStr, ucStart) {
        var str = String(this);
        var needle = String(searchStr);
        var needleUnits = needle.length;
        var limit = Number(ucStart);
        if (isNaN(limit)) {
            limit = Infinity;
        }

        // True when the UTF-16 units at `at` and `at + 1` form a surrogate pair.
        var isPairAt = function (at) {
            var high = str.charCodeAt(at);
            var low = str.charCodeAt(at + 1);
            return 0xD800 <= high && high <= 0xDBFF && 0xDC00 <= low && low <= 0xDFFF;
        };

        var found = -1;
        var unit = 0;
        var ucIndex = 0;
        while (unit < str.length && ucIndex <= limit) {
            if (str.slice(unit, unit + needleUnits) === needle) {
                // Ignore a hit that would end between the halves of a pair.
                if (needleUnits === 0 || !isPairAt(unit + needleUnits - 1)) {
                    found = ucIndex;
                }
            }
            unit += isPairAt(unit) ? 2 : 1;
            ucIndex += 1;
        }
        return found;
    };
}

if (!String.prototype.ucSlice) {
    /**
     * Code-point-aware counterpart of String.prototype.slice.
     *
     * Positions count code points, so a surrogate pair is never cut in half.
     * Fixes over the previous version: an omitted `ucStop` now means "to the
     * end of the string" (it used to drop the last character), positions
     * past the end are clamped, and the string is scanned only once.
     *
     * @param {number} [ucStart=0] First code point to include; negative
     *     values count from the end; NaN is treated as 0.
     * @param {number} [ucStop] Code point to stop before; negative values
     *     count from the end; omitted means the end of the string.
     * @returns {string} The selected characters, or "" when the range is empty.
     */
    String.prototype.ucSlice = function (ucStart, ucStop) {
        var str = String(this);

        // boundaries[k] is the UTF-16 offset where code point k begins; the
        // final entry is the end of the string.
        var boundaries = [];
        var unit = 0;
        while (unit < str.length) {
            boundaries.push(unit);
            var high = str.charCodeAt(unit);
            var low = str.charCodeAt(unit + 1);
            var isPair = 0xD800 <= high && high <= 0xDBFF &&
                0xDC00 <= low && low <= 0xDFFF;
            unit += isPair ? 2 : 1;
        }
        boundaries.push(str.length);
        var ucCount = boundaries.length - 1;

        // Converts a relative position into a value clamped to [0, ucCount].
        var resolve = function (position) {
            var n = Number(position);
            if (isNaN(n)) {
                return 0;
            }
            n = n < 0 ? Math.ceil(n) : Math.floor(n);
            return n < 0 ? Math.max(ucCount + n, 0) : Math.min(n, ucCount);
        };

        var first = resolve(ucStart);
        var last = typeof ucStop === 'undefined' ? ucCount : resolve(ucStop);
        if (first >= last) {
            return "";
        }
        return str.slice(boundaries[first], boundaries[last]);
    };
}

if (!String.prototype.ucSplit) {
    /**
     * Code-point-aware counterpart of String.prototype.split.
     *
     * With an empty delimiter the string is split into whole characters
     * (surrogate pairs stay together); any other delimiter is delegated to
     * the native split. Fix over the previous version: omitting `limit` with
     * an empty delimiter used to return [] because `0 + undefined` is NaN.
     *
     * @param {*} delimeter Separator; a value loosely equal to '' selects
     *     per-character splitting.
     * @param {number} [limit] Maximum number of items; omitted means no
     *     limit. Interpreted as an unsigned 32-bit integer, like native split.
     * @returns {string[]} The pieces.
     */
    String.prototype.ucSplit = function (delimeter, limit) {
        var str = String(this);
        // Loose comparison kept on purpose so the same inputs as before
        // select the per-character path.
        if (!(delimeter == '')) {
            return str.split(delimeter, limit);
        }

        var maxItems = typeof limit === 'undefined' ? Infinity : limit >>> 0;
        var ucChars = [];
        var unit = 0;
        while (unit < str.length && ucChars.length < maxItems) {
            var high = str.charCodeAt(unit);
            var low = str.charCodeAt(unit + 1);
            var isPair = 0xD800 <= high && high <= 0xDBFF &&
                0xDC00 <= low && low <= 0xDFFF;
            var width = isPair ? 2 : 1;
            ucChars.push(str.slice(unit, unit + width));
            unit += width;
        }
        return ucChars;
    };
}

参考文献

注:本文内容整合自google/baidu/bing辅助翻译的英文资料结果。如果您对结果不满意,可以加入我们改善翻译效果:gxnotes#qq.com(#替换为@)。

本文由《共享笔记》整理, 博文地址: https://gxnotes.com/article/151096.html,未经允许,请勿转载。
Go