js实现unicode码字符串与utf8字节数据互转详解
JS 的 string 变量在内部以 UTF-16 码元序列(Unicode 编码)存储字符串,保存或传输时必须先转换成某种字节编码,比如 UTF-8、UTF-32 等。数据存储到数据库中时使用的是 UTF-8 编码,读取出来后如何转换回正确的字符串就成了问题。下面给出解决方案,可以正确支持中文、emoji 表情、英文混合的字符串编码互转。
/**
 * Created by hdwang on 2019/1/28.
*/
/**
 * Converts between JavaScript strings and UTF-8 byte arrays.
 *
 * Exposes two functions:
 *   - toBytes(text):  string -> Uint8Array of UTF-8 bytes
 *   - fromBytes(bytes): array-like of UTF-8 bytes -> string
 */
var convertUtf8 = (function () {
  'use strict';

  // Emitted in place of a sequence that cannot be turned into a character.
  const REPLACEMENT_CHARACTER = '\uFFFD';
  // Largest value String.fromCodePoint accepts.
  const MAX_CODE_POINT = 0x10FFFF;
  // Character code of '%', which introduces an escaped byte in encodeURI output.
  const PERCENT_SIGN = 37;

  /**
   * Encodes a string as UTF-8.
   *
   * The text is first passed through encodeURI, which writes every non-ASCII
   * character (and '%' itself) as "%XX" escapes of its UTF-8 bytes; the
   * escapes are then parsed back into numeric byte values.
   *
   * @param {string} text - String to encode.
   * @returns {Uint8Array} The UTF-8 bytes of `text`.
   * @throws {URIError} If `text` contains a lone surrogate (raised by encodeURI).
   */
  function toBytes(text) {
    const escaped = encodeURI(text);
    const result = [];
    let i = 0;
    while (i < escaped.length) {
      const code = escaped.charCodeAt(i);
      i += 1;
      if (code === PERCENT_SIGN) {
        // "%XX": the two hex digits after '%' are one byte.
        result.push(Number.parseInt(escaped.slice(i, i + 2), 16));
        i += 2;
      } else {
        // Unescaped characters are ASCII, so the char code is the byte.
        result.push(code);
      }
    }
    return coerceArray(result);
  }

  /**
   * Returns how many bytes a sequence occupies, judged by its first byte.
   *
   * The checks run from the longest prefix to the shortest because every
   * longer prefix also matches the shorter masks. Legacy 5- and 6-byte lead
   * bytes are recognised so that the whole sequence is skipped as one unit.
   * A byte that is not a lead byte (10xxxxxx) is reported as length 1.
   *
   * @param {number} lead - First byte of the sequence.
   * @returns {number} Sequence length in bytes (1 to 6).
   */
  function sequenceLength(lead) {
    if ((lead >>> 7) === 0) { return 1; }
    if ((lead & 0xFC) === 0xFC) { return 6; }
    if ((lead & 0xF8) === 0xF8) { return 5; }
    if ((lead & 0xF0) === 0xF0) { return 4; }
    if ((lead & 0xE0) === 0xE0) { return 3; }
    if ((lead & 0xC0) === 0xC0) { return 2; }
    return 1;
  }

  /**
   * Decodes UTF-8 bytes into a string.
   *
   * Decoding is lenient: continuation bytes are not validated (only their low
   * six bits are used), and a byte that is not a valid lead byte is copied
   * through as the character with that code. Two malformed cases are mapped
   * to U+FFFD instead of producing a wrong character or an exception:
   *   - a multi-byte sequence cut off by the end of the input (decoding stops
   *     there, since nothing but the incomplete sequence remains);
   *   - a sequence whose value exceeds U+10FFFF.
   *
   * @param {ArrayLike<number>} utf8Bytes - Byte values (Array, Uint8Array, ...).
   * @returns {string} The decoded string.
   */
  function utf8ByteToUnicodeStr(utf8Bytes) {
    let unicodeStr = '';
    let pos = 0;
    while (pos < utf8Bytes.length) {
      const lead = utf8Bytes[pos];
      const length = sequenceLength(lead);

      if (length === 1) {
        unicodeStr += String.fromCharCode(lead);
        pos += 1;
        continue;
      }

      if (pos + length > utf8Bytes.length) {
        unicodeStr += REPLACEMENT_CHARACTER;
        break;
      }

      // Payload bits of the lead byte, then six bits per following byte.
      // Multiplication is used instead of << so that 5- and 6-byte forms
      // cannot overflow the 32-bit range of the bitwise operators.
      let codePoint = lead & (0xFF >> length);
      for (let offset = 1; offset < length; offset += 1) {
        codePoint = codePoint * 64 + (utf8Bytes[pos + offset] & 0x3F);
      }

      unicodeStr += codePoint > MAX_CODE_POINT
        ? REPLACEMENT_CHARACTER
        : String.fromCodePoint(codePoint);
      pos += length;
    }
    return unicodeStr;
  }

  /**
   * @param {*} value - Value to test.
   * @returns {boolean} True when `value` is an integer-valued number.
   */
  function checkInt(value) {
    return Number.isInteger(value);
  }

  /**
   * @param {ArrayLike<*>} arrayish - Object with a `length` and indexed items.
   * @returns {boolean} True when every item is an integer in 0..255.
   */
  function checkInts(arrayish) {
    if (!checkInt(arrayish.length)) { return false; }
    for (let i = 0; i < arrayish.length; i += 1) {
      const item = arrayish[i];
      if (!checkInt(item) || item < 0 || item > 255) {
        return false;
      }
    }
    return true;
  }

  /**
   * Normalises byte input to a Uint8Array.
   *
   * @param {ArrayLike<number>} arg - Uint8Array, Array, or array-like of bytes.
   * @param {boolean} [copy] - When truthy, a Uint8Array argument is copied
   *   instead of being returned as-is.
   * @returns {Uint8Array} The bytes as a Uint8Array.
   * @throws {Error} If `arg` holds a value outside 0..255 or is not array-like.
   */
  function coerceArray(arg, copy) {
    // Already a typed byte array.
    if (arg instanceof Uint8Array) {
      return copy ? arg.slice() : arg;
    }

    // A plain array: check that every item is a valid byte.
    if (Array.isArray(arg)) {
      if (!checkInts(arg)) {
        throw new Error('Array contains invalid value: ' + arg);
      }
      return new Uint8Array(arg);
    }

    // Something else that behaves like an array of bytes.
    if (checkInt(arg.length) && checkInts(arg)) {
      return new Uint8Array(arg);
    }

    throw new Error('unsupported array-like object');
  }

  return {
    toBytes: toBytes,
    fromBytes: utf8ByteToUnicodeStr
  };
})();
emoji 这类字符在 JS 字符串中占两个 UTF-16 码元(即一个代理对)。使用 String.fromCharCode 也可以实现转换,但需要调用两次 fromCharCode,没有 String.fromCodePoint 方便。下面展示了将 utf-8 的 4 字节序列转换为 unicode(utf-16)代理对的过程。
// Manual conversion of one 4-byte UTF-8 sequence (11110uuu 10uuzzzz 10yyyyyy
// 10xxxxxx) starting at utf8Bytes[pos] into a UTF-16 surrogate pair appended
// to unicodeStr.

// High half: code point bits 20..10, i.e. 3 bits from byte 0, 6 bits from
// byte 1 and the top 2 payload bits of byte 2. The lead byte needs the mask
// 0x07; masking with 0x03 would drop the top bit and break U+100000 and above.
unicode=((utf8Bytes[pos]&0x07)<<8)|((utf8Bytes[pos+1]&0x3F)<<2)|((utf8Bytes[pos+2]>>4)&0x03);
// UTF-16 encodes (code point - 0x10000). The low 10 bits are already split
// off, so subtracting 0x10000 >> 10 = 0x40 here is equivalent.
unicode=unicode-0x40;
// Add the high-surrogate base.
unicode=0xD800+unicode;
console.log(unicode);
unicodeStr+=String.fromCharCode(unicode);
// Low half: code point bits 9..0, i.e. the low 4 bits of byte 2 and the
// 6 payload bits of byte 3.
unicode=((utf8Bytes[pos+2]&0x0F)<<6)|(utf8Bytes[pos+3]&0x3F);
// Add the low-surrogate base.
unicode=0xDC00+unicode;
console.log(unicode);
unicodeStr+=String.fromCharCode(unicode);
pos+=4;
以上所述是小编给大家介绍的js实现unicode码字符串与utf8字节数据互转详解整合,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对毛票票网站的支持!