js实现unicode码字符串与utf8字节数据互转详解
JS 的 string 变量在内部使用 UTF-16(Unicode)编码存储字符串,保存或传输时需要先转换成其他编码,比如 UTF-8、UTF-32 等。数据以 UTF-8 编码存储到数据库后,读取出来的字节如何还原成正确的字符串就成了问题。下面给出一种解决方案,可以正确支持中文、emoji 表情、英文混合的字符串与 UTF-8 字节数据之间的互转。
/**
 * Created by hdwang on 2019/1/28.
 *
 * Converts between JavaScript strings (UTF-16 code units) and UTF-8 byte
 * sequences. Exposes `toBytes` (string -> bytes) and `fromBytes`
 * (bytes -> string).
 */
var convertUtf8 = (function () {
  /**
   * Encodes a string as UTF-8 bytes.
   *
   * `encodeURI` already produces the UTF-8 encoding of every character it
   * escapes, written as `%XX` triplets, so the bytes are read back out of the
   * escaped text instead of being computed by hand.
   *
   * @param {string} text String to encode.
   * @returns {Uint8Array} The UTF-8 encoded bytes.
   * @throws {URIError} Thrown by `encodeURI` when `text` contains a lone surrogate.
   */
  function toBytes(text) {
    const encoded = encodeURI(text);
    const result = [];
    let i = 0;
    while (i < encoded.length) {
      const code = encoded.charCodeAt(i);
      i += 1;
      if (code === 37) {
        // 37 is '%': the next two characters are one byte written in hex.
        result.push(parseInt(encoded.slice(i, i + 2), 16));
        i += 2;
      } else {
        // Characters left unescaped by encodeURI are ASCII, i.e. one byte each.
        result.push(code);
      }
    }
    return coerceArray(result);
  }

  /**
   * Decodes UTF-8 bytes into a string.
   *
   * The lead byte of each sequence selects the sequence length. The longest
   * prefixes are tested first because every shorter prefix mask also matches
   * the longer lead bytes. Missing continuation bytes at the end of the input
   * contribute zero bits (`undefined & 0x3F` is 0) instead of raising.
   *
   * @param {ArrayLike<number>} utf8Bytes Byte values (0-255) to decode.
   * @returns {string} The decoded string.
   * @throws {RangeError} Thrown by `String.fromCodePoint` when a 4-, 5- or
   *   6-byte sequence decodes to a value that is not a valid code point
   *   (above 0x10FFFF or negative).
   */
  function utf8ByteToUnicodeStr(utf8Bytes) {
    let unicodeStr = '';
    let pos = 0;
    while (pos < utf8Bytes.length) {
      const flag = utf8Bytes[pos];
      let unicode = 0;
      if ((flag >>> 7) === 0) {
        // 0xxxxxxx: single-byte (ASCII) character.
        unicodeStr += String.fromCharCode(utf8Bytes[pos]);
        pos += 1;
      } else if ((flag & 0xFC) === 0xFC) {
        // 111111xx: 6-byte form.
        unicode = (utf8Bytes[pos] & 0x3) << 30;
        unicode |= (utf8Bytes[pos + 1] & 0x3F) << 24;
        unicode |= (utf8Bytes[pos + 2] & 0x3F) << 18;
        unicode |= (utf8Bytes[pos + 3] & 0x3F) << 12;
        unicode |= (utf8Bytes[pos + 4] & 0x3F) << 6;
        unicode |= (utf8Bytes[pos + 5] & 0x3F);
        unicodeStr += String.fromCodePoint(unicode);
        pos += 6;
      } else if ((flag & 0xF8) === 0xF8) {
        // 111110xx: 5-byte form.
        unicode = (utf8Bytes[pos] & 0x7) << 24;
        unicode |= (utf8Bytes[pos + 1] & 0x3F) << 18;
        unicode |= (utf8Bytes[pos + 2] & 0x3F) << 12;
        unicode |= (utf8Bytes[pos + 3] & 0x3F) << 6;
        unicode |= (utf8Bytes[pos + 4] & 0x3F);
        unicodeStr += String.fromCodePoint(unicode);
        pos += 5;
      } else if ((flag & 0xF0) === 0xF0) {
        // 11110xxx: 4-byte form; fromCodePoint emits the surrogate pair.
        unicode = (utf8Bytes[pos] & 0xF) << 18;
        unicode |= (utf8Bytes[pos + 1] & 0x3F) << 12;
        unicode |= (utf8Bytes[pos + 2] & 0x3F) << 6;
        unicode |= (utf8Bytes[pos + 3] & 0x3F);
        unicodeStr += String.fromCodePoint(unicode);
        pos += 4;
      } else if ((flag & 0xE0) === 0xE0) {
        // 1110xxxx: 3-byte form; the result fits in one UTF-16 code unit.
        unicode = (utf8Bytes[pos] & 0x1F) << 12;
        unicode |= (utf8Bytes[pos + 1] & 0x3F) << 6;
        unicode |= (utf8Bytes[pos + 2] & 0x3F);
        unicodeStr += String.fromCharCode(unicode);
        pos += 3;
      } else if ((flag & 0xC0) === 0xC0) {
        // 110xxxxx: 2-byte form.
        unicode = (utf8Bytes[pos] & 0x3F) << 6;
        unicode |= (utf8Bytes[pos + 1] & 0x3F);
        unicodeStr += String.fromCharCode(unicode);
        pos += 2;
      } else {
        // 10xxxxxx: a continuation byte with no lead byte; emitted as-is.
        unicodeStr += String.fromCharCode(utf8Bytes[pos]);
        pos += 1;
      }
    }
    return unicodeStr;
  }

  /**
   * Reports whether `value` is an integer-valued number.
   *
   * @param {*} value Value to test.
   * @returns {boolean} True for integer numbers, false otherwise.
   */
  function checkInt(value) {
    return Number.isInteger(value);
  }

  /**
   * Reports whether every element of an array-like is an integer in 0-255.
   *
   * @param {ArrayLike<*>} arrayish Array-like value to validate.
   * @returns {boolean} True when the length is an integer and all elements are bytes.
   */
  function checkInts(arrayish) {
    if (!checkInt(arrayish.length)) {
      return false;
    }
    for (let i = 0; i < arrayish.length; i++) {
      if (!checkInt(arrayish[i]) || arrayish[i] < 0 || arrayish[i] > 255) {
        return false;
      }
    }
    return true;
  }

  /**
   * Normalizes a byte container to a Uint8Array.
   *
   * @param {Uint8Array|Array<number>|ArrayLike<number>} arg Bytes to normalize.
   * @param {boolean} [copy] When truthy, a Uint8Array argument is copied
   *   instead of being returned as-is.
   * @returns {Uint8Array|Array<number>} The bytes as a Uint8Array (a plain
   *   array only if a Uint8Array without `slice` had to be copied).
   * @throws {Error} When an element is not an integer in 0-255 or `arg` is
   *   not array-like.
   */
  function coerceArray(arg, copy) {
    // Already a Uint8Array. Typed arrays have no `name` property, so the type
    // is tested with instanceof.
    if (arg instanceof Uint8Array) {
      if (copy) {
        if (arg.slice) {
          arg = arg.slice();
        } else {
          arg = Array.prototype.slice.call(arg);
        }
      }
      return arg;
    }

    // It's an array; check it is a valid representation of a byte sequence.
    if (Array.isArray(arg)) {
      if (!checkInts(arg)) {
        throw new Error('Array contains invalid value: ' + arg);
      }
      return new Uint8Array(arg);
    }

    // Something else, but behaves like an array (maybe a Buffer? Arguments?).
    if (checkInt(arg.length) && checkInts(arg)) {
      return new Uint8Array(arg);
    }

    throw new Error('unsupported array-like object');
  }

  return {
    toBytes: toBytes,
    fromBytes: utf8ByteToUnicodeStr
  };
})();
emoji 这类位于基本多文种平面之外的字符,在 UTF-8 中占 4 个字节,在 JS 字符串(UTF-16)中占两个码元(即代理对)。使用 String.fromCharCode 也可以实现解码,但需要分别对高、低代理项调用两次 fromCharCode,不如 String.fromCodePoint 方便。下面展示了将 UTF-8 的 4 字节序列转换为 UTF-16 代理对的过程。
// Decodes one 4-byte UTF-8 sequence at utf8Bytes[pos] into a UTF-16 surrogate
// pair using String.fromCharCode only. Reads `utf8Bytes` and `pos`, appends to
// `unicodeStr`, and uses `unicode` as scratch; all four come from the
// surrounding decoder.

// High surrogate: the upper bits of the code point, i.e. codePoint >> 10,
// gathered as 3 + 6 + 2 payload bits. The lead byte of a 4-byte sequence
// carries 3 payload bits, so it is masked with 0x07; a 0x03 mask drops the top
// bit and breaks code points from U+100000 upwards.
unicode = ((utf8Bytes[pos] & 0x07) << 8) | ((utf8Bytes[pos + 1] & 0x3F) << 2) | ((utf8Bytes[pos + 2] >> 4) & 0x03);
// Subtract 0x10000 (the leading 1 of e.g. 0x1F600). The low surrogate already
// holds the low 10 bits, so here that is 0x10000 >> 10 = binary 1000000.
unicode = unicode - parseInt('1000000', 2);
// Add the UTF-16 high-surrogate base.
unicode = 0xD800 + unicode;
console.log(unicode);
unicodeStr += String.fromCharCode(unicode);

// Low surrogate: the low 10 bits of the code point (4 + 6 payload bits).
unicode = ((utf8Bytes[pos + 2] & 0x0F) << 6) | (utf8Bytes[pos + 3] & 0x3F);
// Add the UTF-16 low-surrogate base.
unicode = 0xDC00 + unicode;
console.log(unicode);
unicodeStr += String.fromCharCode(unicode);
pos += 4;
以上所述是小编给大家介绍的js实现unicode码字符串与utf8字节数据互转详解整合,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对毛票票网站的支持!