C#检测文本文件编码的方法
C#如何检测文本文件的编码?本文为大家分享了示例代码,具体内容如下:
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;

namespace KlerksSoft
{
    /// <summary>
    /// Detects whether a text file (or byte sample) is encoded in one of the common
    /// Unicode encodings, first by byte-order mark and then by statistical heuristics,
    /// falling back to a caller-supplied default encoding.
    /// </summary>
    public static class TextFileEncodingDetector
    {
        /*
         * Simple class to handle text file encoding woes (in a primarily English-speaking tech
         *      world).
         *
         *  - This code is fully managed, no shady calls to MLang (the unmanaged codepage
         *      detection library originally developed for Internet Explorer).
         *
         *  - This class does NOT try to detect arbitrary codepages/charsets, it really only
         *      aims to differentiate between some of the most common variants of Unicode
         *      encoding, and a "default" (western / ascii-based) encoding alternative provided
         *      by the caller.
         *
         *  - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and
         *      Windows-1252 (in .Net, also incorrectly called "ASCII") encodings, we use a
         *      heuristic - so the more of the file we can sample the better the guess. If you
         *      are going to read the whole file into memory at some point, then best to pass
         *      in the whole byte byte array directly. Otherwise, decide how to trade off
         *      reliability against performance / memory usage.
         *
         *  - The UTF-8 detection heuristic only works for western text, as it relies on
         *      the presence of UTF-8 encoded accented and other characters found in the upper
         *      ranges of the Latin-1 and (particularly) Windows-1252 codepages.
         *
         *  - For more general detection routines, see existing projects / resources:
         *    - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)
         *      - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx
         *    - CharDet - Mozilla browser's detection routines
         *      - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/
         *      - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse
         *
         * Copyright Tao Klerks, Jan 2010, tao@klerks.biz
         * Licensed under the modified BSD license:
         *
         * Redistribution and use in source and binary forms, with or without modification, are
         * permitted provided that the following conditions are met:
         *  - Redistributions of source code must retain the above copyright notice, this list of
         *    conditions and the following disclaimer.
         *  - Redistributions in binary form must reproduce the above copyright notice, this list
         *    of conditions and the following disclaimer in the documentation and/or other materials
         *    provided with the distribution.
         *  - The name of the author may not be used to endorse or promote products derived from
         *    this software without specific prior written permission.
         *
         * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
         * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
         * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
         * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
         * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
         * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
         * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
         * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
         * OF SUCH DAMAGE.
         *
         */

        const long _defaultHeuristicSampleSize = 0x10000; // completely arbitrary - inappropriate for high numbers of files / high speed requirements

        /// <summary>
        /// Opens the named file for reading and detects its encoding, sampling at most
        /// the default heuristic sample size (0x10000 bytes) when no BOM is present.
        /// </summary>
        /// <param name="InputFilename">Path of the file to inspect.</param>
        /// <param name="DefaultEncoding">Encoding returned when neither a BOM nor the heuristics identify a Unicode encoding.</param>
        /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
        public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)
        {
            using (FileStream textfileStream = File.OpenRead(InputFilename))
            {
                return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize);
            }
        }

        /// <summary>
        /// Detects the encoding of an already-open, seekable file stream. The stream is read
        /// from position 0 and its original position is restored before returning, including
        /// when detection throws.
        /// </summary>
        /// <param name="InputFileStream">Readable, seekable stream to inspect.</param>
        /// <param name="DefaultEncoding">Encoding returned when nothing more specific is detected.</param>
        /// <param name="HeuristicSampleSize">Maximum number of bytes to sample for the heuristics; values smaller than the BOM probe (up to 4 bytes) are raised to the BOM probe size.</param>
        /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="InputFileStream"/> is null.</exception>
        /// <exception cref="ArgumentException">The stream cannot be read or cannot seek.</exception>
        public static Encoding DetectTextFileEncoding(FileStream InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)
        {
            // ArgumentNullException takes (paramName, message) - the reverse of ArgumentException.
            if (InputFileStream == null)
            {
                throw new ArgumentNullException("InputFileStream", "Must provide a valid Filestream!");
            }

            if (!InputFileStream.CanRead)
            {
                throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");
            }

            if (!InputFileStream.CanSeek)
            {
                throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");
            }

            long originalPos = InputFileStream.Position;
            byte[] sampleBytes;

            try
            {
                InputFileStream.Position = 0;
                long fileLength = InputFileStream.Length;

                // First read only what we need for BOM detection.
                byte[] bomBytes = new byte[fileLength > 4 ? 4 : fileLength];
                int bomBytesRead = ReadFully(InputFileStream, bomBytes, 0, bomBytes.Length);
                if (bomBytesRead < bomBytes.Length)
                {
                    // Stream ended early: drop the unread tail so zero padding is not analysed as content.
                    Array.Resize(ref bomBytes, bomBytesRead);
                }

                Encoding bomEncoding = DetectBOMBytes(bomBytes);
                if (bomEncoding != null)
                {
                    return bomEncoding;
                }

                // BOM detection failed, going for heuristics now.
                // The sample must at least hold the bytes already read for the BOM probe,
                // otherwise the copy below would overflow the destination.
                long sampleLength = Math.Min(HeuristicSampleSize, fileLength);
                if (sampleLength < bomBytes.Length)
                {
                    sampleLength = bomBytes.Length;
                }

                sampleBytes = new byte[sampleLength];
                Array.Copy(bomBytes, sampleBytes, bomBytes.Length);

                int remaining = sampleBytes.Length - bomBytes.Length;
                if (remaining > 0)
                {
                    int read = ReadFully(InputFileStream, sampleBytes, bomBytes.Length, remaining);
                    if (read < remaining)
                    {
                        Array.Resize(ref sampleBytes, bomBytes.Length + read);
                    }
                }
            }
            finally
            {
                InputFileStream.Position = originalPos;
            }

            // Test byte array content.
            Encoding heuristicEncoding = DetectUnicodeInByteSampleByHeuristics(sampleBytes);
            return heuristicEncoding != null ? heuristicEncoding : DefaultEncoding;
        }

        /// <summary>
        /// Detects the encoding of text held in a byte array: BOM first, then heuristics.
        /// </summary>
        /// <param name="TextData">Raw bytes of the text.</param>
        /// <param name="DefaultEncoding">Encoding returned when nothing more specific is detected.</param>
        /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="TextData"/> is null.</exception>
        public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)
        {
            if (TextData == null)
            {
                throw new ArgumentNullException("TextData", "Must provide a valid text data byte array!");
            }

            Encoding encodingFound = DetectBOMBytes(TextData);
            if (encodingFound != null)
            {
                return encodingFound;
            }

            // Test byte array content.
            encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);
            return encodingFound != null ? encodingFound : DefaultEncoding;
        }

        /// <summary>
        /// Maps a leading byte-order mark to its encoding.
        /// </summary>
        /// <param name="BOMBytes">The first bytes of the data; only the first four are examined.</param>
        /// <returns>
        /// UTF-16 LE/BE, UTF-8, UTF-7 or UTF-32 LE/BE when the corresponding BOM is present;
        /// null when no known BOM is found or fewer than two bytes are supplied.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="BOMBytes"/> is null.</exception>
        public static Encoding DetectBOMBytes(byte[] BOMBytes)
        {
            if (BOMBytes == null)
            {
                throw new ArgumentNullException("BOMBytes", "Must provide a valid BOM byte array!");
            }

            if (BOMBytes.Length < 2)
            {
                return null;
            }

            // FF FE is UTF-16 LE, unless it is followed by 00 00, in which case it is the UTF-32 LE BOM.
            if (BOMBytes[0] == 0xff
                && BOMBytes[1] == 0xfe
                && (BOMBytes.Length < 4
                    || BOMBytes[2] != 0
                    || BOMBytes[3] != 0))
            {
                return Encoding.Unicode;
            }

            if (BOMBytes[0] == 0xfe && BOMBytes[1] == 0xff)
            {
                return Encoding.BigEndianUnicode;
            }

            if (BOMBytes.Length < 3)
            {
                return null;
            }

            if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)
            {
                return Encoding.UTF8;
            }

            if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76)
            {
                return Encoding.UTF7;
            }

            if (BOMBytes.Length < 4)
            {
                return null;
            }

            if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)
            {
                return Encoding.UTF32;
            }

            if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)
            {
                return Encoding.GetEncoding(12001); // code page 12001 = UTF-32 big-endian
            }

            return null;
        }

        /// <summary>
        /// Guesses, without a BOM, whether a byte sample is UTF-16 LE, UTF-16 BE or UTF-8.
        /// </summary>
        /// <param name="SampleBytes">Bytes taken from the start of the text.</param>
        /// <returns>
        /// <see cref="Encoding.Unicode"/>, <see cref="Encoding.BigEndianUnicode"/> or
        /// <see cref="Encoding.UTF8"/> when the corresponding heuristic fires; otherwise null
        /// (including for an empty sample).
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="SampleBytes"/> is null.</exception>
        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
        {
            if (SampleBytes == null)
            {
                throw new ArgumentNullException("SampleBytes", "Must provide a valid sample byte array!");
            }

            if (SampleBytes.Length == 0)
            {
                // Nothing to measure; also avoids dividing by a zero length below.
                return null;
            }

            long oddBinaryNullsInSample = 0;
            long evenBinaryNullsInSample = 0;
            long suspiciousUTF8SequenceCount = 0;
            long suspiciousUTF8BytesTotal = 0;
            long likelyUSASCIIBytesInSample = 0;

            // Cycle through, keeping count of binary null positions, possible UTF-8
            // sequences from upper ranges of Windows-1252, and probable US-ASCII
            // character counts.
            int skipUTF8Bytes = 0;
            for (long currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
            {
                byte current = SampleBytes[currentPos];

                // binary null distribution
                if (current == 0)
                {
                    if (currentPos % 2 == 0)
                    {
                        evenBinaryNullsInSample++;
                    }
                    else
                    {
                        oddBinaryNullsInSample++;
                    }
                }

                // likely US-ASCII characters
                if (IsCommonUSASCIIByte(current))
                {
                    likelyUSASCIIBytesInSample++;
                }

                // suspicious sequences (look like UTF-8); bytes inside a sequence that was
                // already counted are skipped so they are not counted twice.
                if (skipUTF8Bytes == 0)
                {
                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
                    if (lengthFound > 0)
                    {
                        suspiciousUTF8SequenceCount++;
                        suspiciousUTF8BytesTotal += lengthFound;
                        skipUTF8Bytes = lengthFound - 1;
                    }
                }
                else
                {
                    skipUTF8Bytes--;
                }
            }

            double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
            double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

            // 1: UTF-16 LE - in english / european environments, this is usually characterized by a
            //    high proportion of odd binary nulls (starting at 0), with (as this is text) a low
            //    proportion of even binary nulls.
            //    The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
            //    60% nulls where you do expect nulls) are completely arbitrary.
            if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
            {
                return Encoding.Unicode;
            }

            // 2: UTF-16 BE - in english / european environments, this is usually characterized by a
            //    high proportion of even binary nulls (starting at 0), with (as this is text) a low
            //    proportion of odd binary nulls.
            //    The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
            //    60% nulls where you do expect nulls) are completely arbitrary.
            if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
            {
                return Encoding.BigEndianUnicode;
            }

            // 3: UTF-8 - Martin Dürst outlines a method for detecting whether something CAN be UTF-8 content
            //    using regexp, in his w3c.org unicode FAQ entry:
            //    http://www.w3.org/International/questions/qa-forms-utf-8
            //    The same grammar is checked here directly on the bytes. Decoding the sample with
            //    Encoding.ASCII first would replace every byte above 0x7F with '?', so a pattern
            //    applied to that string could never reject malformed multi-byte sequences.
            if (IsWellFormedUTF8Text(SampleBytes))
            {
                // Unfortunately, just the fact that it CAN be UTF-8 doesn't tell you much about probabilities.
                // If all the characters are in the 0-127 range, no harm done, most western charsets are same as UTF-8 in these ranges.
                // If some of the characters were in the upper range (western accented characters), however, they would likely be mangled to 2-byte by the UTF-8 encoding process.
                // So, we need to play stats.

                // The "Random" likelihood of any pair of randomly generated characters being one
                //   of these "suspicious" character sequences is:
                //     128 / (256 * 256) = 0.2%.
                //
                // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
                //   character range, so we assume that more than 1 in 500,000 of these character
                //   sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
                //
                // We can only assume these character sequences will be rare if we ALSO assume that this
                //   IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
                //   not already suspicious sequences) should be plain US-ASCII bytes. This, I
                //   arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
                //   approx 40%, so the chances of hitting this threshold by accident in random data are
                //   VERY low).
                long nonSuspiciousBytes = SampleBytes.Length - suspiciousUTF8BytesTotal;
                bool enoughSuspiciousSequences = suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1;

                // When every byte belongs to a suspicious sequence the US-ASCII proportion cannot be evaluated.
                bool mostlyUSASCII = nonSuspiciousBytes == 0
                    || likelyUSASCIIBytesInSample * 1.0 / nonSuspiciousBytes >= 0.8;

                if (enoughSuspiciousSequences && mostlyUSASCII)
                {
                    return Encoding.UTF8;
                }
            }

            return null;
        }

        /// <summary>
        /// Reads until <paramref name="count"/> bytes have been stored or the stream ends,
        /// because a single Stream.Read call may return fewer bytes than requested.
        /// </summary>
        /// <returns>The number of bytes actually read.</returns>
        private static int ReadFully(Stream stream, byte[] buffer, int offset, int count)
        {
            int total = 0;
            while (total < count)
            {
                int read = stream.Read(buffer, offset + total, count - total);
                if (read <= 0)
                {
                    break;
                }

                total += read;
            }

            return total;
        }

        /// <summary>
        /// Checks that the whole sample consists of tab/LF/CR, printable US-ASCII (0x20-0x7E)
        /// and well-formed UTF-8 multi-byte sequences (no overlongs, no surrogates, nothing above
        /// U+10FFFF). A multi-byte sequence cut off by the end of the sample counts as malformed.
        /// </summary>
        private static bool IsWellFormedUTF8Text(byte[] bytes)
        {
            long length = bytes.Length;
            long pos = 0;

            while (pos < length)
            {
                byte lead = bytes[pos];

                if (lead == 0x09 || lead == 0x0A || lead == 0x0D || (lead >= 0x20 && lead <= 0x7E))
                {
                    pos++;
                    continue;
                }

                int trailing;              // number of continuation bytes that must follow
                byte firstTrailMin = 0x80; // allowed range of the FIRST continuation byte
                byte firstTrailMax = 0xBF;

                if (lead >= 0xC2 && lead <= 0xDF)
                {
                    trailing = 1;
                }
                else if (lead == 0xE0)
                {
                    trailing = 2;
                    firstTrailMin = 0xA0; // excludes overlong encodings
                }
                else if (lead == 0xED)
                {
                    trailing = 2;
                    firstTrailMax = 0x9F; // excludes UTF-16 surrogates
                }
                else if (lead >= 0xE1 && lead <= 0xEF)
                {
                    trailing = 2;
                }
                else if (lead == 0xF0)
                {
                    trailing = 3;
                    firstTrailMin = 0x90; // excludes overlong encodings
                }
                else if (lead >= 0xF1 && lead <= 0xF3)
                {
                    trailing = 3;
                }
                else if (lead == 0xF4)
                {
                    trailing = 3;
                    firstTrailMax = 0x8F; // nothing above U+10FFFF
                }
                else
                {
                    return false;
                }

                if (pos + trailing >= length)
                {
                    return false;
                }

                byte firstTrail = bytes[pos + 1];
                if (firstTrail < firstTrailMin || firstTrail > firstTrailMax)
                {
                    return false;
                }

                for (int k = 2; k <= trailing; k++)
                {
                    byte trail = bytes[pos + k];
                    if (trail < 0x80 || trail > 0xBF)
                    {
                        return false;
                    }
                }

                pos += trailing + 1;
            }

            return true;
        }

        /// <summary>
        /// True for tab, LF, CR and the printable US-ASCII range 0x20-0x7E.
        /// </summary>
        private static bool IsCommonUSASCIIByte(byte testByte)
        {
            return testByte == 0x0A                        // lf
                || testByte == 0x0D                        // cr
                || testByte == 0x09                        // tab
                || (testByte >= 0x20 && testByte <= 0x7E); // punctuation, digits, capital and lower case letters
        }

        /// <summary>
        /// Returns the length (2 or 3) of the UTF-8 sequence starting at
        /// <paramref name="currentPos"/> when it is one of the specific sequences this method
        /// lists as "suspicious", or 0 otherwise - including when the sample ends before the
        /// sequence is complete.
        /// </summary>
        private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
        {
            // Bytes available from currentPos (inclusive) to the end of the sample.
            long available = SampleBytes.Length - currentPos;
            if (available < 2)
            {
                return 0;
            }

            byte lead = SampleBytes[currentPos];
            byte second = SampleBytes[currentPos + 1];

            switch (lead)
            {
                case 0xC2:
                    if (second == 0x81 || second == 0x8D || second == 0x8F
                        || second == 0x90 || second == 0x9D
                        || (second >= 0xA0 && second <= 0xBF))
                    {
                        return 2;
                    }

                    return 0;

                case 0xC3:
                    return (second >= 0x80 && second <= 0xBF) ? 2 : 0;

                case 0xC5:
                    if (second == 0x92 || second == 0x93
                        || second == 0xA0 || second == 0xA1
                        || second == 0xB8 || second == 0xBD || second == 0xBE)
                    {
                        return 2;
                    }

                    return 0;

                case 0xC6:
                    return second == 0x92 ? 2 : 0;

                case 0xCB:
                    return (second == 0x86 || second == 0x9C) ? 2 : 0;

                case 0xE2:
                    if (available < 3)
                    {
                        return 0;
                    }

                    byte third = SampleBytes[currentPos + 2];
                    if (second == 0x80)
                    {
                        switch (third)
                        {
                            case 0x93:
                            case 0x94:
                            case 0x98:
                            case 0x99:
                            case 0x9A:
                            case 0x9C:
                            case 0x9D:
                            case 0x9E:
                            case 0xA0:
                            case 0xA1:
                            case 0xA2:
                            case 0xA6:
                            case 0xB0:
                            case 0xB9:
                            case 0xBA:
                                return 3;
                            default:
                                return 0;
                        }
                    }

                    if ((second == 0x82 && third == 0xAC)
                        || (second == 0x84 && third == 0xA2))
                    {
                        return 3;
                    }

                    return 0;

                default:
                    return 0;
            }
        }
    }
}
使用方法:
// Detect the encoding of the file at the given path; Encoding.Default is returned
// when neither a BOM nor the heuristics identify a Unicode encoding.
Encoding fileEncoding = TextFileEncodingDetector.DetectTextFileEncoding("youfilepath", Encoding.Default);
以上就是本文的全部内容,希望对大家学习C#程序设计有所帮助。