C# 检测文本文件编码的方法
C# 如何检测文本文件的编码？本文为大家分享了示例代码，具体内容如下：
using System;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace KlerksSoft
{
    /// <summary>
    /// Detects the encoding of western text data by looking for a byte order mark first and
    /// falling back to statistical heuristics (UTF-16 null distribution, UTF-8 sequences).
    /// </summary>
    public static class TextFileEncodingDetector
    {
        /*
         * Simple class to handle text file encoding woes (in a primarily English-speaking tech
         *      world).
         *
         *  - This code is fully managed, no shady calls to MLang (the unmanaged codepage
         *      detection library originally developed for Internet Explorer).
         *
         *  - This class does NOT try to detect arbitrary codepages/charsets, it really only
         *      aims to differentiate between some of the most common variants of Unicode
         *      encoding, and a "default" (western / ascii-based) encoding alternative provided
         *      by the caller.
         *
         *  - As there is no "Reliable" way to distinguish between UTF-8 (without BOM) and
         *      Windows-1252 encodings, we use a heuristic - so the more of the file we can
         *      sample the better the guess. If you are going to read the whole file into
         *      memory at some point, then best to pass in the whole byte array directly.
         *      Otherwise, decide how to trade off reliability against performance / memory
         *      usage.
         *      (Review note: Encoding.ASCII in .NET is strict 7-bit ASCII, not Windows-1252,
         *      so the UTF-8 validation below maps bytes to chars one-to-one instead of
         *      decoding through Encoding.ASCII.)
         *
         *  - The UTF-8 detection heuristic only works for western text, as it relies on
         *      the presence of UTF-8 encoded accented and other characters found in the upper
         *      ranges of the Latin-1 and (particularly) Windows-1252 codepages.
         *
         *  - For more general detection routines, see existing projects / resources:
         *    - MLang - Microsoft library originally for IE6, available in Windows XP and later APIs now (I think?)
         *      - MLang .Net bindings: http://www.codeproject.com/KB/recipes/DetectEncoding.aspx
         *    - CharDet - Mozilla browser's detection routines
         *      - Ported to Java then .Net: http://www.conceptdevelopment.net/Localization/NCharDet/
         *      - Ported straight to .Net: http://code.google.com/p/chardetsharp/source/browse
         *
         * Copyright Tao Klerks, Jan 2010, tao@klerks.biz
         * Licensed under the modified BSD license:
         *
        Redistribution and use in source and binary forms, with or without modification, are
        permitted provided that the following conditions are met:
         - Redistributions of source code must retain the above copyright notice, this list of
        conditions and the following disclaimer.
         - Redistributions in binary form must reproduce the above copyright notice, this list
        of conditions and the following disclaimer in the documentation and/or other materials
        provided with the distribution.
         - The name of the author may not be used to endorse or promote products derived from
        this software without specific prior written permission.
        THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
        INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
        A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
        DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
        BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
        PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
        WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
        ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
        OF SUCH DAMAGE.
         *
         */

        const long _defaultHeuristicSampleSize = 0x10000; // completely arbitrary - inappropriate for high numbers of files / high speed requirements

        // Structural UTF-8 validator adapted from Martin Duerst's W3C FAQ entry:
        //   http://www.w3.org/International/questions/qa-forms-utf-8
        // The input string must hold one char per byte (char value == byte value).
        // The final optional group tolerates ONE incomplete multi-byte sequence at the very end,
        // because a sample taken from the start of a larger file may be cut mid-character.
        // Built once: the pattern never changes between calls.
        private static readonly Regex _utf8Validator = new Regex(
            @"\A(?:"
            + @"[\x09\x0A\x0D\x20-\x7E]"            // ASCII
            + @"|[\xC2-\xDF][\x80-\xBF]"             // non-overlong 2-byte
            + @"|\xE0[\xA0-\xBF][\x80-\xBF]"         // excluding overlongs
            + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"  // straight 3-byte
            + @"|\xED[\x80-\x9F][\x80-\xBF]"         // excluding surrogates
            + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"      // planes 1-3
            + @"|[\xF1-\xF3][\x80-\xBF]{3}"          // planes 4-15
            + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"      // plane 16
            + @")*"
            + @"(?:[\xC2-\xDF]|[\xE0-\xEF][\x80-\xBF]?|[\xF0-\xF4][\x80-\xBF]{0,2})?"
            + @"\z",
            RegexOptions.Compiled);

        /// <summary>
        /// Opens the named file and detects its encoding, sampling at most the default
        /// heuristic sample size (0x10000 bytes) from the start of the file.
        /// </summary>
        /// <param name="InputFilename">Path of the file to inspect.</param>
        /// <param name="DefaultEncoding">Encoding returned when no Unicode encoding is detected.</param>
        /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
        public static Encoding DetectTextFileEncoding(string InputFilename, Encoding DefaultEncoding)
        {
            using (FileStream textfileStream = File.OpenRead(InputFilename))
            {
                return DetectTextFileEncoding(textfileStream, DefaultEncoding, _defaultHeuristicSampleSize);
            }
        }

        /// <summary>
        /// Detects the encoding of the data in an open, seekable file stream. The stream is read
        /// from its beginning and its original position is restored before returning, also when
        /// an exception is thrown.
        /// </summary>
        /// <param name="InputFileStream">Readable and seekable stream to inspect.</param>
        /// <param name="DefaultEncoding">Encoding returned when no Unicode encoding is detected.</param>
        /// <param name="HeuristicSampleSize">
        /// Maximum number of bytes to sample for the heuristics. Values smaller than the number of
        /// bytes already read for BOM detection (up to 4) are raised to that number.
        /// </param>
        /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="InputFileStream"/> is null.</exception>
        /// <exception cref="ArgumentException">The stream cannot be read or cannot seek.</exception>
        public static Encoding DetectTextFileEncoding(FileStream InputFileStream, Encoding DefaultEncoding, long HeuristicSampleSize)
        {
            if (InputFileStream == null)
                throw new ArgumentNullException("InputFileStream", "Must provide a valid Filestream!");
            if (!InputFileStream.CanRead)
                throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");
            if (!InputFileStream.CanSeek)
                throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");

            long originalPos = InputFileStream.Position;
            try
            {
                InputFileStream.Position = 0;
                long streamLength = InputFileStream.Length;

                // First read only what we need for BOM detection.
                byte[] bomBytes = new byte[Math.Min(streamLength, 4L)];
                int bomRead = ReadFully(InputFileStream, bomBytes, 0, bomBytes.Length);
                if (bomRead < bomBytes.Length)
                    Array.Resize(ref bomBytes, bomRead); // stream ended early; drop the unread padding

                Encoding encodingFound = DetectBOMBytes(bomBytes);
                if (encodingFound != null)
                    return encodingFound;

                // BOM detection failed, going for heuristics now.
                // The sample must at least hold the bytes already read for the BOM check.
                long sampleLength = Math.Min(streamLength, Math.Max(HeuristicSampleSize, (long)bomBytes.Length));
                byte[] sampleBytes = new byte[sampleLength];
                Array.Copy(bomBytes, sampleBytes, bomBytes.Length);

                int sampleRead = bomBytes.Length;
                if (sampleBytes.Length > sampleRead)
                    sampleRead += ReadFully(InputFileStream, sampleBytes, sampleRead, sampleBytes.Length - sampleRead);
                if (sampleRead < sampleBytes.Length)
                    Array.Resize(ref sampleBytes, sampleRead); // zero padding would skew the null statistics

                // Test byte array content.
                encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);
                return encodingFound ?? DefaultEncoding;
            }
            finally
            {
                InputFileStream.Position = originalPos;
            }
        }

        /// <summary>
        /// Detects the encoding of text held in a byte array: BOM first, then heuristics.
        /// </summary>
        /// <param name="TextData">The raw text bytes (ideally the whole file).</param>
        /// <param name="DefaultEncoding">Encoding returned when no Unicode encoding is detected.</param>
        /// <returns>The detected encoding, or <paramref name="DefaultEncoding"/>.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="TextData"/> is null.</exception>
        public static Encoding DetectTextByteArrayEncoding(byte[] TextData, Encoding DefaultEncoding)
        {
            if (TextData == null)
                throw new ArgumentNullException("TextData", "Must provide a valid text data byte array!");

            Encoding encodingFound = DetectBOMBytes(TextData);
            if (encodingFound != null)
                return encodingFound;

            // Test byte array content.
            encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);
            return encodingFound ?? DefaultEncoding;
        }

        /// <summary>
        /// Checks the start of a byte array for a Unicode byte order mark.
        /// </summary>
        /// <param name="BOMBytes">The first bytes of the data; only the first four are examined.</param>
        /// <returns>
        /// The encoding announced by the BOM (UTF-16 LE/BE, UTF-8, UTF-7, UTF-32 LE/BE),
        /// or null when no BOM is recognised.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="BOMBytes"/> is null.</exception>
        public static Encoding DetectBOMBytes(byte[] BOMBytes)
        {
            if (BOMBytes == null)
                throw new ArgumentNullException("BOMBytes", "Must provide a valid BOM byte array!");

            if (BOMBytes.Length < 2)
                return null;

            // FF FE is UTF-16 LE, unless it is followed by 00 00, which is the UTF-32 LE BOM.
            if (BOMBytes[0] == 0xff
                && BOMBytes[1] == 0xfe
                && (BOMBytes.Length < 4 || BOMBytes[2] != 0 || BOMBytes[3] != 0))
                return Encoding.Unicode;

            if (BOMBytes[0] == 0xfe && BOMBytes[1] == 0xff)
                return Encoding.BigEndianUnicode;

            if (BOMBytes.Length < 3)
                return null;

            if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)
                return Encoding.UTF8;

            // The UTF-7 signature is "+/v" followed by one of '8', '9', '+' or '/'. When a fourth
            // byte is available it must be one of those, otherwise ordinary text that merely
            // starts with "+/v" would be reported as UTF-7.
            if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76
                && (BOMBytes.Length < 4
                    || BOMBytes[3] == 0x38
                    || BOMBytes[3] == 0x39
                    || BOMBytes[3] == 0x2b
                    || BOMBytes[3] == 0x2f))
                return Encoding.UTF7;

            if (BOMBytes.Length < 4)
                return null;

            if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)
                return Encoding.UTF32;

            if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)
                return Encoding.GetEncoding(12001); // UTF-32 big-endian code page

            return null;
        }

        /// <summary>
        /// Guesses whether a BOM-less byte sample is UTF-16 LE, UTF-16 BE or UTF-8, based on the
        /// distribution of binary nulls and on the presence of UTF-8 sequences that encode
        /// characters from the upper ranges of Latin-1 / Windows-1252.
        /// </summary>
        /// <param name="SampleBytes">The bytes to analyse.</param>
        /// <returns>The guessed Unicode encoding, or null when none of the heuristics match.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="SampleBytes"/> is null.</exception>
        public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
        {
            if (SampleBytes == null)
                throw new ArgumentNullException("SampleBytes", "Must provide a valid sample byte array!");

            // Nothing to measure; also avoids dividing by a zero length below.
            if (SampleBytes.Length == 0)
                return null;

            long oddBinaryNullsInSample = 0;
            long evenBinaryNullsInSample = 0;
            long suspiciousUTF8SequenceCount = 0;
            long suspiciousUTF8BytesTotal = 0;
            long likelyUSASCIIBytesInSample = 0;

            // Cycle through, keeping count of binary null positions, possible UTF-8
            // sequences from upper ranges of Windows-1252, and probable US-ASCII
            // character counts.
            int skipUTF8Bytes = 0;
            for (int currentPos = 0; currentPos < SampleBytes.Length; currentPos++)
            {
                byte current = SampleBytes[currentPos];

                // Binary null distribution.
                if (current == 0)
                {
                    if (currentPos % 2 == 0)
                        evenBinaryNullsInSample++;
                    else
                        oddBinaryNullsInSample++;
                }

                // Likely US-ASCII characters.
                if (IsCommonUSASCIIByte(current))
                    likelyUSASCIIBytesInSample++;

                // Suspicious sequences (look like UTF-8). Bytes that belong to a sequence
                // already counted are skipped so they cannot start a new one.
                if (skipUTF8Bytes == 0)
                {
                    int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
                    if (lengthFound > 0)
                    {
                        suspiciousUTF8SequenceCount++;
                        suspiciousUTF8BytesTotal += lengthFound;
                        skipUTF8Bytes = lengthFound - 1;
                    }
                }
                else
                {
                    skipUTF8Bytes--;
                }
            }

            // Each parity class holds (roughly) half of the sample, hence the factor 2.
            double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
            double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

            // 1: UTF-16 LE - in english / european environments, this is usually characterized by a
            //    high proportion of odd binary nulls (starting at 0), with (as this is text) a low
            //    proportion of even binary nulls.
            //    The thresholds here used (less than 20% nulls where you expect non-nulls, and more than
            //    60% nulls where you do expect nulls) are completely arbitrary.
            if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
                return Encoding.Unicode;

            // 2: UTF-16 BE - in english / european environments, this is usually characterized by a
            //    high proportion of even binary nulls (starting at 0), with (as this is text) a low
            //    proportion of odd binary nulls.
            //    Same arbitrary thresholds as above.
            if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
                return Encoding.BigEndianUnicode;

            // 3: UTF-8.
            // Just the fact that the data CAN be UTF-8 doesn't tell you much about probabilities.
            // If all the characters are in the 0-127 range, no harm done, most western charsets are
            // the same as UTF-8 in these ranges. If some of the characters were in the upper range
            // (western accented characters), however, they would likely be mangled to 2-byte by the
            // UTF-8 encoding process. So, we need to play stats.
            //
            // The "Random" likelihood of any pair of randomly generated characters being one
            // of these "suspicious" character sequences is: 128 / (256 * 256) = 0.2%.
            //
            // In western text data, that is SIGNIFICANTLY reduced - most text data stays in the <127
            // character range, so we assume that more than 1 in 500,000 of these character
            // sequences indicates UTF-8. The number 500,000 is completely arbitrary - so sue me.
            if (suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length < 1)
                return null;

            // Structural check (cheap statistical test above is done first).
            if (!LooksLikeUtf8(SampleBytes))
                return null;

            // We can only assume these character sequences will be rare if we ALSO assume that this
            // IS in fact western text - in which case the bulk of the UTF-8 encoded data (that is
            // not already suspicious sequences) should be plain US-ASCII bytes. This, I
            // arbitrarily decided, should be 80% (a random distribution, eg binary data, would yield
            // approx 40%, so the chances of hitting this threshold by accident in random data are
            // VERY low).
            long nonSuspiciousBytes = SampleBytes.Length - suspiciousUTF8BytesTotal;
            if (nonSuspiciousBytes == 0 // all suspicious, so cannot evaluate proportion of US-ASCII
                || likelyUSASCIIBytesInSample * 1.0 / nonSuspiciousBytes >= 0.8)
                return Encoding.UTF8;

            return null;
        }

        /// <summary>
        /// Reads from <paramref name="stream"/> until <paramref name="count"/> bytes have been
        /// stored or the end of the stream is reached; a single Read call may return fewer
        /// bytes than requested.
        /// </summary>
        /// <returns>The number of bytes actually stored in <paramref name="buffer"/>.</returns>
        private static int ReadFully(Stream stream, byte[] buffer, int offset, int count)
        {
            int totalRead = 0;
            while (totalRead < count)
            {
                int read = stream.Read(buffer, offset + totalRead, count - totalRead);
                if (read == 0)
                    break; // end of stream
                totalRead += read;
            }
            return totalRead;
        }

        /// <summary>
        /// Runs the structural UTF-8 validator over the sample.
        /// </summary>
        /// <remarks>
        /// Each byte is widened to the char with the same numeric value so that the \xNN escapes
        /// of the validator pattern see the real byte values. Decoding through Encoding.ASCII
        /// would replace every byte above 0x7F with '?', which hides exactly the bytes that
        /// need validating.
        /// </remarks>
        private static bool LooksLikeUtf8(byte[] sampleBytes)
        {
            char[] byteChars = new char[sampleBytes.Length];
            for (int i = 0; i < sampleBytes.Length; i++)
                byteChars[i] = (char)sampleBytes[i];

            return _utf8Validator.IsMatch(new string(byteChars));
        }

        /// <summary>
        /// Returns true for tab, line feed, carriage return and the printable range 0x20-0x7E
        /// (punctuation, digits and letters).
        /// </summary>
        private static bool IsCommonUSASCIIByte(byte testByte)
        {
            return testByte == 0x0A                          // lf
                || testByte == 0x0D                          // cr
                || testByte == 0x09                          // tab
                || (testByte >= 0x20 && testByte <= 0x7E);   // punctuation, digits, letters
        }

        /// <summary>
        /// Checks whether a UTF-8 sequence for one of the characters from the upper ranges of
        /// Latin-1 / Windows-1252 starts at <paramref name="currentPos"/>.
        /// </summary>
        /// <param name="SampleBytes">The sample being analysed.</param>
        /// <param name="currentPos">Index of the candidate lead byte.</param>
        /// <returns>The length of the sequence found (2 or 3), or 0 when there is none.</returns>
        private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
        {
            // Number of bytes available from currentPos (inclusive) to the end of the sample.
            long available = SampleBytes.Length - currentPos;
            if (available < 2)
                return 0; // a lead byte at the very end has no continuation byte to inspect

            byte second = SampleBytes[currentPos + 1];

            switch (SampleBytes[currentPos])
            {
                case 0xC2:
                    // U+0081, U+008D, U+008F, U+0090, U+009D, and U+00A0-U+00BF.
                    if (second == 0x81 || second == 0x8D || second == 0x8F
                        || second == 0x90 || second == 0x9D
                        || (second >= 0xA0 && second <= 0xBF))
                        return 2;
                    return 0;

                case 0xC3:
                    // U+00C0-U+00FF (accented Latin-1 letters).
                    return (second >= 0x80 && second <= 0xBF) ? 2 : 0;

                case 0xC5:
                    // U+0152/U+0153, U+0160/U+0161, U+0178, U+017D/U+017E.
                    if (second == 0x92 || second == 0x93
                        || second == 0xA0 || second == 0xA1
                        || second == 0xB8 || second == 0xBD || second == 0xBE)
                        return 2;
                    return 0;

                case 0xC6:
                    // U+0192.
                    return second == 0x92 ? 2 : 0;

                case 0xCB:
                    // U+02C6, U+02DC.
                    return (second == 0x86 || second == 0x9C) ? 2 : 0;

                case 0xE2:
                {
                    if (available < 3)
                        return 0; // three-byte sequence cut off by the end of the sample

                    byte third = SampleBytes[currentPos + 2];

                    if (second == 0x80)
                    {
                        switch (third)
                        {
                            case 0x93: case 0x94:             // en dash, em dash
                            case 0x98: case 0x99: case 0x9A:  // single quotation marks
                            case 0x9C: case 0x9D: case 0x9E:  // double quotation marks
                            case 0xA0: case 0xA1: case 0xA2:  // dagger, double dagger, bullet
                            case 0xA6:                        // ellipsis
                            case 0xB0:                        // per mille
                            case 0xB9: case 0xBA:             // single angle quotation marks
                                return 3;
                            default:
                                return 0;
                        }
                    }

                    if (second == 0x82 && third == 0xAC)      // euro sign
                        return 3;
                    if (second == 0x84 && third == 0xA2)      // trade mark sign
                        return 3;
                    return 0;
                }

                default:
                    return 0;
            }
        }
    }
}
使用方法:
// Detect the encoding of a text file. Encoding.Default is passed as the fallback that is
// returned when neither a BOM nor the heuristics identify a Unicode encoding.
// Replace the placeholder string with the path of the file to inspect.
Encoding fileEncoding = KlerksSoft.TextFileEncodingDetector.DetectTextFileEncoding("youfilepath", Encoding.Default);
以上就是本文的全部内容,希望对大家学习C#程序设计有所帮助。