.NET下文本相似度算法余弦定理和SimHash浅析及应用实例分析
本文实例讲述了.NET下文本相似度算法余弦定理和SimHash浅析及应用。分享给大家供大家参考。具体分析如下:
余弦相似性
原理:首先我们先把两段文本分词,列出来所有单词,其次我们计算每个词语的词频,最后把词语转换为向量,这样我们就只需要计算两个向量的相似程度。
我们简单表述如下
文本1:我/爱/北京/天安门 —— 经过分词求词频得出向量(伪向量)[1,1,1,1]
文本2:我们/都爱/北京/天安门 —— 经过分词求词频得出向量(伪向量)[1,0,1,2]
我们可以把它们想象成空间中的两条线段,都是从原点([0,0,...])出发,指向不同的方向。两条线段之间形成一个夹角,如果夹角为0度,意味着方向相同、线段重合;如果夹角为90度,意味着形成直角,方向完全不相似;如果夹角为180度,意味着方向正好相反。因此,我们可以通过夹角的大小,来判断向量的相似程度。夹角越小,就代表越相似。
C#核心算法:
/// <summary>
/// Computes pairwise document similarity using TF-IDF weighted term vectors
/// and cosine similarity.
/// NOTE(review): the published listing had all whitespace stripped and could not
/// compile; formatting has been restored. The class depends on a project-local
/// <c>Tokeniser</c> (Partition / ArrayListToArray) that is not visible here.
/// </summary>
public class TFIDFMeasure
{
    private string[] _docs;            // raw input documents
    private string[][] _ngramDoc;      // allocated but never populated (GeneratNgramText is empty)
    private int _numDocs = 0;
    private int _numTerms = 0;
    private ArrayList _terms;          // distinct terms across all documents, first-seen order
    private int[][] _termFreq;         // _termFreq[term][doc] = raw count of term in doc
    private float[][] _termWeight;     // _termWeight[term][doc] = tf * idf
    private int[] _maxTermFreq;        // per-document maximum raw term count (tf normalizer)
    private int[] _docFreq;            // number of documents containing each term

    /// <summary>
    /// Vector math helpers for cosine similarity.
    /// </summary>
    public class TermVector
    {
        /// <summary>
        /// Cosine similarity of two equal-length vectors. Returns 0 when either
        /// vector has zero magnitude (avoids division by zero).
        /// </summary>
        /// <exception cref="ArgumentException">Vector lengths differ.</exception>
        public static float ComputeCosineSimilarity(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new ArgumentException("DIFER LENGTH");

            float denom = VectorLength(vector1) * VectorLength(vector2);
            if (denom == 0F)
                return 0F;

            return InnerProduct(vector1, vector2) / denom;
        }

        /// <summary>Dot product of two equal-length vectors.</summary>
        /// <exception cref="ArgumentException">Vector lengths differ.</exception>
        public static float InnerProduct(float[] vector1, float[] vector2)
        {
            if (vector1.Length != vector2.Length)
                throw new ArgumentException("DIFFER LENGTH ARE NOT ALLOWED");

            float result = 0F;
            for (int i = 0; i < vector1.Length; i++)
                result += vector1[i] * vector2[i];
            return result;
        }

        /// <summary>Euclidean (L2) magnitude of a vector.</summary>
        public static float VectorLength(float[] vector)
        {
            float sum = 0.0F;
            for (int i = 0; i < vector.Length; i++)
                sum = sum + (vector[i] * vector[i]);
            return (float)Math.Sqrt(sum);
        }
    }

    private IDictionary _wordsIndex = new Hashtable(); // term -> index into _terms

    /// <summary>
    /// Tokenizes the documents and precomputes the TF-IDF weight matrix.
    /// </summary>
    public TFIDFMeasure(string[] documents)
    {
        _docs = documents;
        _numDocs = documents.Length;
        MyInit();
    }

    private void GeneratNgramText()
    {
        // Intentionally empty in the original listing.
    }

    // Collects the distinct terms over all documents (first-seen order).
    // NOTE(review): unlike GetWordFrequency, this does NOT lower-case the text
    // before tokenizing — see the guard in GenerateTermFrequency.
    private ArrayList GenerateTerms(string[] docs)
    {
        ArrayList uniques = new ArrayList();
        _ngramDoc = new string[_numDocs][];
        for (int i = 0; i < docs.Length; i++)
        {
            Tokeniser tokenizer = new Tokeniser();
            string[] words = tokenizer.Partition(docs[i]);
            for (int j = 0; j < words.Length; j++)
            {
                if (!uniques.Contains(words[j]))
                    uniques.Add(words[j]);
            }
        }
        return uniques;
    }

    // Stores newValue under key, returning the previous value (or null).
    private static object AddElement(IDictionary collection, object key, object newValue)
    {
        object element = collection[key];
        collection[key] = newValue;
        return element;
    }

    // Index of term in _terms, or -1 when the term is unknown.
    private int GetTermIndex(string term)
    {
        object index = _wordsIndex[term];
        if (index == null)
            return -1;
        return (int)index;
    }

    // Builds the term list, allocates the frequency/weight matrices,
    // and fills them.
    private void MyInit()
    {
        _terms = GenerateTerms(_docs);
        _numTerms = _terms.Count;

        _maxTermFreq = new int[_numDocs];
        _docFreq = new int[_numTerms];
        _termFreq = new int[_numTerms][];
        _termWeight = new float[_numTerms][];

        for (int i = 0; i < _terms.Count; i++)
        {
            _termWeight[i] = new float[_numDocs];
            _termFreq[i] = new int[_numDocs];
            AddElement(_wordsIndex, _terms[i], i);
        }

        GenerateTermFrequency();
        GenerateTermWeight();
    }

    private float Log(float num)
    {
        // Natural logarithm. (The original comment claimed log2, which was wrong:
        // Math.Log(x) with one argument is base e.)
        return (float)Math.Log(num);
    }

    // Fills _termFreq, _docFreq and _maxTermFreq from the raw documents.
    private void GenerateTermFrequency()
    {
        for (int i = 0; i < _numDocs; i++)
        {
            string curDoc = _docs[i];
            IDictionary freq = GetWordFrequency(curDoc);
            IDictionaryEnumerator enums = freq.GetEnumerator();
            _maxTermFreq[i] = int.MinValue;
            while (enums.MoveNext())
            {
                string word = (string)enums.Key;
                int wordFreq = (int)enums.Value;
                int termIndex = GetTermIndex(word);
                // GetWordFrequency lower-cases its input while GenerateTerms does
                // not, so a term containing upper-case letters may not be found
                // (unless Tokeniser.Partition lower-cases internally — TODO confirm).
                // The original indexed _termFreq[-1] and crashed; skip instead.
                if (termIndex < 0)
                    continue;
                _termFreq[termIndex][i] = wordFreq;
                _docFreq[termIndex]++;
                if (wordFreq > _maxTermFreq[i])
                    _maxTermFreq[i] = wordFreq;
            }
        }
    }

    // Fills _termWeight with tf * idf for every (term, doc) pair.
    private void GenerateTermWeight()
    {
        for (int i = 0; i < _numTerms; i++)
        {
            for (int j = 0; j < _numDocs; j++)
                _termWeight[i][j] = ComputeTermWeight(i, j);
        }
    }

    // Normalized term frequency: raw count / max count in the document.
    // NOTE(review): if a document produced no tokens, _maxTermFreq stays
    // int.MinValue and this division is meaningless — upstream guarantees
    // non-empty documents, presumably; verify against callers.
    private float GetTermFrequency(int term, int doc)
    {
        int freq = _termFreq[term][doc];
        int maxfreq = _maxTermFreq[doc];
        return (float)freq / (float)maxfreq;
    }

    // Inverse document frequency: ln(numDocs / docFreq).
    private float GetInverseDocumentFrequency(int term)
    {
        int df = _docFreq[term];
        return Log((float)_numDocs / (float)df);
    }

    private float ComputeTermWeight(int term, int doc)
    {
        float tf = GetTermFrequency(term, doc);
        float idf = GetInverseDocumentFrequency(term);
        return tf * idf;
    }

    // Column of the weight matrix: the TF-IDF vector of one document.
    private float[] GetTermVector(int doc)
    {
        float[] w = new float[_numTerms];
        for (int i = 0; i < _numTerms; i++)
            w[i] = _termWeight[i][doc];
        return w;
    }

    /// <summary>
    /// Cosine similarity between two documents, addressed by their index
    /// in the constructor's <c>documents</c> array.
    /// </summary>
    public float GetSimilarity(int doc_i, int doc_j)
    {
        float[] vector1 = GetTermVector(doc_i);
        float[] vector2 = GetTermVector(doc_j);
        return TermVector.ComputeCosineSimilarity(vector1, vector2);
    }

    // word -> raw occurrence count for a single (lower-cased) document.
    private IDictionary GetWordFrequency(string input)
    {
        string convertedInput = input.ToLower();
        Tokeniser tokenizer = new Tokeniser();
        String[] words = tokenizer.Partition(convertedInput);
        Array.Sort(words); // sorted so CountWords can use binary search
        String[] distinctWords = GetDistinctWords(words);

        IDictionary result = new Hashtable();
        for (int i = 0; i < distinctWords.Length; i++)
        {
            object tmp;
            tmp = CountWords(distinctWords[i], words);
            result[distinctWords[i]] = tmp;
        }
        return result;
    }

    // Distinct words of input, preserving first-seen order.
    private string[] GetDistinctWords(String[] input)
    {
        if (input == null)
            return new string[0];

        ArrayList list = new ArrayList();
        for (int i = 0; i < input.Length; i++)
        {
            if (!list.Contains(input[i]))
                list.Add(input[i]);
        }
        return Tokeniser.ArrayListToArray(list);
    }

    // Counts occurrences of word in the SORTED array words.
    // Rewritten: the original rewind loop deliberately overshot the start of the
    // run by one element and compensated with a convoluted forward scan.
    private int CountWords(string word, string[] words)
    {
        int itemIdx = Array.BinarySearch(words, word);
        if (itemIdx < 0)
            return 0; // BinarySearch returns a negative value when absent

        // BinarySearch may land anywhere inside a run of duplicates:
        // rewind to the first occurrence, then count forward.
        while (itemIdx > 0 && words[itemIdx - 1].Equals(word))
            itemIdx--;

        int count = 0;
        while (itemIdx < words.Length && words[itemIdx].Equals(word))
        {
            count++;
            itemIdx++;
        }
        return count;
    }
}