Using the Nagao algorithm in Java for new word discovery and hot word mining
The Nagao algorithm is used to count the frequency of every substring in the text; from these counts we then compute each string's term frequency, left/right neighbor counts, left/right entropy, and mutual information (internal cohesion).
Terminology:
Nagao algorithm: a fast algorithm for counting the frequencies of all substrings of a text. Details can be found at http://www.doc88.com/p-664123446503.html
Term frequency (TF): the number of times the string occurs in the document; the more often it occurs, the more important it is.
Left/right neighbor count: the number of distinct characters that occur immediately to the left and to the right of the string in the document; the more distinct neighbors, the more likely the string is a real word.
Left/right entropy: the entropy of the frequency distribution of the distinct characters occurring to the left and to the right of the string; similar to the previous indicator, but it also measures how evenly those neighbors are distributed.
Mutual information (internal cohesion): split the string at every position into a left part and a right part, divide the probability of the whole string by the product of the probabilities of the two parts, and take the minimum over all splits; the larger this value, the more cohesive the string is internally and the more likely it is a word.
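Written out as formulas (a sketch that matches what the code below computes; n(x) is the raw count of string x, T is the total number of characters read, and n_L(c, w) is the number of times character c occurs immediately to the left of w):

H_L(w) = \log\Bigl(\sum_c n_L(c,w)\Bigr) - \frac{\sum_c n_L(c,w)\,\log n_L(c,w)}{\sum_c n_L(c,w)} = -\sum_c p_L(c \mid w)\,\log p_L(c \mid w)

MI(w) = \min_{w = uv} \frac{n(w)/T}{\bigl(n(u)/T\bigr)\,\bigl(n(v)/T\bigr)}

The right entropy H_R(w) is defined the same way over right neighbors; note that MI here is the probability ratio itself rather than its logarithm.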
Algorithm flow:
1. Read the input files line by line and split each line into phrases on runs of non-Chinese characters ([^\u4E00-\u9FA5]+) and on the stop characters "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说", using:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|[" + stopwords + "]");
The stop characters can be changed as needed.
2. Take every right substring (suffix) and every left substring (suffix of the reversed phrase) of each phrase and add them to the right and left PTable respectively.
3. Sort each PTable and compute its LTable. LTable[i] records how many leading characters the i-th entry of the sorted PTable shares with the previous entry (see the toy example after this list).
4. Traverse PTable and LTable to obtain the term frequency and the left/right neighbors of every substring.
5. From these statistics, output each string's term frequency, left/right neighbor counts, left/right entropy, and mutual information.
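As a quick illustration of steps 2 and 3, the sketch below builds the right PTable and its LTable for a single toy phrase (the phrase "abcab" and the class name PTableDemo are made up for illustration; the real code works on Chinese phrases and in both directions):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class PTableDemo {
    public static void main(String[] args) {
        String phrase = "abcab";                  // stand-in for a Chinese phrase
        List<String> pTable = new ArrayList<String>();
        for (int i = 0; i < phrase.length(); i++)
            pTable.add(phrase.substring(i));      // all suffixes (right substrings)
        Collections.sort(pTable);                 // sorted: ab, abcab, b, bcab, cab
        int[] lTable = new int[pTable.size()];
        for (int i = 1; i < pTable.size(); i++) {
            String prev = pTable.get(i - 1), cur = pTable.get(i);
            int len = 0;
            while (len < Math.min(prev.length(), cur.length())
                    && prev.charAt(len) == cur.charAt(len))
                len++;
            lTable[i] = len;                      // lTable: 0, 2, 0, 1, 0
        }
        for (int i = 0; i < pTable.size(); i++)
            System.out.println(pTable.get(i) + "\t" + lTable[i]);
    }
}

Because lTable[1] = 2, the two-character substring "ab" is shared by consecutive sorted entries, which is exactly how step 4 counts its frequency (here 2) without re-scanning the text.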
1. NagaoAlgorithm.java
package com.algo.word;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class NagaoAlgorithm {

    private int N;
    private List<String> leftPTable;
    private int[] leftLTable;
    private List<String> rightPTable;
    private int[] rightLTable;
    private double wordNumber;
    private Map<String, TFNeighbor> wordTFNeighbor;
    private final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";

    private NagaoAlgorithm() {
        // default N = 5
        N = 5;
        leftPTable = new ArrayList<String>();
        rightPTable = new ArrayList<String>();
        wordTFNeighbor = new HashMap<String, TFNeighbor>();
    }
    // reverse phrase
    private String reverse(String phrase) {
        StringBuilder reversePhrase = new StringBuilder();
        for (int i = phrase.length() - 1; i >= 0; i--)
            reversePhrase.append(phrase.charAt(i));
        return reversePhrase.toString();
    }

    // length of the common prefix of s1 and s2
    private int coPrefixLength(String s1, String s2) {
        int coPrefixLength = 0;
        for (int i = 0; i < Math.min(s1.length(), s2.length()); i++) {
            if (s1.charAt(i) == s2.charAt(i)) coPrefixLength++;
            else break;
        }
        return coPrefixLength;
    }
    // add substrings of line to PTable
    private void addToPTable(String line) {
        // split line on consecutive non-Chinese characters and on stop characters
        String[] phrases = line.split("[^\u4E00-\u9FA5]+|[" + stopwords + "]");
        for (String phrase : phrases) {
            for (int i = 0; i < phrase.length(); i++)
                rightPTable.add(phrase.substring(i));
            String reversePhrase = reverse(phrase);
            for (int i = 0; i < reversePhrase.length(); i++)
                leftPTable.add(reversePhrase.substring(i));
            wordNumber += phrase.length();
        }
    }
    // count LTable
    private void countLTable() {
        Collections.sort(rightPTable);
        rightLTable = new int[rightPTable.size()];
        for (int i = 1; i < rightPTable.size(); i++)
            rightLTable[i] = coPrefixLength(rightPTable.get(i - 1), rightPTable.get(i));

        Collections.sort(leftPTable);
        leftLTable = new int[leftPTable.size()];
        for (int i = 1; i < leftPTable.size(); i++)
            leftLTable[i] = coPrefixLength(leftPTable.get(i - 1), leftPTable.get(i));

        System.out.println("Info: [NagaoAlgorithm Step 2]: having sorted PTable and counted left and right LTable");
    }
    // according to PTable and LTable, count statistical results: TF and neighbor distribution
    private void countTFNeighbor() {
        // get TF and right neighbors
        for (int pIndex = 0; pIndex < rightPTable.size(); pIndex++) {
            String phrase = rightPTable.get(pIndex);
            for (int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++) {
                String word = phrase.substring(0, length);
                TFNeighbor tfNeighbor = new TFNeighbor();
                tfNeighbor.incrementTF();
                if (phrase.length() > length)
                    tfNeighbor.addToRightNeighbor(phrase.charAt(length));
                for (int lIndex = pIndex + 1; lIndex < rightLTable.length; lIndex++) {
                    if (rightLTable[lIndex] >= length) {
                        tfNeighbor.incrementTF();
                        String coPhrase = rightPTable.get(lIndex);
                        if (coPhrase.length() > length)
                            tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
                    } else break;
                }
                wordTFNeighbor.put(word, tfNeighbor);
            }
        }
        // get left neighbors
        for (int pIndex = 0; pIndex < leftPTable.size(); pIndex++) {
            String phrase = leftPTable.get(pIndex);
            for (int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++) {
                String word = reverse(phrase.substring(0, length));
                TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
                if (phrase.length() > length)
                    tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
                for (int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++) {
                    if (leftLTable[lIndex] >= length) {
                        String coPhrase = leftPTable.get(lIndex);
                        if (coPhrase.length() > length)
                            tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
                    } else break;
                }
            }
        }
        System.out.println("Info: [NagaoAlgorithm Step 3]: having counted TF and neighbors");
    }
    // according to wordTFNeighbor, count MI (mutual information) of word
    private double countMI(String word) {
        if (word.length() <= 1) return 0;
        double coProbability = wordTFNeighbor.get(word).getTF() / wordNumber;
        List<Double> mi = new ArrayList<Double>(word.length());
        for (int pos = 1; pos < word.length(); pos++) {
            String leftPart = word.substring(0, pos);
            String rightPart = word.substring(pos);
            double leftProbability = wordTFNeighbor.get(leftPart).getTF() / wordNumber;
            double rightProbability = wordTFNeighbor.get(rightPart).getTF() / wordNumber;
            mi.add(coProbability / (leftProbability * rightProbability));
        }
        return Collections.min(mi);
    }
    // save TF, (left and right) neighbor number, neighbor entropy, mutual information
    private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold) {
        try {
            // read stop words file
            Set<String> stopWords = new HashSet<String>();
            BufferedReader br = new BufferedReader(new FileReader(stopList));
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() > 1)
                    stopWords.add(line);
            }
            br.close();
            // output words' TF, neighbor info and MI
            BufferedWriter bw = new BufferedWriter(new FileWriter(out));
            for (Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()) {
                if (entry.getKey().length() <= 1 || stopWords.contains(entry.getKey())) continue;
                TFNeighbor tfNeighbor = entry.getValue();
                int tf, leftNeighborNumber, rightNeighborNumber;
                double mi;
                tf = tfNeighbor.getTF();
                leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
                rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
                mi = countMI(entry.getKey());
                if (tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) &&
                        rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3])) {
                    StringBuilder sb = new StringBuilder();
                    sb.append(entry.getKey());
                    sb.append(",").append(tf);
                    sb.append(",").append(leftNeighborNumber);
                    sb.append(",").append(rightNeighborNumber);
                    sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
                    sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
                    sb.append(",").append(mi).append("\n");
                    bw.write(sb.toString());
                }
            }
            bw.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        System.out.println("Info: [NagaoAlgorithm Step 4]: having saved result to file");
    }
    // apply Nagao algorithm to input files with default N and default thresholds
    public static void applyNagao(String[] inputs, String out, String stopList) {
        NagaoAlgorithm nagao = new NagaoAlgorithm();
        // step 1: add phrases to PTable
        String line;
        for (String in : inputs) {
            try {
                BufferedReader br = new BufferedReader(new FileReader(in));
                while ((line = br.readLine()) != null) {
                    nagao.addToPTable(line);
                }
                br.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        System.out.println("Info: [NagaoAlgorithm Step 1]: having added all left and right substrings to PTable");
        // step 2: sort PTable and count LTable
        nagao.countLTable();
        // step 3: count TF and neighbors
        nagao.countTFNeighbor();
        // step 4: save TF, neighbor info and MI
        nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(","));
    }
    // apply Nagao algorithm with a custom N and custom output thresholds
    public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter) {
        NagaoAlgorithm nagao = new NagaoAlgorithm();
        nagao.setN(n);
        String[] threshold = filter.split(",");
        if (threshold.length != 4) {
            System.out.println("ERROR: filter must have 4 numbers, separated with ','");
            return;
        }
        // step 1: add phrases to PTable
        String line;
        for (String in : inputs) {
            try {
                BufferedReader br = new BufferedReader(new FileReader(in));
                while ((line = br.readLine()) != null) {
                    nagao.addToPTable(line);
                }
                br.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        System.out.println("Info: [NagaoAlgorithm Step 1]: having added all left and right substrings to PTable");
        // step 2: sort PTable and count LTable
        nagao.countLTable();
        // step 3: count TF and neighbors
        nagao.countTFNeighbor();
        // step 4: save TF, neighbor info and MI
        nagao.saveTFNeighborInfoMI(out, stopList, threshold);
    }

    private void setN(int n) {
        N = n;
    }
    public static void main(String[] args) {
        String[] ins = {"E://test//ganfen.txt"};
        applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");
    }
}
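To run with a different maximum substring length N or different output thresholds, the second applyNagao overload can be used instead; an illustrative call (the file paths are placeholders, like those in main above):

String[] ins = {"E://test//ganfen.txt"};
// keep substrings up to length 6, with TF > 10, left/right neighbor number > 2 and MI > 3
NagaoAlgorithm.applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt", 6, "10,2,2,3");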
2. TFNeighbor.java
package com.algo.word;

import java.util.HashMap;
import java.util.Map;

public class TFNeighbor {

    private int tf;
    private Map<Character, Integer> leftNeighbor;
    private Map<Character, Integer> rightNeighbor;

    TFNeighbor() {
        leftNeighbor = new HashMap<Character, Integer>();
        rightNeighbor = new HashMap<Character, Integer>();
    }

    // add character to leftNeighbor
    public void addToLeftNeighbor(char word) {
        // leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0));
        Integer number = leftNeighbor.get(word);
        leftNeighbor.put(word, number == null ? 1 : 1 + number);
    }

    // add character to rightNeighbor
    public void addToRightNeighbor(char word) {
        // rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0));
        Integer number = rightNeighbor.get(word);
        rightNeighbor.put(word, number == null ? 1 : 1 + number);
    }

    // increment tf
    public void incrementTF() {
        tf++;
    }

    public int getLeftNeighborNumber() {
        return leftNeighbor.size();
    }

    public int getRightNeighborNumber() {
        return rightNeighbor.size();
    }

    public double getLeftNeighborEntropy() {
        double entropy = 0;
        int sum = 0;
        for (int number : leftNeighbor.values()) {
            entropy += number * Math.log(number);
            sum += number;
        }
        if (sum == 0) return 0;
        return Math.log(sum) - entropy / sum;
    }

    public double getRightNeighborEntropy() {
        double entropy = 0;
        int sum = 0;
        for (int number : rightNeighbor.values()) {
            entropy += number * Math.log(number);
            sum += number;
        }
        if (sum == 0) return 0;
        return Math.log(sum) - entropy / sum;
    }

    public int getTF() {
        return tf;
    }
}
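As a small sanity check of the entropy getters (hypothetical counts, not taken from the article's data): a word whose left neighbors are 'A' twice and 'B' twice has two distinct left neighbors and a uniform distribution, so its left entropy is ln 2:

TFNeighbor t = new TFNeighbor();
t.addToLeftNeighbor('A');
t.addToLeftNeighbor('A');
t.addToLeftNeighbor('B');
t.addToLeftNeighbor('B');
System.out.println(t.getLeftNeighborNumber());   // 2
System.out.println(t.getLeftNeighborEntropy());  // ln 2 ≈ 0.693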
3. Main.java
package com.algo.word;

public class Main {
    public static void main(String[] args) {
        // with 3 arguments: the first argument is the input files, separated with ','
        // the second argument is the output file
        // the output has 7 columns separated with ',', like below:
        // word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information
        // the third argument is the stop words list
        if (args.length == 3)
            NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);
        // with 5 arguments: the fourth argument is the N-gram parameter N,
        // the fifth argument is the threshold of output words, default is "20,3,3,5",
        // i.e. output words with TF > 20 && (left|right) neighbor number > 3 && MI > 5
        else if (args.length == 5)
            NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);
    }
}
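For reference, an illustrative command line for the five-argument form (file names are placeholders):

java com.algo.word.Main corpus1.txt,corpus2.txt out.txt stoplist.txt 5 20,3,3,5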
That is all for this article; I hope you find it useful.