Using the Nagao algorithm in Java for new-word discovery and hot-word mining
The Nagao algorithm is used to count the frequency of every substring in the corpus; from these counts we then compute, for each candidate string, its term frequency, left/right neighbor counts, left/right entropy, and mutual information (internal cohesion).
Terminology:
Nagao algorithm: a fast algorithm for counting the frequencies of all substrings of a text. A detailed description is available at http://www.doc88.com/p-664123446503.html
Term frequency (TF): the number of times the string occurs in the document. The more occurrences, the more important the string.
Left/right neighbor count: the number of distinct characters that appear immediately to the left and to the right of the string in the document. The more distinct neighbors, the more likely the string is a real word.
Left/right entropy: the entropy of the distribution of characters appearing to the left and to the right of the string. Similar in spirit to the neighbor count, but it also reflects how evenly those neighbors are distributed.
Mutual information: split the string at every possible position into a left part and a right part; for each split, divide the probability of the whole string by the product of the probabilities of the two parts occurring independently, then take the minimum over all splits. The larger this value, the stronger the internal cohesion of the string and the more likely it forms a word. (Compact formulas for the entropy and mutual information are given right after this list.)
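For reference, the last two metrics can be written out as follows (this notation is mine, not from the original post; all probabilities are counts divided by the total number of Chinese characters read in, which is what wordNumber holds in the code below):

right entropy:  H_right(w) = - Σ_c p(c | w) · log p(c | w), where p(c | w) is the relative frequency of character c among the right neighbors of w (left entropy is defined symmetrically with the preceding characters);

mutual information:  MI(w) = min over split positions i of  p(w) / ( p(w[0..i)) · p(w[i..len(w))) ).

The entropy the code returns, log(sum) - Σ n·log(n) / sum, is an algebraically equivalent form of -Σ p·log(p).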
Algorithm workflow:
1. Read the input files line by line and split each line into phrases on runs of non-Chinese characters ([^\u4E00-\u9FA5]+) and on the single-character stop words "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说",
as in the following code:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|[" + stopwords + "]");
The stop-word list can be modified.
2. Collect every right substring (suffix) and every left substring (suffix of the reversed phrase) of each phrase and add them to the right and left PTable respectively.
3. Sort each PTable and compute its LTable. For every entry of a sorted PTable, the LTable records the length of the common prefix it shares with the previous entry.
4. Walk the PTable together with the LTable to obtain the term frequency and the left/right neighbors of every substring.
5. From these statistics, output each string's term frequency, left/right neighbor counts, left/right entropy, and mutual information. (A small runnable sketch of steps 1-3 follows right after this list.)
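To make steps 1-3 concrete, here is a small standalone sketch of my own (the class name PTableDemo and the sample line are illustrative, not part of the original code). It splits one line, collects the suffixes into a right-hand PTable, sorts it, and prints the LTable of common-prefix lengths:

// Demo of steps 1-3: split a line, collect suffixes into a PTable,
// sort it, and compute the LTable (common prefix length with the previous entry).
import java.util.*;

public class PTableDemo {
    public static void main(String[] args) {
        String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";
        String line = "我喜欢咖啡馆,他喜欢咖啡店";
        // Step 1: split on runs of non-Chinese characters and on single stop characters
        String[] phrases = line.split("[^\u4E00-\u9FA5]+|[" + stopwords + "]");
        // Step 2: every suffix of every phrase goes into the (right) PTable
        List<String> pTable = new ArrayList<String>();
        for (String phrase : phrases)
            for (int i = 0; i < phrase.length(); i++)
                pTable.add(phrase.substring(i));
        // Step 3: sort the PTable and record, for each entry, the length of the
        // common prefix it shares with the previous entry (the LTable)
        Collections.sort(pTable);
        int[] lTable = new int[pTable.size()];
        for (int i = 1; i < pTable.size(); i++) {
            String a = pTable.get(i - 1), b = pTable.get(i);
            int k = 0;
            while (k < Math.min(a.length(), b.length()) && a.charAt(k) == b.charAt(k)) k++;
            lTable[i] = k;
        }
        for (int i = 0; i < pTable.size(); i++)
            System.out.println(lTable[i] + "\t" + pTable.get(i));
    }
}

In the sorted table, the two occurrences of "咖啡" end up adjacent with a shared prefix of length 2, so its term frequency (2) and its distinct right neighbors (馆 and 店) can be read off directly; running the same procedure on the reversed phrases yields the left neighbors.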
1. NagaoAlgorithm.java
package com.algo.word;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class NagaoAlgorithm {

    private int N;
    private List<String> leftPTable;
    private int[] leftLTable;
    private List<String> rightPTable;
    private int[] rightLTable;
    private double wordNumber;
    private Map<String, TFNeighbor> wordTFNeighbor;
    private final static String stopwords = "的很了么呢是嘛个都也比还这于不与才上用就好在和对挺去后没说";

    private NagaoAlgorithm() {
        // default N = 5
        N = 5;
        leftPTable = new ArrayList<String>();
        rightPTable = new ArrayList<String>();
        wordTFNeighbor = new HashMap<String, TFNeighbor>();
    }

    // reverse phrase
    private String reverse(String phrase) {
        StringBuilder reversePhrase = new StringBuilder();
        for (int i = phrase.length() - 1; i >= 0; i--)
            reversePhrase.append(phrase.charAt(i));
        return reversePhrase.toString();
    }

    // co-prefix length of s1 and s2
    private int coPrefixLength(String s1, String s2) {
        int coPrefixLength = 0;
        for (int i = 0; i < Math.min(s1.length(), s2.length()); i++) {
            if (s1.charAt(i) == s2.charAt(i)) coPrefixLength++;
            else break;
        }
        return coPrefixLength;
    }

    // add substrings of line to the PTables
    private void addToPTable(String line) {
        // split line according to consecutive non-Chinese characters and stop characters
        String[] phrases = line.split("[^\u4E00-\u9FA5]+|[" + stopwords + "]");
        for (String phrase : phrases) {
            for (int i = 0; i < phrase.length(); i++)
                rightPTable.add(phrase.substring(i));
            String reversePhrase = reverse(phrase);
            for (int i = 0; i < reversePhrase.length(); i++)
                leftPTable.add(reversePhrase.substring(i));
            wordNumber += phrase.length();
        }
    }

    // count LTable
    private void countLTable() {
        Collections.sort(rightPTable);
        rightLTable = new int[rightPTable.size()];
        for (int i = 1; i < rightPTable.size(); i++)
            rightLTable[i] = coPrefixLength(rightPTable.get(i - 1), rightPTable.get(i));

        Collections.sort(leftPTable);
        leftLTable = new int[leftPTable.size()];
        for (int i = 1; i < leftPTable.size(); i++)
            leftLTable[i] = coPrefixLength(leftPTable.get(i - 1), leftPTable.get(i));

        System.out.println("Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable");
    }

    // according to PTable and LTable, count statistical results: TF, neighbor distribution
    private void countTFNeighbor() {
        // get TF and right neighbors
        for (int pIndex = 0; pIndex < rightPTable.size(); pIndex++) {
            String phrase = rightPTable.get(pIndex);
            for (int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++) {
                String word = phrase.substring(0, length);
                TFNeighbor tfNeighbor = new TFNeighbor();
                tfNeighbor.incrementTF();
                if (phrase.length() > length)
                    tfNeighbor.addToRightNeighbor(phrase.charAt(length));
                for (int lIndex = pIndex + 1; lIndex < rightLTable.length; lIndex++) {
                    if (rightLTable[lIndex] >= length) {
                        tfNeighbor.incrementTF();
                        String coPhrase = rightPTable.get(lIndex);
                        if (coPhrase.length() > length)
                            tfNeighbor.addToRightNeighbor(coPhrase.charAt(length));
                    } else break;
                }
                wordTFNeighbor.put(word, tfNeighbor);
            }
        }
        // get left neighbors
        for (int pIndex = 0; pIndex < leftPTable.size(); pIndex++) {
            String phrase = leftPTable.get(pIndex);
            for (int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++) {
                String word = reverse(phrase.substring(0, length));
                TFNeighbor tfNeighbor = wordTFNeighbor.get(word);
                if (phrase.length() > length)
                    tfNeighbor.addToLeftNeighbor(phrase.charAt(length));
                for (int lIndex = pIndex + 1; lIndex < leftLTable.length; lIndex++) {
                    if (leftLTable[lIndex] >= length) {
                        String coPhrase = leftPTable.get(lIndex);
                        if (coPhrase.length() > length)
                            tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length));
                    } else break;
                }
            }
        }
        System.out.println("Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor");
    }

    // according to wordTFNeighbor, count MI of word
    private double countMI(String word) {
        if (word.length() <= 1) return 0;
        double coProbability = wordTFNeighbor.get(word).getTF() / wordNumber;
        List<Double> mi = new ArrayList<Double>(word.length());
        for (int pos = 1; pos < word.length(); pos++) {
            String leftPart = word.substring(0, pos);
            String rightPart = word.substring(pos);
            double leftProbability = wordTFNeighbor.get(leftPart).getTF() / wordNumber;
            double rightProbability = wordTFNeighbor.get(rightPart).getTF() / wordNumber;
            mi.add(coProbability / (leftProbability * rightProbability));
        }
        return Collections.min(mi);
    }

    // save TF, (left and right) neighbor number, neighbor entropy, mutual information
    private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold) {
        try {
            // read stop words file
            Set<String> stopWords = new HashSet<String>();
            BufferedReader br = new BufferedReader(new FileReader(stopList));
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() > 1)
                    stopWords.add(line);
            }
            br.close();
            // output words' TF, neighbor info, MI
            BufferedWriter bw = new BufferedWriter(new FileWriter(out));
            for (Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()) {
                if (entry.getKey().length() <= 1 || stopWords.contains(entry.getKey())) continue;
                TFNeighbor tfNeighbor = entry.getValue();
                int tf, leftNeighborNumber, rightNeighborNumber;
                double mi;
                tf = tfNeighbor.getTF();
                leftNeighborNumber = tfNeighbor.getLeftNeighborNumber();
                rightNeighborNumber = tfNeighbor.getRightNeighborNumber();
                mi = countMI(entry.getKey());
                if (tf > Integer.parseInt(threshold[0]) && leftNeighborNumber > Integer.parseInt(threshold[1]) &&
                        rightNeighborNumber > Integer.parseInt(threshold[2]) && mi > Integer.parseInt(threshold[3])) {
                    StringBuilder sb = new StringBuilder();
                    sb.append(entry.getKey());
                    sb.append(",").append(tf);
                    sb.append(",").append(leftNeighborNumber);
                    sb.append(",").append(rightNeighborNumber);
                    sb.append(",").append(tfNeighbor.getLeftNeighborEntropy());
                    sb.append(",").append(tfNeighbor.getRightNeighborEntropy());
                    sb.append(",").append(mi).append("\n");
                    bw.write(sb.toString());
                }
            }
            bw.close();
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        System.out.println("Info: [Nagao Algorithm Step 4]: having saved to file");
    }

    // apply Nagao algorithm to input files
    public static void applyNagao(String[] inputs, String out, String stopList) {
        NagaoAlgorithm nagao = new NagaoAlgorithm();
        // step 1: add phrases to PTable
        String line;
        for (String in : inputs) {
            try {
                BufferedReader br = new BufferedReader(new FileReader(in));
                while ((line = br.readLine()) != null) {
                    nagao.addToPTable(line);
                }
                br.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
        // step 2: sort PTable and count LTable
        nagao.countLTable();
        // step 3: count TF and Neighbor
        nagao.countTFNeighbor();
        // step 4: save TF, Neighbor info and MI
        nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5".split(","));
    }

    public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter) {
        NagaoAlgorithm nagao = new NagaoAlgorithm();
        nagao.setN(n);
        String[] threshold = filter.split(",");
        if (threshold.length != 4) {
            System.out.println("ERROR: filter must have 4 numbers, separated with ','");
            return;
        }
        // step 1: add phrases to PTable
        String line;
        for (String in : inputs) {
            try {
                BufferedReader br = new BufferedReader(new FileReader(in));
                while ((line = br.readLine()) != null) {
                    nagao.addToPTable(line);
                }
                br.close();
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        System.out.println("Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable");
        // step 2: sort PTable and count LTable
        nagao.countLTable();
        // step 3: count TF and Neighbor
        nagao.countTFNeighbor();
        // step 4: save TF, Neighbor info and MI
        nagao.saveTFNeighborInfoMI(out, stopList, threshold);
    }

    private void setN(int n) {
        N = n;
    }

    public static void main(String[] args) {
        String[] ins = {"E://test//ganfen.txt"};
        applyNagao(ins, "E://test//out.txt", "E://test//stoplist.txt");
    }
}
2. TFNeighbor.java
package com.algo.word;

import java.util.HashMap;
import java.util.Map;

public class TFNeighbor {

    private int tf;
    private Map<Character, Integer> leftNeighbor;
    private Map<Character, Integer> rightNeighbor;

    TFNeighbor() {
        leftNeighbor = new HashMap<Character, Integer>();
        rightNeighbor = new HashMap<Character, Integer>();
    }

    // add character to leftNeighbor
    public void addToLeftNeighbor(char word) {
        // leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0));
        Integer number = leftNeighbor.get(word);
        leftNeighbor.put(word, number == null ? 1 : 1 + number);
    }

    // add character to rightNeighbor
    public void addToRightNeighbor(char word) {
        // rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0));
        Integer number = rightNeighbor.get(word);
        rightNeighbor.put(word, number == null ? 1 : 1 + number);
    }

    // increment tf
    public void incrementTF() {
        tf++;
    }

    public int getLeftNeighborNumber() {
        return leftNeighbor.size();
    }

    public int getRightNeighborNumber() {
        return rightNeighbor.size();
    }

    // entropy of the left-neighbor distribution: log(sum) - sum(n*log n)/sum == -sum(p*log p)
    public double getLeftNeighborEntropy() {
        double entropy = 0;
        int sum = 0;
        for (int number : leftNeighbor.values()) {
            entropy += number * Math.log(number);
            sum += number;
        }
        if (sum == 0) return 0;
        return Math.log(sum) - entropy / sum;
    }

    // entropy of the right-neighbor distribution
    public double getRightNeighborEntropy() {
        double entropy = 0;
        int sum = 0;
        for (int number : rightNeighbor.values()) {
            entropy += number * Math.log(number);
            sum += number;
        }
        if (sum == 0) return 0;
        return Math.log(sum) - entropy / sum;
    }

    public int getTF() {
        return tf;
    }
}
3. Main.java
package com.algo.word;

public class Main {

    public static void main(String[] args) {
        // if 3 arguments: first argument is the input files, split with ','
        // second argument is the output file
        // output has 7 columns split with ',', like below:
        // word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information
        // third argument is the stop words list
        if (args.length == 3)
            NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2]);
        // if 5 arguments: the fourth argument is the N-gram parameter N,
        // the fifth argument is the threshold of output words, default is "20,3,3,5",
        // i.e. output TF > 20 && (left|right) neighbor number > 3 && MI > 5
        else if (args.length == 5)
            NagaoAlgorithm.applyNagao(args[0].split(","), args[1], args[2], Integer.parseInt(args[3]), args[4]);
    }
}
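A possible invocation, once the classes are compiled (all file names here are hypothetical): java com.algo.word.Main news.txt,weibo.txt result.csv stoplist.txt runs with the default N = 5 and filter "20,3,3,5", while java com.algo.word.Main news.txt,weibo.txt result.csv stoplist.txt 7 50,5,5,10 uses substrings up to length 7 and keeps only strings with TF > 50, more than 5 distinct left and right neighbors, and MI > 10. Each output line is a comma-separated record: word, TF, left neighbor count, right neighbor count, left entropy, right entropy, MI.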
That is all for this article; I hope you find it useful.