Java实现AC自动机全文检索示例
第一步,构建Trie树,定义Node类型:
/** *Createdbyzhaoyyon2017/2/7. */ interfaceNode{ charvalue(); booleanexists(); booleanisRoot(); Nodeparent(); NodechildOf(charc); Nodefail(); voidsetFail(Nodenode); voidsetExists(booleanexists); voidadd(Nodechild); List<Node>children(); }
第二步,实现两种Node,如果词汇全是可打印的ASCII字符,就采用AsciiNode,否则(比如包含汉字),使用基于hash表的MapNode;这两种Node均集成自AbstractNode:
/** *Createdbyzhaoyyon2017/2/8. */ abstractclassAbstractNodeimplementsNode{ privatestaticfinalcharEMPTY='\0'; privatefinalcharvalue; privatefinalNodeparent; privatebooleanexists; privateNodefail; publicAbstractNode(Nodeparent,charvalue){ this.parent=parent; this.value=value; this.exists=false; this.fail=null; } publicAbstractNode(){ this(null,EMPTY); } privatestaticStringtab(intn){ StringBuilderbuilder=newStringBuilder(); for(inti=0;i<n;i++){ builder.append('\t'); } returnbuilder.toString(); } privatestaticStringtoString(Nodenode,intdepth){ StringBuilderbuilder=newStringBuilder(); Stringtab=tab(depth); Nodefail=node.fail(); Nodeparent=node.parent(); builder .append(tab) .append('<') .append(node.value()) .append("exists=\"") .append(node.exists()) .append('"') .append("parent=\"") .append(parent==null?"null":parent.value()) .append('"') .append("fail=\"") .append(fail==null?"null":fail.value()) .append("\">\r\n"); for(Nodechild:node.children()) builder.append(toString(child,depth+1)); builder .append(tab) .append("</") .append(node.value()) .append(">\r\n"); returnbuilder.toString(); } @Override publiccharvalue(){ returnvalue; } @Override publicbooleanexists(){ returnexists; } @Override publicbooleanisRoot(){ returnvalue==EMPTY; } @Override publicNodeparent(){ returnparent; } @Override publicNodefail(){ returnfail; } @Override publicvoidsetFail(Nodenode){ this.fail=node; } @Override publicvoidsetExists(booleanexists){ this.exists=exists; } @Override publicStringtoString(){ returntoString(this,0); } } ///////////////////////////////////////////////////////////////////////////////////////////// /** *Createdbyzhaoyyon2017/2/8. */ finalclassAsciiNodeextendsAbstractNodeimplementsNode{ privatestaticfinalcharFROM=32; privatestaticfinalcharTO=126; privatefinalNode[]children; publicAsciiNode(){ super(); this.children=newNode[TO-FROM+1]; } publicAsciiNode(Nodeparent,charvalue){ super(parent,value); this.children=newNode[TO-FROM+1]; } @Override publicNodechildOf(charc){ if(c>=FROM&&c<=TO) returnchildren[(int)c-FROM]; elsereturnnull; } @Override publicvoidadd(Nodechild){ intindex=(int)child.value(); children[index-FROM]=child; } @Override publicList<Node>children(){ List<Node>nodes=newArrayList<Node>(); for(Nodechild:children) if(child!=null) nodes.add(child); returnnodes; } } ////////////////////////////////////////////////////////////////////////////////////////////// /** *Createdbyzhaoyyon2017/2/8. */ finalclassMapNodeextendsAbstractNodeimplementsNode{ privatefinalMap<Character,Node>children; publicMapNode(){ super(); this.children=newHashMap<Character,Node>(); } publicMapNode(Nodeparent,charvalue){ super(parent,value); this.children=newHashMap<Character,Node>(); } @Override publicNodechildOf(charc){ returnchildren.get(c); } @Override publicvoidadd(Nodechild){ children.put(child.value(),child); } @Override publicList<Node>children(){ List<Node>nodes=newArrayList<Node>(); nodes.addAll(children.values()); returnnodes; } }
第三步,
首先定义一个Node构造器:
/** *Createdbyzhaoyyon2017/2/8. */ publicinterfaceNodeMaker{ Nodemake(Nodeparent,charvalue); NodemakeRoot(); }
然后构建AC自动机,实现创建及查找方法:
/** *Createdbyzhaoyyon2017/2/7. */ publicfinalclassWordTable{ privatefinalNoderoot; privateWordTable(Collection<?extendsCharSequence>words,NodeMakermaker){ Noderoot=buildTrie(words,maker); setFailNode(root); this.root=root; } publicstaticWordTablecompile(Collection<?extendsCharSequence>words){ if(words==null||words.isEmpty()) thrownewIllegalArgumentException(); finalNodeMakermaker; if(isAscii(words)) maker=newNodeMaker(){ @Override publicNodemake(Nodeparent,charvalue){ returnnewAsciiNode(parent,value); } @Override publicNodemakeRoot(){ returnnewAsciiNode(); } }; elsemaker=newNodeMaker(){ @Override publicNodemake(Nodeparent,charvalue){ returnnewMapNode(parent,value); } @Override publicNodemakeRoot(){ returnnewMapNode(); } }; returnnewWordTable(words,maker); } privatestaticbooleanisAscii(Collection<?extendsCharSequence>words){ for(CharSequenceword:words){ intlen=word.length(); for(inti=0;i<len;i++){ intc=(int)word.charAt(i); if(c<32||c>126) returnfalse; } } returntrue; } privatestaticNodebuildTrie(Collection<?extendsCharSequence>sequences,NodeMakermaker){ Noderoot=maker.makeRoot(); for(CharSequencesequence:sequences){ intlen=sequence.length(); Nodecurrent=root; for(inti=0;i<len;i++){ charc=sequence.charAt(i); Nodenode=current.childOf(c); if(node==null){ node=maker.make(current,c); current.add(node); } current=node; if(i==len-1) node.setExists(true); } } returnroot; } privatestaticvoidsetFailNode(finalNoderoot){ root.setFail(null); Queue<Node>queue=newLinkedList<Node>(); queue.add(root); while(!queue.isEmpty()){ Nodeparent=queue.poll(); Nodetemp; for(Nodechild:parent.children()){ if(parent.isRoot()) child.setFail(root); else{ temp=parent.fail(); while(temp!=null){ Nodenode=temp.childOf(child.value()); if(node!=null){ child.setFail(node); break; } temp=temp.fail(); } if(temp==null) child.setFail(root); } queue.add(child); } } } publicbooleanfindAnyIn(CharSequencecs){ intlen=cs.length(); Nodenode=root; for(inti=0;i<len;i++){ Nodenext=node.childOf(cs.charAt(i)); if(next==null){ next=node.fail(); if(next==null){ node=root; continue; } } if(next.exists()) returntrue; } returnfalse; } publicList<MatchInfo>search(CharSequencecs){ if(cs==null||cs.length()==0) returnCollections.emptyList(); List<MatchInfo>result=newArrayList<MatchInfo>(); intlen=cs.length(); Nodenode=root; for(inti=0;i<len;i++){ Nodenext=node.childOf(cs.charAt(i)); if(next==null){ next=node.fail(); if(next==null){ node=root; continue; } } if(next.exists()){ MatchInfoinfo=newMatchInfo(i,next); result.add(info); node=root; continue; } node=next; } returnresult; } @Override publicStringtoString(){ returnroot.toString(); } }
定义一个保存查找结果的实体:
/** *Createdbyzhaoyyon2017/2/7. */ publicfinalclassMatchInfo{ privatefinalintindex; privatefinalStringword; publicMatchInfo(intindex,Stringword){ this.index=index; this.word=word; } publicMatchInfo(intindex,Nodenode){ StringBuilderbuilder=newStringBuilder(); while(node!=null){ if(!node.isRoot()) builder.append(node.value()); node=node.parent(); } Stringword=builder.reverse().toString(); this.index=index+1-word.length(); this.word=word; } publicintgetIndex(){ returnindex; } publicStringgetWord(){ returnword; } @Override publicStringtoString(){ returnindex+":"+word; } }
第四步,调用Demo:
publicstaticvoidmain(String[]args){ List<String>list=Arrays.asList("say","her","he","she","shr","alone"); WordTabletable=WordTable.compile(list); System.out.println(table); System.out.println(table.search("1shesaynothingabouthislivinghimalone")); }
以下是输出结果:
<exists="false"parent="null"fail="null"> <sexists="false"parent=""fail=""> <aexists="false"parent="s"fail="a"> <yexists="true"parent="a"fail=""> </y> </a> <hexists="false"parent="s"fail="h"> <eexists="true"parent="h"fail="e"> </e> <rexists="true"parent="h"fail=""> </r> </h> </s> <hexists="false"parent=""fail=""> <eexists="true"parent="h"fail=""> <rexists="true"parent="e"fail=""> </r> </e> </h> <aexists="false"parent=""fail=""> <lexists="false"parent="a"fail=""> <oexists="false"parent="l"fail=""> <nexists="false"parent="o"fail=""> <eexists="true"parent="n"fail=""> </e> </n> </o> </l> </a> </> [1:she,4:say,31:alone]
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。