GO语言利用K近邻算法实现小说鉴黄
Usage:
go run kNN.go --file="data.txt"
关键是向量点的选择和阈值的判定
样本数据来自国家新闻出版总署发布通知公布的《40部淫秽色情网络小说名单》
// Command kNN scores UTF-8 text files by how often a fixed set of Hanzi
// feature characters occurs in them and reports every file whose score
// reaches a threshold.
//
// Usage:
//
//	go run kNN.go --file="data.txt"
//
// Without --file, every *.txt file under the data directory is checked.
package main

import (
	"bufio"
	"flag"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path/filepath"
)

// debug makes classify print the per-feature details when set to true.
var debug bool

// dataDir is the directory that is scanned when no --file flag is given.
var dataDir = "./moyan"

// limen is the threshold: a score greater than or equal to it is flagged.
var limen = 0.1159203888322267

// file holds the value of the --file flag.
var file string

// Only runes inside [minHanzi, maxHanzi] are counted by load.
const (
	minHanzi rune = 0x3400
	maxHanzi rune = 0x9fbb
)

// labels lists the code points of the feature characters; each one
// contributes one component to the vector built by classify.
var labels = []rune{
	0x817f, 0x80f8, 0x4e73, 0x81c0,
	0x5c41, 0x80a1, 0x88f8, 0x6deb,
}

// errHandle terminates the program via log.Fatal when err is non-nil.
func errHandle(err error) {
	if err != nil {
		log.Fatal(err)
	}
}

// load reads the file called name rune by rune and returns how many times
// each rune in [minHanzi, maxHanzi] occurs. On any open or read error the
// returned map is nil.
func load(name string) (m map[rune]int, err error) {
	f, err := os.Open(name)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	buf := bufio.NewReader(f)
	m = make(map[rune]int)
	for {
		r, _, err := buf.ReadRune()
		if err == io.EOF {
			return m, nil
		}
		if err != nil {
			return nil, err
		}
		if r >= minHanzi && r <= maxHanzi {
			m[r]++
		}
	}
}

// classify turns the rune counts in m into a feature vector and its length.
//
// idv has one entry per element of labels: the count of that rune divided by
// the number of distinct runes in m (len(m), not the total rune count). dis
// is the Euclidean norm of idv.
//
// An empty m yields an all-zero vector and dis == 0 instead of dividing zero
// by zero, so callers never see NaN.
func classify(m map[rune]int) (idv []float64, dis float64) {
	idv = make([]float64, 0, len(labels))
	distinct := float64(len(m))
	for i, v := range labels {
		var ratio float64
		if distinct > 0 {
			ratio = float64(m[v]) / distinct
		}
		if debug {
			fmt.Println(i, m[v], string(v), ratio)
		}
		idv = append(idv, ratio)
		dis += ratio * ratio
	}
	return idv, math.Sqrt(dis)
}

// verdict maps a score to the message printed by check.
//
// The two exact-value cases are tested before the threshold comparison:
// 1.0 is itself above the threshold, so testing the threshold first would
// make that case unreachable.
func verdict(dis float64) string {
	switch {
	case dis == 1.0:
		return "你在作弊吗"
	case dis == 0:
		return "检查一下文件字符编码是不是utf8格式吧"
	case dis >= limen:
		return "涉黄"
	default:
		return "正常"
	}
}

// check prints the path, the score and the verdict for one file.
func check(fp string, dis float64) {
	fmt.Println(fp, dis, verdict(dis))
}

// walkFunc is the filepath.Walk callback: it scores every regular entry
// whose extension is ".txt". Errors reported by Walk, or raised while
// loading a file, are returned so that the caller decides how to react.
func walkFunc(fp string, info os.FileInfo, err error) error {
	if err != nil {
		// info may be nil here, so bail out before touching it.
		return err
	}
	if info.IsDir() || filepath.Ext(fp) != ".txt" {
		return nil
	}
	m, err := load(fp)
	if err != nil {
		return fmt.Errorf("loading %q: %w", fp, err)
	}
	_, dis := classify(m)
	check(fp, dis)
	return nil
}

// ensureDataDir creates dataDir when os.Stat cannot find it.
func ensureDataDir() error {
	if _, err := os.Stat(dataDir); err != nil {
		return os.Mkdir(dataDir, os.ModePerm)
	}
	return nil
}

func main() {
	flag.StringVar(&file, "file", "", "file read in, if you don't give the file read in, "+
		"it will create a data directory, just put your files in it")
	flag.Parse()

	errHandle(ensureDataDir())

	if file == "" {
		errHandle(filepath.Walk(dataDir, walkFunc))
		return
	}

	m, err := load(file)
	errHandle(err)
	_, dis := classify(m)
	check(file, dis)
}
以上所述就是本文的全部内容了,希望大家能够喜欢。