GO语言利用K近邻算法实现小说鉴黄
Usage:
go run kNN.go --file="data.txt"
关键是向量点的选择和阈值的判定
样本数据来自国家新闻出版总署发布通知公布的《40部淫秽色情网络小说名单》
packagemain
import(
"bufio"
"flag"
"fmt"
"io"
"log"
"math"
"os"
"path"
"path/filepath"
)
// Package-level settings for the classifier.
var (
	// debug enables per-feature diagnostic output in classify.
	debug = false
	// data_dir is the directory where the files to scan are stored.
	data_dir = "./moyan"
	// limen is the decision threshold compared against the distance
	// returned by classify.
	limen = 0.1159203888322267
)

// Inclusive code-point range that load treats as Chinese characters.
const (
	MIN_HANZI rune = 0x3400
	MAX_HANZI rune = 0x9fbb
)

// labels lists the feature characters; classify builds one vector
// component per entry, in this order.
var labels = []rune{
	0x817f, // 腿
	0x80f8, // 胸
	0x4e73, // 乳
	0x81c0, // 臀
	0x5c41, // 屁
	0x80a1, // 股
	0x88f8, // 裸
	0x6deb, // 淫
}
// errHandle terminates the program through log.Fatal when err is non-nil
// and does nothing otherwise.
func errHandle(err error) {
	if err == nil {
		return
	}
	log.Fatal(err)
}
funcload(namestring)(mmap[rune]int,errerror){
f,err:=os.Open(name)
iferr!=nil{
returnnil,err
}
deferf.Close()
buf:=bufio.NewReader(f)
m=make(map[rune]int)
varrrune
for{
r,_,err=buf.ReadRune()
iferr!=nil{
iferr==io.EOF{
break
}
returnnil,err
}
ifr>=MIN_HANZI&&r<=MAX_HANZI{
m[r]+=1
}
}
returnm,nil
}
funcclassify(mmap[rune]int)(idv[]float64,disfloat64){
len_m:=len(m)
fori,v:=rangelabels{
ifdebug{
fmt.Println(i,m[v],string(v),float64(m[v])/float64(len_m))
}
idv=append(idv,float64(m[v])/float64(len_m))
}
for_,v:=rangeidv{
dis+=math.Pow(v,2)
}
dis=math.Sqrt(dis)
return
}
funccheck(fpstring,disfloat64){
switch{
casedis>=limen:
fmt.Println(fp,dis,"涉黄")
casedis==1.0:
fmt.Println(fp,dis,"你在作弊吗")
casedis==0:
fmt.Println(fp,dis,"检查一下文件字符编码是不是utf8格式吧")
default:
fmt.Println(fp,dis,"正常")
}
}
funcwalkFunc(fpstring,infoos.FileInfo,errerror)error{
ifpath.Ext(fp)==".txt"{
m,err:=load(fp)
errHandle(err)
_,dis:=classify(m)
check(fp,dis)
}
returnerr
}
varfilestring
funcinit(){
_,err:=os.Stat(data_dir)
iferr!=nil{
err=os.Mkdir(data_dir,os.ModePerm)
errHandle(err)
}
flag.StringVar(&file,"file","","filereadin,ifyoudon'tgivethefilereadin,"+
"itwillcreateadatadictionary,justpustyourfilesinit")
}
funcmain(){
flag.Parse()
iffile==""{
filepath.Walk(data_dir,walkFunc)
return
}
m,err:=load(file)
errHandle(err)
_,dis:=classify(m)
check(file,dis)
}
以上所述就是本文的全部内容了,希望大家能够喜欢。