Implementing the Naive Bayes Algorithm in Python
The code below implements a naive Bayes classifier (the version that assumes conditional independence between features), a model commonly used for spam filtering, and applies Laplace smoothing to the probability estimates.
For the theory behind naive Bayes, see the theory posts on this blog.
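As a quick sketch of the idea the code implements (the standard textbook formulation, stated here for reference): under the conditional-independence assumption, a document containing words $w_1, \dots, w_n$ is assigned the class

$$\hat{c} = \arg\max_{c \in \{0,1\}} \Big( \log P(c) + \sum_{i=1}^{n} \log P(w_i \mid c) \Big),$$

where the logarithms prevent the product of many small probabilities from underflowing. With Laplace smoothing on binary present/absent word features, each conditional probability is estimated as

$$P(w \mid c) = \frac{\mathrm{count}(w, c) + 1}{\mathrm{count}(c) + 2},$$

which is why trainNB0 below initializes the word counts to ones and the denominators to 2.0.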
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Naive Bayes classifier (set-of-words and bag-of-words models) with Laplace smoothing."""
import random
import re

import numpy as np


def loadDataSet():
    """Return a toy corpus of forum posts and their labels (1 = abusive, 0 = not)."""
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec


def createVocabList(dataSet):
    """Build the vocabulary: the union of all words seen in the corpus."""
    vocabSet = set()
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    return list(vocabSet)


def setOfWords2Vec(vocabList, inputSet):
    """Set-of-words model: mark each vocabulary word as present (1) or absent (0)."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my vocabulary!" % word)
    return returnVec


def trainNB0(trainMatrix, trainCategory):
    """Train the model: estimate the class prior and per-class word probabilities."""
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory) / float(numTrainDocs)  # prior P(class = 1)
    p0Num = np.ones(numWords)  # Laplace smoothing: start every word count at 1 ...
    p1Num = np.ones(numWords)
    p0Denom = 2.0              # ... and every denominator at 2
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Take logs so multiplying many small probabilities does not underflow.
    p1Vect = np.log(p1Num / p1Denom)
    p0Vect = np.log(p0Num / p0Denom)
    return p0Vect, p1Vect, pAbusive


def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Compare the log-posteriors of the two classes and return the winner."""
    p1 = np.sum(vec2Classify * p1Vec) + np.log(pClass1)
    p0 = np.sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if p1 > p0 else 0


def bagOfWords2VecMN(vocabList, inputSet):
    """Bag-of-words model: count how many times each vocabulary word occurs."""
    returnVec = [0] * len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec


def testingNB():
    """Sanity-check the trained model on two hand-made test posts."""
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid', 'garbage']
    thisDoc = np.array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as:', classifyNB(thisDoc, p0V, p1V, pAb))


def textParse(bigString):
    """Split a long string into lowercase tokens, dropping very short ones."""
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]


def spamTest():
    """Spam-filter test; needs 25 spam and 25 ham emails on disk."""
    docList = []
    classList = []
    for i in range(1, 26):
        wordList = textParse(open('email/spam/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(1)
        wordList = textParse(open('email/ham/%d.txt' % i).read())
        docList.append(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):  # hold out 10 random documents for testing
        randIndex = random.randrange(len(trainingSet))
        testSet.append(trainingSet[randIndex])
        del trainingSet[randIndex]
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList, docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V, p1V, pSpam = trainNB0(np.array(trainMat), np.array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList, docList[docIndex])
        if classifyNB(np.array(wordVector), p0V, p1V, pSpam) != classList[docIndex]:
            errorCount += 1
            print("classification error", docList[docIndex])
    print('the error rate is:', float(errorCount) / len(testSet))


if __name__ == '__main__':
    listOPosts, listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    print(myVocabList, '\n')
    # print(setOfWords2Vec(myVocabList, listOPosts[0]), '\n')
    trainMat = [setOfWords2Vec(myVocabList, postinDoc) for postinDoc in listOPosts]
    print(trainMat)
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(listClasses))
    print(pAb)
    print(p0V, '\n', p1V)
    testingNB()
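To run the spam-filter experiment as well, add a call to spamTest() at the end of the driver block. This is only a usage sketch: it assumes the email corpus referenced by the code is present on disk as email/spam/1.txt through email/spam/25.txt and email/ham/1.txt through email/ham/25.txt, relative to the script.

    spamTest()  # prints any misclassified documents and the error rate on the 10-email hold-out set

Since the 10 test emails are drawn at random, the reported error rate varies from run to run; averaging over several calls gives a more stable estimate.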
That's all for this article. I hope it helps with your studies, and thank you for supporting 毛票票.