Python使用gensim计算文档相似性
pre_file.py
# -*- coding: utf-8 -*-
"""Export crawled pages from MySQL as jieba-segmented text.

Reads every (link, content) row from ``test1.spider``, segments the content
with jieba, and writes one line per row to a UTF-8 text file: the link, a
space, then the tokens joined by '/'.

Targets Python 2.7 (uses ``reload(sys)`` / ``sys.setdefaultencoding``).
"""
import MySQLdb
import MySQLdb as mdb
import os
import sys
import string
import jieba
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# Raw string so the backslashes in the Windows path are never read as escapes.
OUTPUT_PATH = r'C:\Users\kk\Desktop\hello-result1.txt'

# Connect to the database.
# NOTE(review): credentials are hard-coded; consider moving them to config.
try:
    conn = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                       db='test1', charset='utf8')
except mdb.Error as e:
    print(e)
    # Exit with a non-zero status so callers can tell the export failed.
    sys.exit(1)

try:
    # DictCursor returns each row as a dict keyed by column name.
    cursor = conn.cursor(mdb.cursors.DictCursor)
    try:
        cursor.execute('SELECT link, content FROM test1.spider;')
        data = cursor.fetchall()  # all result rows at once
    finally:
        cursor.close()

    with codecs.open(OUTPUT_PATH, 'w', 'utf-8') as f:
        for row in data:
            # cut_all must be the boolean False: the string 'False' is truthy
            # and would silently switch jieba to full mode.
            seg = '/'.join(jieba.cut(row['content'], cut_all=False))
            f.write(row['link'] + ' ' + seg + '\r\n')
finally:
    # The script only runs a SELECT, so there is nothing to commit;
    # just release the connection.
    conn.close()
jiansuo.py
# -*- coding: utf-8 -*-
"""Score stored documents against a user query with TF-IDF similarity.

Loads all rows of ``cutresult_copy`` from MySQL, treats column 1 of each row
as a '/'-joined token string, builds a gensim dictionary and TF-IDF model
over those documents, then reads a query from stdin and prints the
similarity scores in descending order.

Targets Python 2.7 (uses ``reload(sys)``, ``raw_input``).
"""
import sys
import string
import MySQLdb
import MySQLdb as mdb
import gensim
from gensim import corpora, models, similarities
from gensim.similarities import MatrixSimilarity
import logging
import codecs

# Python 2 only: make implicit str <-> unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')

# NOTE(review): credentials are hard-coded; consider moving them to config.
con = mdb.connect(host='127.0.0.1', user='root', passwd='kongjunli',
                  db='test1', charset='utf8')
# Close the cursor and connection explicitly rather than relying on
# `with con:`, which has meant different things across MySQLdb releases.
try:
    cur = con.cursor()
    try:
        cur.execute('SELECT * FROM cutresult_copy')
        rows = cur.fetchall()
    finally:
        cur.close()
finally:
    con.close()


class MyCorpus(object):
    """Iterable yielding one token list per fetched row."""

    def __iter__(self):
        for row in rows:
            # Column 1 is split on '/' to recover the tokens.
            yield str(row[1]).split('/')


# Enable logging.
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',
                    level=logging.INFO)

Corp = MyCorpus()

# Convert the documents to TF-IDF vectors.
dictionary = corpora.Dictionary(Corp)
corpus = [dictionary.doc2bow(text) for text in Corp]  # bag-of-words vectors
tfidf = models.TfidfModel(corpus)  # fit the TF-IDF model on the corpus
corpus_tfidf = tfidf[corpus]  # TF-IDF weights for every document

# Disabled alternative: read the query from a file instead of stdin.
# q_file = open(r'C:\Users\kk\Desktop\q.txt', 'r')
# query = q_file.readline()
# q_file.close()
# vec_bow = dictionary.doc2bow(query.split(' '))
# vec_tfidf = tfidf[vec_bow]

# The query is split on whitespace, so it must be entered as
# space-separated terms.
query = raw_input('Enter your query: ')
vec_bow = dictionary.doc2bow(query.split())
vec_tfidf = tfidf[vec_bow]

# Passing num_features spares MatrixSimilarity an extra scan of the corpus.
index = similarities.MatrixSimilarity(corpus_tfidf,
                                      num_features=len(dictionary))
sims = index[vec_tfidf]
similarity = list(sims)
# NOTE(review): sorting the bare scores discards which document each score
# belongs to; sort enumerate(sims) pairs if document positions are needed.
print(sorted(similarity, reverse=True))
encodings.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Encoding">
    <file url="PROJECT" charset="UTF-8" />
  </component>
</project>
misc.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectLevelVcsManager" settingsEditedManually="false">
    <OptionsSetting value="true" id="Add" />
    <OptionsSetting value="true" id="Remove" />
    <OptionsSetting value="true" id="Checkout" />
    <OptionsSetting value="true" id="Update" />
    <OptionsSetting value="true" id="Status" />
    <OptionsSetting value="true" id="Edit" />
    <ConfirmationsSetting value="0" id="Add" />
    <ConfirmationsSetting value="0" id="Remove" />
  </component>
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.11 (C:\Python27\python.exe)" project-jdk-type="Python SDK" />
</project>
modules.xml
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/爬虫练习代码.iml" filepath="$PROJECT_DIR$/.idea/爬虫练习代码.iml" />
    </modules>
  </component>
</project>