python提取内容关键词的方法
本文实例讲述了python提取内容关键词的方法。分享给大家供大家参考。具体分析如下:
下面是一段非常高效的提取内容关键词的Python代码。这段代码只能用于英文文章内容;中文需要先进行分词,因此这段代码无法直接处理,不过只要加上分词功能,效果和英文是一样的。
#coding=UTF-8
import nltk
from nltk.corpus import brown
# This is a fast and simple noun phrase extractor (based on NLTK)
# Feel free to use it, just keep a link back to this post
# http://thetokenizer.com/2013/05/09/efficient-way-to-extract-the-main-topics-of-a-sentence/
# Created by Shlomi Babluki
# May, 2013
# This is our fast Part of Speech tagger
#############################################################################
# Training data: the tagged sentences of the Brown corpus 'news' category.
brown_train = brown.tagged_sents(categories='news')

# Last-resort tagger made of (pattern, tag) rules. The rules are listed from
# the most specific to the most general; the final '.*' rule tags anything
# that no earlier rule matched as a plain noun.
regexp_tagger = nltk.RegexpTagger(
    [
        # Integers and decimals. The dot is escaped so that only a literal
        # decimal point is accepted (an unescaped '.' also matched "1x5").
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
        (r'(-|:|;)$', ':'),
        # NOTE(review): this pattern only matches tokens made of zero or more
        # apostrophes; kept exactly as in the original rule set.
        (r'\'*$', 'MD'),
        (r'(The|the|A|a|An|an)$', 'AT'),
        (r'.*able$', 'JJ'),
        (r'^[A-Z].*$', 'NNP'),
        (r'.*ness$', 'NN'),
        (r'.*ly$', 'RB'),
        (r'.*s$', 'NNS'),
        (r'.*ing$', 'VBG'),
        (r'.*ed$', 'VBD'),
        (r'.*', 'NN'),
    ]
)

# Backoff chain: the bigram tagger falls back to the unigram tagger, which in
# turn falls back to the regular-expression rules above.
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)
#############################################################################
# This is our semi-CFG; Extend it according to your own needs
#############################################################################
# Merge rules for adjacent tags: each key is "<left tag>+<right tag>" and the
# value is the tag assigned to the merged pair.
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}
#############################################################################
class NPExtractor(object):
    """Extract the main topics (noun phrases) from one English sentence.

    The sentence is tokenized, tagged with the module-level ``bigram_tagger``,
    the tags are normalized, and adjacent tokens are merged according to the
    module-level ``cfg`` rules. Tokens whose final tag is ``NNP`` or ``NNI``
    are reported as topics.
    """

    def __init__(self, sentence):
        """Store the sentence that ``extract`` will analyze."""
        self.sentence = sentence

    def tokenize_sentence(self, sentence):
        """Split the sentence into single words/tokens."""
        return nltk.word_tokenize(sentence)

    def normalize_tags(self, tagged):
        """Normalize Brown corpus tags ("NN", "NN-TL", "NNS" -> "NN").

        Args:
            tagged: iterable of ``(word, tag)`` pairs.

        Returns:
            A new list of ``(word, tag)`` pairs where ``NP`` and ``NP-TL``
            become ``NNP``, any other ``-TL`` suffix is removed, and
            otherwise a trailing ``S`` is removed. Only the first rule that
            applies is used for each pair.
        """
        normalized = []
        for word, tag in tagged:
            if tag in ("NP-TL", "NP"):
                tag = "NNP"
            elif tag.endswith("-TL"):
                tag = tag[:-3]
            elif tag.endswith("S"):
                tag = tag[:-1]
            normalized.append((word, tag))
        return normalized

    def _merge_phrases(self, tags, grammar):
        """Merge adjacent ``(word, tag)`` pairs until no rule applies.

        Args:
            tags: list of ``(word, tag)`` pairs; modified in place.
            grammar: mapping from ``"<left tag>+<right tag>"`` to the tag of
                the merged pair.

        Returns:
            The same ``tags`` list after all merges.
        """
        merged = True
        while merged:
            merged = False
            for i in range(len(tags) - 1):
                left, right = tags[i], tags[i + 1]
                label = grammar.get("%s+%s" % (left[1], right[1]), '')
                if label:
                    # Keep a space between the words so the merged phrase
                    # stays readable ("new dashboard", not "newdashboard").
                    phrase = "%s %s" % (left[0], right[0])
                    tags[i:i + 2] = [(phrase, label)]
                    merged = True
                    # Restart the scan: the merged pair may combine again
                    # with one of its neighbours.
                    break
        return tags

    def extract(self):
        """Extract the main topics from the stored sentence.

        Returns:
            A list with the text of every token or merged phrase whose final
            tag is ``NNP`` or ``NNI``, in sentence order.
        """
        tokens = self.tokenize_sentence(self.sentence)
        tags = self.normalize_tags(bigram_tagger.tag(tokens))
        tags = self._merge_phrases(tags, cfg)
        # Add "NN" to the accepted tags to also report single, unmerged nouns.
        return [word for word, pos in tags if pos in ("NNP", "NNI")]
# Main method, just run "python np_extractor.py"
def main():
    """Run the extractor on a sample sentence and print the topics found."""
    # The words must be separated by spaces, otherwise the tokenizer sees the
    # whole sentence as a single token.
    sentence = "Swayy is a beautiful new dashboard for discovering and curating online content."
    np_extractor = NPExtractor(sentence)
    result = np_extractor.extract()
    # The parenthesized single-argument form works both as the Python 2 print
    # statement and as the Python 3 print function.
    print("This sentence is about: %s" % ", ".join(result))
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
希望本文所述对大家的Python程序设计有所帮助。