Python 中 scikit-learn 机器学习代码实例
我们给大家带来了关于 Python 中 scikit-learn 机器学习代码的相关具体实例,以下就是全部代码内容:
# -*- coding: utf-8 -*-
import numpy

from sklearn import cross_validation
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
# NOTE: sklearn.cross_validation is the legacy module name used throughout
# this script (replaced by sklearn.model_selection in modern releases).
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import LinearSVC

# import iris_data
def load_data():
    """Load the iris dataset and split it 80/20 into train and test.

    Returns:
        (x_train, y_train, x_test, y_test) — note the order differs from
        train_test_split's own (x_train, x_test, y_train, y_test).
    """
    iris = load_iris()
    x, y = iris.data, iris.target
    # random_state=42 makes the split reproducible across runs.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=42)
    return x_train, y_train, x_test, y_test
def train_clf3(train_data, train_tags):
    """Train a linear SVM classifier.

    NOTE: LinearSVC is always a linear SVM — it has no 'rbf' kernel option
    (the original comment claiming an 'rbf' default was wrong; use SVC for
    kernel SVMs). C=1100.0 is a very weak regularization setting.
    """
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf
def train_clf(train_data, train_tags):
    """Train a multinomial naive Bayes classifier (alpha=0.01 smoothing)."""
    clf = MultinomialNB(alpha=0.01)
    # Debug output: show the label array being fitted.
    print(numpy.asarray(train_tags))
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf
def evaluate(actual, pred):
    """Print precision, recall and F1 of *pred* against *actual*."""
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(metrics.f1_score(actual, pred)))
# Script entry: train NB on iris, predict the held-out split, report metrics.
x_train, y_train, x_test, y_test = load_data()
clf = train_clf(x_train, y_train)
pred = clf.predict(x_test)
evaluate(numpy.asarray(y_test), pred)
print(metrics.classification_report(y_test, pred))
下面是使用自定义数据的完整示例:
# coding: utf-8
import codecs

import numpy
# NOTE: sklearn.cross_validation is the legacy module name used throughout
# this script (replaced by sklearn.model_selection in modern releases).
from sklearn import cross_validation
from sklearn import linear_model
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# Toy training documents (pre-segmented Chinese text).
train_corpus = [
    '我们我们好孩子认证。就是',
    '我们好孩子认证。中国',
    '我们好孩子认证。孤独',
    '我们好孩子认证。',
]
# Toy test documents.
test_corpus = [
    '我菲律宾韩国',
    '我们好孩子认证。中国',
]
def input_data(train_file, test_file):
    """Read a train and a test file of ``tag:[w1,w2,...]`` lines.

    For each line, the text before the first ':' is the tag; the remainder
    is stripped of its leading '[' and its last three characters, and the
    comma-separated tokens are concatenated into one word string.

    Fixes over the original: both files are closed via ``with`` (they were
    leaked before), and the identical train/test parse loops are shared.

    Returns:
        (train_words, train_tags, test_words, test_tags)
    """
    def _parse(filename):
        # One pass over a tag:[...] file; returns (words, tags).
        words, tags = [], []
        # errors='ignore' silently drops undecodable bytes, as before.
        with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
            for line in f:
                tag, _, word_list = line.partition(':')
                # NOTE(review): dropping the last 3 chars assumes each line
                # ends with something like ']\r\n' — confirm the data format.
                word_array = word_list[1:(len(word_list) - 3)].split(",")
                words.append("".join(word_array))
                tags.append(tag)
        return words, tags

    train_words, train_tags = _parse(train_file)
    test_words, test_tags = _parse(test_file)
    return train_words, train_tags, test_words, test_tags
def vectorize(train_words, test_words):
    """Hash both corpora into sparse feature matrices.

    HashingVectorizer is stateless, so calling fit_transform on the test
    corpus independently still yields columns aligned with the train matrix.
    """
    # v = HashingVectorizer(n_features=25000, non_negative=True)
    v = HashingVectorizer(non_negative=True)
    # v = CountVectorizer(min_df=1)
    train_data = v.fit_transform(train_words)
    test_data = v.fit_transform(test_words)
    return train_data, test_data
def vectorize1(train_words, test_words):
    """TF-IDF vectorize train, then vectorize test with the train vocabulary.

    Reusing tv.vocabulary_ keeps the test matrix's columns aligned with the
    training matrix's columns.
    """
    tv = TfidfVectorizer(sublinear_tf=False, use_idf=True)
    train_data = tv.fit_transform(train_words)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    test_data = tv2.fit_transform(test_words)
    return train_data, test_data
def vectorize2(train_words, test_words):
    """Count-vectorize both corpora, then apply a shared TF-IDF weighting.

    The test corpus is counted with the training vocabulary so its columns
    align with the training matrix.
    """
    count_v1 = CountVectorizer(stop_words='english', max_df=0.5)
    counts_train = count_v1.fit_transform(train_words)
    count_v2 = CountVectorizer(vocabulary=count_v1.vocabulary_)
    counts_test = count_v2.fit_transform(test_words)
    tfidftransformer = TfidfTransformer()
    train_data = tfidftransformer.fit_transform(counts_train)
    # Bug fix: the original re-fit the transformer on the test counts, so
    # test vectors used test-set IDF weights. Reuse the train-fitted
    # transformer so train and test share the same IDF weighting.
    test_data = tfidftransformer.transform(counts_test)
    return train_data, test_data
def evaluate(actual, pred):
    """Print precision, recall and F1 of *pred* against *actual*."""
    m_precision = metrics.precision_score(actual, pred)
    m_recall = metrics.recall_score(actual, pred)
    print('precision:{0:.3f}'.format(m_precision))
    print('recall:{0:0.3f}'.format(m_recall))
    print('f1-score:{0:.8f}'.format(metrics.f1_score(actual, pred)))
def train_clf(train_data, train_tags):
    """Train a multinomial naive Bayes classifier (alpha=0.01 smoothing)."""
    clf = MultinomialNB(alpha=0.01)
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf
def train_clf1(train_data, train_tags):
    """Train a KNN classifier (default k=5)."""
    clf = KNeighborsClassifier()
    clf.fit(train_data, numpy.asarray(train_tags))
    return clf
def train_clf2(train_data, train_tags):
    """Train a logistic-regression classifier (C=1e5: nearly unregularized)."""
    clf = linear_model.LogisticRegression(C=1e5)
    clf.fit(train_data, train_tags)
    return clf
def train_clf3(train_data, train_tags):
    """Train a linear SVM classifier.

    NOTE: LinearSVC is always a linear SVM — it has no 'rbf' kernel option
    (the original comment claiming an 'rbf' default was wrong; use SVC for
    kernel SVMs). C=1100.0 is a very weak regularization setting.
    """
    clf = LinearSVC(C=1100.0)
    clf.fit(train_data, train_tags)
    return clf
def train_clf4(train_data, train_tags):
    """Train a random forest (10 trees).

    随机森林,不可使用稀疏矩阵 — random forests here cannot take a sparse
    matrix, so the input is densified with .todense() first.
    """
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(train_data.todense(), train_tags)
    return clf
# Read a label file line by line with codecs.
def codecs_read_label_line(filename):
    """Return the lines of *filename* (UTF-8, undecodable bytes ignored)
    with their trailing newline removed.

    Fixes the original off-by-one: the old code unconditionally dropped the
    last character of every line, which truncated a final line that had no
    trailing newline. rstrip('\\n') removes the newline when present and is
    a no-op otherwise.
    """
    with codecs.open(filename, 'r', 'utf-8', 'ignore') as f:
        return [line.rstrip('\n') for line in f]
def save_test_features(test_url, test_label):
    """Pair each feature line from 'test.dat' with its predicted label and
    write them tab-separated, one pair per line, to 'test_labeded.dat'.

    NOTE(review): *test_url* is accepted but unused — the feature list is
    re-read from the hard-coded 'test.dat'. Kept as-is for backward
    compatibility; confirm whether the caller's list should be used instead.
    """
    test_feature_list = codecs_read_label_line('test.dat')
    # 'with' closes the output file even on error (the original leaked fw).
    with open('test_labeded.dat', "w+") as fw:
        for url, label in zip(test_feature_list, test_label):
            fw.write(url + '\t' + label)
            fw.write('\n')
def main():
    """Train a linear SVM on TF-IDF features, cross-validate, and report
    test-set errors and metrics."""
    train_file = u'..\\file\\py_train.txt'
    test_file = u'..\\file\\py_test.txt'
    train_words, train_tags, test_words, test_tags = input_data(
        train_file, test_file)
    # print(len(train_words), len(train_tags), len(test_words), len(test_tags))
    train_data, test_data = vectorize1(train_words, test_words)
    print(type(train_data))
    print(train_data.shape)
    print(test_data.shape)
    print(test_data[0].shape)
    print(numpy.asarray(test_data[0]))
    clf = train_clf3(train_data, train_tags)
    # 5-fold cross-validation on the training data, weighted-F1 scoring.
    scores = cross_validation.cross_val_score(
        clf, train_data, train_tags, cv=5, scoring="f1_weighted")
    print(scores)
    # predicted = cross_validation.cross_val_predict(clf, train_data, train_tags, cv=5)
    pred = clf.predict(test_data)
    error_list = []
    for true_tag, predict_tag in zip(test_tags, pred):
        if true_tag != predict_tag:
            print(true_tag, predict_tag)
            # NOTE(review): '' joins the tags with no separator — looks like
            # a space or tab was intended; kept as-is to preserve behavior.
            error_list.append(true_tag + '' + predict_tag)
    print(len(error_list))
    evaluate(numpy.asarray(test_tags), pred)
    # 输出打标签结果 (write out the labeled results):
    # test_feature_list = codecs_read_label_line('test.dat')
    # save_test_features(test_feature_list, pred)
if __name__ == '__main__':
    main()