python机器学习库xgboost的使用

2023-08-02 18:26:04 273

1.数据读取

利用原生xgboost库读取libsvm数据

# Read a libsvm-format file directly into XGBoost's own DMatrix container.
import xgboost as xgb

# `libsvm文件` is the article's placeholder name (Chinese for "libsvm file");
# bind it to the path of the libsvm file before running this snippet.
data = xgb.DMatrix(libsvm文件)

使用sklearn读取libsvm数据

# Read a libsvm-format file with scikit-learn, yielding features and labels.
from sklearn.datasets import load_svmlight_file

# `libsvm文件` is the article's placeholder name (Chinese for "libsvm file");
# bind it to the path of the libsvm file before running this snippet.
X_train, y_train = load_svmlight_file(libsvm文件)

使用pandas读取完数据后再转化为标准形式

2.模型训练过程

1.未调参基线模型

使用xgboost原生库进行训练

# Untuned baseline trained through the native xgboost API.
# f_train / l_train / f_test / l_test (features and labels) are not defined in
# this snippet; they must be prepared beforehand.
import xgboost as xgb
from matplotlib import pyplot  # needed for pyplot.show() below
from sklearn.metrics import accuracy_score
from xgboost import plot_importance  # plots feature importance

dtrain = xgb.DMatrix(f_train, label=l_train)
dtest = xgb.DMatrix(f_test, label=l_test)
# NOTE(review): 'silent' looks version-dependent (later xgboost releases use
# 'verbosity') -- confirm against the installed version.
param = {'max_depth': 2, 'eta': 1, 'silent': 0, 'objective': 'binary:logistic'}
num_round = 2
bst = xgb.train(param, dtrain, num_round)
train_preds = bst.predict(dtrain)
# Round each predicted value to 0 or 1 (acts as a thresholded sign function).
train_predictions = [round(value) for value in train_preds]
# Compare the rounded predictions with the true labels using scikit-learn.
train_accuracy = accuracy_score(l_train, train_predictions)
print("TrainAccuary:%.2f%%" % (train_accuracy * 100.0))

# Plot the feature-importance ranking of the trained booster.
plot_importance(bst)
pyplot.show()

使用XGBClassifier进行训练

# Baseline with the scikit-learn wrapper: no early stopping configured and no
# DMatrix conversion of the inputs.
# f_train / l_train / f_test / l_test are not defined in this snippet; they
# must be prepared beforehand.
from matplotlib import pyplot
# load_svmlight_file reads svmlight/libsvm files directly; otherwise
# xgboost.DMatrix(filename) is needed to read that format.
from sklearn.datasets import load_svmlight_file
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from xgboost import plot_importance  # plots feature importance

num_round = 100
# With too few weak trees, fewer features show up in the importance ranking.
# NOTE(review): 'silent' looks version-dependent (later xgboost releases use
# 'verbosity') -- confirm against the installed version.
bst1 = XGBClassifier(
    max_depth=2,
    learning_rate=1,
    n_estimators=num_round,
    silent=True,
    objective='binary:logistic',
)
bst1.fit(f_train, l_train)

# Accuracy on the training data.
train_preds = bst1.predict(f_train)
train_accuracy = accuracy_score(l_train, train_preds)
print("TrainAccuary:%.2f%%" % (train_accuracy * 100.0))

# Accuracy on the held-out test data.
preds = bst1.predict(f_test)
test_accuracy = accuracy_score(l_test, preds)
print("TestAccuracy:%.2f%%" % (test_accuracy * 100.0))

# Plot the feature-importance ranking of the fitted classifier.
plot_importance(bst1)
pyplot.show()

2.两种交叉验证方式

使用cross_val_score进行交叉验证

# Cross-validate an XGBClassifier with sklearn.model_selection.
# f_train / l_train are not defined in this snippet; they must be prepared
# beforehand.
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from xgboost import plot_importance  # plots feature importance

num_round = 100
bst2 = XGBClassifier(
    max_depth=2,
    learning_rate=0.1,
    n_estimators=num_round,
    silent=True,
    objective='binary:logistic',
)
# Fit on the full training data so plot_importance below has a fitted model.
bst2.fit(f_train, l_train)
# random_state only has an effect when shuffling is enabled, and scikit-learn
# rejects random_state combined with shuffle=False, so shuffle explicitly.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)
# 10-fold cross-validation: 9 folds for training, 1 fold for testing.
results = cross_val_score(bst2, f_train, l_train, cv=kfold)
print(results)
print("CVAccuracy:%.2f%%(%.2f%%)" % (results.mean() * 100, results.std() * 100))

# Plot the feature-importance ranking of the fitted classifier.
plot_importance(bst2)
pyplot.show()

使用GridSearchCV进行网格搜索

# Grid search with scikit-learn: find the best parameter value and use it for
# the final model.
# f_train / l_train / f_test / l_test are not defined in this snippet; they
# must be prepared beforehand.
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

bst = XGBClassifier(
    max_depth=2,
    learning_rate=0.1,
    silent=True,
    objective='binary:logistic',
)
# Candidate numbers of trees to try: 1 through 50.
param_test = {
    'n_estimators': range(1, 51, 1),
}
# 5-fold cross-validation, scored by accuracy.
clf = GridSearchCV(estimator=bst, param_grid=param_test, scoring='accuracy', cv=5)
# After fitting, predictions use the best parameters found.
clf.fit(f_train, l_train)

preds = clf.predict(f_test)

test_accuracy = accuracy_score(l_test, preds)
print("TestAccuracyofgridsearchcv:%.2f%%" % (test_accuracy * 100.0))

# Bare expression: displays the search results when run as the last line of an
# interactive/notebook cell; it prints nothing in a plain script.
clf.cv_results_, clf.best_params_, clf.best_score_

3.早停止调参–early_stopping_rounds（查看的是损失是否变化）

# Stand-alone example of early stopping.
# f_train / l_train / f_test / l_test are not defined in this snippet; they
# must be prepared beforehand.
import xgboost as xgb
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

num_round = 100
bst = XGBClassifier(
    max_depth=2,
    learning_rate=0.1,
    n_estimators=num_round,
    silent=True,
    objective='binary:logistic',
)
eval_set = [(f_test, l_test)]
# early_stopping_rounds: stop once the metric has not improved for this many
#   rounds.
# eval_set: data on which the metric is evaluated and reported.
# verbose: print how the error rate changes during training.
# NOTE(review): newer xgboost releases appear to expect early_stopping_rounds
# and eval_metric on the constructor instead of fit() -- confirm against the
# installed version.
bst.fit(
    f_train,
    l_train,
    early_stopping_rounds=10,
    eval_metric="error",
    eval_set=eval_set,
    verbose=True,
)

# Make predictions on the test data.
preds = bst.predict(f_test)

test_accuracy = accuracy_score(l_test, preds)
print("TestAccuracy:%.2f%%" % (test_accuracy * 100.0))

4.多数据观察训练损失

# Watch several metrics on several evaluation sets while training.
# f_train / l_train / f_test / l_test are not defined in this snippet; they
# must be prepared beforehand.
import xgboost as xgb
from matplotlib import pyplot
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

num_round = 100
bst = XGBClassifier(
    max_depth=2,
    learning_rate=0.1,
    n_estimators=num_round,
    silent=True,
    objective='binary:logistic',
)
# Evaluate on both the training data and the test data.
eval_set = [(f_train, l_train), (f_test, l_test)]
# NOTE(review): newer xgboost releases appear to expect eval_metric on the
# constructor instead of fit() -- confirm against the installed version.
bst.fit(
    f_train,
    l_train,
    eval_metric=["error", "logloss"],
    eval_set=eval_set,
    verbose=True,
)

# Make predictions on the test data.
preds = bst.predict(f_test)
test_accuracy = accuracy_score(l_test, preds)
print("TestAccuracy:%.2f%%" % (test_accuracy * 100.0))

5.模型保存与读取

# Imported here so the snippet does not depend on imports from earlier ones.
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Save the model. `bst`, `f_test` and `l_test` are not defined in this
# snippet; they must exist beforehand.
bst.save_model('demo.model')

# Load the model back and predict with it.
modelfile = 'demo.model'

# Step 1: build a native Booster from the saved model file.
bst = xgb.Booster({'nthread': 8}, model_file=modelfile)

# Step 2: predict and score.
# Prefer xgboost's own DMatrix as the input container.
f_test1 = xgb.DMatrix(f_test)
ypred1 = bst.predict(f_test1)
# Round each predicted value to 0 or 1 before scoring.
train_predictions = [round(value) for value in ypred1]
test_accuracy1 = accuracy_score(l_test, train_predictions)
print("TestAccuracy:%.2f%%" % (test_accuracy1 * 100.0))

以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持毛票票。

声明：本文内容来源于网络，版权归原作者所有，内容由互联网用户自发贡献自行上传，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。如果您发现有涉嫌版权的内容，欢迎发送邮件至：czq8825#qq.com（发邮件时，请将#更换为@）进行举报，并提供相关证据，一经查实，本站将立刻删除涉嫌侵权内容。

python机器学习库xgboost的使用

热门推荐

随机推荐