python实现KNN近邻算法
示例:《电影类型分类》
获取数据来源
| 电影名称 | 打斗次数 | 接吻次数 | 电影类型 |
|---|---|---|---|
| CaliforniaMan | 3 | 104 | Romance |
| He'sNotReallyintoDudes | 8 | 95 | Romance |
| BeautifulWoman | 1 | 81 | Romance |
| KevinLongblade | 111 | 15 | Action |
| RoobSlayer3000 | 99 | 2 | Action |
| AmpedII | 88 | 10 | Action |
| Unknown | 18 | 90 | unknown |
数据显示:肉眼判断电影类型unknown是什么
frommatplotlibimportpyplotasplt
#用来正常显示中文标签
plt.rcParams["font.sans-serif"]=["SimHei"]
#电影名称
names=["CaliforniaMan","He'sNotReallyintoDudes","BeautifulWoman",
"KevinLongblade","RoboSlayer3000","AmpedII","Unknown"]
#类型标签
labels=["Romance","Romance","Romance","Action","Action","Action","Unknown"]
colors=["darkblue","red","green"]
colorDict={label:colorfor(label,color)inzip(set(labels),colors)}
print(colorDict)
#打斗次数,接吻次数
X=[3,8,1,111,99,88,18]
Y=[104,95,81,15,2,10,88]
plt.title("通过打斗次数和接吻次数判断电影类型",fontsize=18)
plt.xlabel("电影中打斗镜头出现的次数",fontsize=16)
plt.ylabel("电影中接吻镜头出现的次数",fontsize=16)
#绘制数据
foriinrange(len(X)):
#散点图绘制
plt.scatter(X[i],Y[i],color=colorDict[labels[i]])
#每个点增加描述信息
foriinrange(0,7):
plt.text(X[i]+2,Y[i]-1,names[i],fontsize=14)
plt.show()
问题分析:根据已知信息分析电影类型unknown是什么
核心思想:
未标记样本的类别由距离其最近的K个邻居的类别决定
距离度量:
一般距离计算使用欧式距离(用勾股定理计算距离),也可以采用曼哈顿距离(水平上和垂直上的距离之和)、余弦值和相似度(这是距离的另一种表达方式)。相比于上述距离,马氏距离更为精确,因为它能考虑很多因素,比如单位,由于在求协方差矩阵逆矩阵的过程中,可能不存在,而且若碰见3维及3维以上,求解过程中极其复杂,故可不使用马氏距离
知识扩展
- 马氏距离概念:表示数据的协方差距离
- 方差:数据集中各个点到均值点的距离的平方的平均值
- 标准差:方差的开方
- 协方差cov(x,y):E表示均值,D表示方差,x,y表示不同的数据集,xy表示数据集元素对应乘积组成数据集
cov(x,y)=E(xy)-E(x)*E(y)
cov(x,x)=D(x)
cov(x1+x2,y)=cov(x1,y)+cov(x2,y)
cov(ax,by)=abcov(x,y)
- 协方差矩阵:根据维度组成的矩阵,假设有三个维度,a,b,c
∑ij=[cov(a,a)cov(a,b)cov(a,c)cov(b,a)cov(b,b)cov(b,c)cov(c,a)cov(c,b)cov(c,c)]
算法实现:欧氏距离
编码实现
#自定义实现mytest1.py
importnumpyasnp
#创建数据集
defcreateDataSet():
features=np.array([[3,104],[8,95],[1,81],[111,15],
[99,2],[88,10]])
labels=["Romance","Romance","Romance","Action","Action","Action"]
returnfeatures,labels
defknnClassify(testFeature,trainingSet,labels,k):
"""
KNN算法实现,采用欧式距离
:paramtestFeature:测试数据集,ndarray类型,一维数组
:paramtrainingSet:训练数据集,ndarray类型,二维数组
:paramlabels:训练集对应标签,ndarray类型,一维数组
:paramk:k值,int类型
:return:预测结果,类型与标签中元素一致
"""
dataSetsize=trainingSet.shape[0]
"""
构建一个由dataSet[i]-testFeature的新的数据集diffMat
diffMat中的每个元素都是dataSet中每个特征与testFeature的差值(欧式距离中差)
"""
testFeatureArray=np.tile(testFeature,(dataSetsize,1))
diffMat=testFeatureArray-trainingSet
#对每个差值求平方
sqDiffMat=diffMat**2
#计算dataSet中每个属性与testFeature的差的平方的和
sqDistances=sqDiffMat.sum(axis=1)
#计算每个feature与testFeature之间的欧式距离
distances=sqDistances**0.5
"""
排序,按照从小到大的顺序记录distances中各个数据的位置
如distance=[5,9,0,2]
则sortedStance=[2,3,0,1]
"""
sortedDistances=distances.argsort()
#选择距离最小的k个点
classCount={}
foriinrange(k):
voteiLabel=labels[list(sortedDistances).index(i)]
classCount[voteiLabel]=classCount.get(voteiLabel,0)+1
#对k个结果进行统计、排序,选取最终结果,将字典按照value值从大到小排序
sortedclassCount=sorted(classCount.items(),key=lambdax:x[1],reverse=True)
returnsortedclassCount[0][0]
testFeature=np.array([100,200])
features,labels=createDataSet()
res=knnClassify(testFeature,features,labels,3)
print(res)
#使用python包实现mytest2.py
fromsklearn.neighborsimportKNeighborsClassifier
from.mytest1importcreateDataSet
features,labels=createDataSet()
k=5
clf=KNeighborsClassifier(k_neighbors=k)
clf.fit(features,labels)
#样本值
my_sample=[[18,90]]
res=clf.predict(my_sample)
print(res)
示例:《交友网站匹配效果预测》
数据来源:略
数据显示
importpandasaspd
importnumpyasnp
frommatplotlibimportpyplotasplt
frommpl_toolkits.mplot3dimportAxes3D
#数据加载
defloadDatingData(file):
datingData=pd.read_table(file,header=None)
datingData.columns=["FlightDistance","PlaytimePreweek","IcecreamCostPreweek","label"]
datingTrainData=np.array(datingData[["FlightDistance","PlaytimePreweek","IcecreamCostPreweek"]])
datingTrainLabel=np.array(datingData["label"])
returndatingData,datingTrainData,datingTrainLabel
#3D图显示数据
defdataView3D(datingTrainData,datingTrainLabel):
plt.figure(1,figsize=(8,3))
plt.subplot(111,projection="3d")
plt.scatter(np.array([datingTrainData[x][0]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="smallDoses"]),
np.array([datingTrainData[x][1]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="smallDoses"]),
np.array([datingTrainData[x][2]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="smallDoses"]),c="red")
plt.scatter(np.array([datingTrainData[x][0]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="didntLike"]),
np.array([datingTrainData[x][1]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="didntLike"]),
np.array([datingTrainData[x][2]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="didntLike"]),c="green")
plt.scatter(np.array([datingTrainData[x][0]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="largeDoses"]),
np.array([datingTrainData[x][1]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="largeDoses"]),
np.array([datingTrainData[x][2]
forxinrange(len(datingTrainLabel))
ifdatingTrainLabel[x]=="largeDoses"]),c="blue")
plt.xlabel("飞行里程数",fontsize=16)
plt.ylabel("视频游戏耗时百分比",fontsize=16)
plt.clabel("冰淇凌消耗",fontsize=16)
plt.show()
datingData,datingTrainData,datingTrainLabel=loadDatingData(FILEPATH1)
datingView3D(datingTrainData,datingTrainLabel)
问题分析:抽取数据集的前10%在数据集的后90%进行测试
编码实现
#自定义方法实现
importpandasaspd
importnumpyasnp
#数据加载
defloadDatingData(file):
datingData=pd.read_table(file,header=None)
datingData.columns=["FlightDistance","PlaytimePreweek","IcecreamCostPreweek","label"]
datingTrainData=np.array(datingData[["FlightDistance","PlaytimePreweek","IcecreamCostPreweek"]])
datingTrainLabel=np.array(datingData["label"])
returndatingData,datingTrainData,datingTrainLabel
#数据归一化
defautoNorm(datingTrainData):
#获取数据集每一列的最值
minValues,maxValues=datingTrainData.min(0),datingTrainData.max(0)
diffValues=maxValues-minValues
#定义形状和datingTrainData相似的最小值矩阵和差值矩阵
m=datingTrainData.shape(0)
minValuesData=np.tile(minValues,(m,1))
diffValuesData=np.tile(diffValues,(m,1))
normValuesData=(datingTrainData-minValuesData)/diffValuesData
returnnormValuesData
#核心算法实现
defKNNClassifier(testData,trainData,trainLabel,k):
m=trainData.shape(0)
testDataArray=np.tile(testData,(m,1))
diffDataArray=(testDataArray-trainData)**2
sumDataArray=diffDataArray.sum(axis=1)**0.5
#对结果进行排序
sumDataSortedArray=sumDataArray.argsort()
classCount={}
foriinrange(k):
labelName=trainLabel[list(sumDataSortedArray).index(i)]
classCount[labelName]=classCount.get(labelName,0)+1
classCount=sorted(classCount.items(),key=lambdax:x[1],reversed=True)
returnclassCount[0][0]
#数据测试
defdatingTest(file):
datingData,datingTrainData,datingTrainLabel=loadDatingData(file)
normValuesData=autoNorm(datingTrainData)
errorCount=0
ratio=0.10
total=datingTrainData.shape(0)
numberTest=int(total*ratio)
foriinrange(numberTest):
res=KNNClassifier(normValuesData[i],normValuesData[numberTest:m],datingTrainLabel,5)
ifres!=datingTrainLabel[i]:
errorCount+=1
print("Thetotalerrorrateis:{}\n".format(error/float(numberTest)))
if__name__=="__main__":
FILEPATH="./datingTestSet1.txt"
datingTest(FILEPATH)
#python第三方包实现
importpandasaspd
importnumpyasnp
fromsklearn.neighborsimportKNeighborsClassifier
if__name__=="__main__":
FILEPATH="./datingTestSet1.txt"
datingData,datingTrainData,datingTrainLabel=loadDatingData(FILEPATH)
normValuesData=autoNorm(datingTrainData)
errorCount=0
ratio=0.10
total=normValuesData.shape[0]
numberTest=int(total*ratio)
k=5
clf=KNeighborsClassifier(n_neighbors=k)
clf.fit(normValuesData[numberTest:total],datingTrainLabel[numberTest:total])
foriinrange(numberTest):
res=clf.predict(normValuesData[i].reshape(1,-1))
ifres!=datingTrainLabel[i]:
errorCount+=1
print("Thetotalerrorrateis:{}\n".format(errorCount/float(numberTest)))
以上就是python实现KNN近邻算法的详细内容,更多关于python实现KNN近邻算法的资料请关注毛票票其它相关文章!