Wine Dataset Classification
from sklearn import tree
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
wine = load_wine()
Inspect the contents of wine:
{'data': array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
1.065e+03],
[1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
1.050e+03],
[1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
1.185e+03],
...,
[1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
8.350e+02],
[1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
8.400e+02],
[1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
5.600e+02]]),
'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2]),
'frame': None,
'target_names': array(['class_0', 'class_1', 'class_2'], dtype='<U7'),
'DESCR': '.. _wine_dataset:\n\nWine recognition dataset\n------------------------\n\n**Data Set Characteristics:**\n\n :Number of Instances: 178 (50 in each of three classes)\n :Number of Attributes: 13 numeric, predictive attributes and the class\n :Attribute Information:\n \t\t- Alcohol\n \t\t- Malic acid\n \t\t- Ash\n\t\t- Alcalinity of ash \n \t\t- Magnesium\n\t\t- Total phenols\n \t\t- Flavanoids\n \t\t- Nonflavanoid phenols\n \t\t- Proanthocyanins\n\t\t- Color intensity\n \t\t- Hue\n \t\t- OD280/OD315 of diluted wines\n \t\t- Proline\n\n - class:\n - class_0\n - class_1\n - class_2\n\t\t\n :Summary Statistics:\n \n ============================= ==== ===== ======= =====\n Min Max Mean SD\n ============================= ==== ===== ======= =====\n Alcohol: 11.0 14.8 13.0 0.8\n Malic Acid: 0.74 5.80 2.34 1.12\n Ash: 1.36 3.23 2.36 0.27\n Alcalinity of Ash: 10.6 30.0 19.5 3.3\n Magnesium: 70.0 162.0 99.7 14.3\n Total Phenols: 0.98 3.88 2.29 0.63\n Flavanoids: 0.34 5.08 2.03 1.00\n Nonflavanoid Phenols: 0.13 0.66 0.36 0.12\n Proanthocyanins: 0.41 3.58 1.59 0.57\n Colour Intensity: 1.3 13.0 5.1 2.3\n Hue: 0.48 1.71 0.96 0.23\n OD280/OD315 of diluted wines: 1.27 4.00 2.61 0.71\n Proline: 278 1680 746 315\n ============================= ==== ===== ======= =====\n\n :Missing Attribute Values: None\n :Class Distribution: class_0 (59), class_1 (71), class_2 (48)\n :Creator: R.A. Fisher\n :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n :Date: July, 1988\n\nThis is a copy of UCI ML Wine recognition datasets.\nhttps://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data\n\nThe data is the results of a chemical analysis of wines grown in the same\nregion in Italy by three different cultivators. There are thirteen different\nmeasurements taken for different constituents found in the three types of\nwine.\n\nOriginal Owners: \n\nForina, M. et al, PARVUS - \nAn Extendible Package for Data Exploration, Classification and Correlation. \nInstitute of Pharmaceutical and Food Analysis and Technologies,\nVia Brigata Salerno, 16147 Genoa, Italy.\n\nCitation:\n\nLichman, M. (2013). UCI Machine Learning Repository\n[https://archive.ics.uci.edu/ml]. Irvine, CA: University of California,\nSchool of Information and Computer Science. \n\n.. topic:: References\n\n (1) S. Aeberhard, D. Coomans and O. de Vel, \n Comparison of Classifiers in High Dimensional Settings, \n Tech. Rep. no. 92-02, (1992), Dept. of Computer Science and Dept. of \n Mathematics and Statistics, James Cook University of North Queensland. \n (Also submitted to Technometrics). \n\n The data was used with many others for comparing various \n classifiers. The classes are separable, though only RDA \n has achieved 100% correct classification. \n (RDA : 100%, QDA 99.4%, LDA 98.9%, 1NN 96.1% (z-transformed data)) \n (All results using the leave-one-out technique) \n\n (2) S. Aeberhard, D. Coomans and O. de Vel, \n "THE CLASSIFICATION PERFORMANCE OF RDA" \n Tech. Rep. no. 92-01, (1992), Dept. of Computer Science and Dept. of \n Mathematics and Statistics, James Cook University of North Queensland. \n (Also submitted to Journal of Chemometrics).\n',
'feature_names': ['alcohol',
'malic_acid',
'ash',
'alcalinity_of_ash',
'magnesium',
'total_phenols',
'flavanoids',
'nonflavanoid_phenols',
'proanthocyanins',
'color_intensity',
'hue',
'od280/od315_of_diluted_wines',
'proline']}
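Printing the whole Bunch object is noisy. A quicker way to get oriented is to check its keys and array shapes directly (a minimal sketch using the standard Bunch attributes shown above):
wine.keys()            ## dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])
wine.data.shape        ## (178, 13): 178 samples, 13 features
wine.target.shape      ## (178,): one class label per sample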
## target labels
wine.target
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2])
We can see that the wine samples are divided into three classes: 0, 1, and 2.
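To count the samples in each class, numpy's bincount gives a quick tally (a small sketch; the 59/71/48 split matches the class distribution listed in DESCR):
import numpy as np
np.bincount(wine.target)   ## array([59, 71, 48]) for class_0, class_1, class_2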
## feature matrix
wine.data
array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
1.065e+03],
[1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
1.050e+03],
[1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
1.185e+03],
...,
[1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
8.350e+02],
[1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
8.400e+02],
[1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
5.600e+02]])
This is the feature matrix of wine, but the raw array is hard to read; it is much clearer as a table.
## use pandas to turn the arrays into a table
import pandas as pd
pd.concat([pd.DataFrame(wine.data),pd.DataFrame(wine.target)],axis=1)
 | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 0 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.23 | 1.71 | 2.43 | 15.6 | 127.0 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065.0 | 0 |
1 | 13.20 | 1.78 | 2.14 | 11.2 | 100.0 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050.0 | 0 |
2 | 13.16 | 2.36 | 2.67 | 18.6 | 101.0 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185.0 | 0 |
3 | 14.37 | 1.95 | 2.50 | 16.8 | 113.0 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480.0 | 0 |
4 | 13.24 | 2.59 | 2.87 | 21.0 | 118.0 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735.0 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173 | 13.71 | 5.65 | 2.45 | 20.5 | 95.0 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740.0 | 2 |
174 | 13.40 | 3.91 | 2.48 | 23.0 | 102.0 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750.0 | 2 |
175 | 13.27 | 4.28 | 2.26 | 20.0 | 120.0 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835.0 | 2 |
176 | 13.17 | 2.59 | 2.37 | 20.0 | 120.0 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840.0 | 2 |
177 | 14.13 | 4.10 | 2.74 | 24.5 | 96.0 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560.0 | 2 |
178 rows × 14 columns
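The concatenated table only has numeric column headers. A clearer variant (a minimal sketch) names the columns with wine.feature_names and appends the target as a labeled column:
df = pd.DataFrame(wine.data, columns=wine.feature_names)
df["target"] = wine.target   ## add the class label as a named column
df.head()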
## feature names
wine.feature_names
['alcohol',
'malic_acid',
'ash',
'alcalinity_of_ash',
'magnesium',
'total_phenols',
'flavanoids',
'nonflavanoid_phenols',
'proanthocyanins',
'color_intensity',
'hue',
'od280/od315_of_diluted_wines',
'proline']
wine.target_names
array(['class_0', 'class_1', 'class_2'], dtype='<U7')
Training and test data
## split into a training set and a test set; 70% of the data is used for training
## train_test_split is the function imported at the top
## the training and test samples are drawn at random
Xtrain,Xtest,Ytrain,Ytest=train_test_split(wine.data,wine.target,test_size=0.3)
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf = clf.fit(Xtrain,Ytrain)
score = clf.score(Xtest,Ytest) ## returns the prediction accuracy on the test set
Check the score:
score
0.9259259259259259
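With test_size=0.3 on 178 samples, sklearn rounds the test set up to 54 samples, leaving 124 for training; this can be verified directly:
Xtrain.shape   ## (124, 13)
Xtest.shape    ## (54, 13)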
Display the classification result as a tree diagram.
The class names default to 0, 1, and 2; here they are replaced with friendly display labels to make the plot easier to read.
feature_name=wine.feature_names
import graphviz
doc_data=tree.export_graphviz(clf
                             ,feature_names=feature_name
                             ,class_names=["Gin","Sherry","Vermouth"]  ## arbitrary display labels for classes 0/1/2
                             ,filled=True   ## color nodes by predicted class
                             ,rounded=True  ## draw nodes as rounded rectangles
                             )
graph=graphviz.Source(doc_data)
graph
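The rendered tree can also be saved to disk via graphviz.Source's render method (a sketch; "wine_tree" is an arbitrary file name):
graph.render("wine_tree")   ## writes wine_tree (DOT source) and wine_tree.pdf to the working directory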
Nodes of the same predicted class share a color, with the shade indicating purity: the lower the entropy, the darker the node.
## the larger the value, the more the feature contributes to the tree
## features that never appear in the tree have an importance of 0
## the feature split at the root typically contributes the most
clf.feature_importances_
array([0.17046776, 0. , 0. , 0. , 0. ,
0. , 0.43193696, 0. , 0. , 0.3416575 ,
0.05593778, 0. , 0. ])
## pair each feature with its importance value
[*zip(feature_name,clf.feature_importances_)]
First time seeing zip used this way: it pairs the two sequences element by element, and the leading * unpacks the iterator into a list.
[('alcohol', 0.17046776430294286),
('malic_acid', 0.0),
('ash', 0.0),
('alcalinity_of_ash', 0.0),
('magnesium', 0.0),
('total_phenols', 0.0),
('flavanoids', 0.43193696031267537),
('nonflavanoid_phenols', 0.0),
('proanthocyanins', 0.0),
('color_intensity', 0.34165749849006455),
('hue', 0.05593777689431736),
('od280/od315_of_diluted_wines', 0.0),
('proline', 0.0)]
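To rank the features, the same pairs can be sorted by importance (a small sketch):
sorted(zip(feature_name, clf.feature_importances_), key=lambda t: t[1], reverse=True)
## flavanoids, color_intensity, alcohol, and hue lead; every other feature has importance 0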
The score above differs from run to run because the training and test sets are re-split at random on each execution, so the result necessarily varies.
## fixing random_state (the exact value does not matter) makes the tree, and hence the score, reproducible for a given split
clf = tree.DecisionTreeClassifier(criterion="entropy",random_state=0)
clf = clf.fit(Xtrain,Ytrain)
score = clf.score(Xtest,Ytest) ## returns the prediction accuracy on the test set
score
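Note that random_state on the classifier only fixes the randomness inside tree building; the train/test split itself is still random from session to session. For a fully reproducible score, the split can be seeded too (a sketch; Xtr/Xte/Ytr/Yte are new names so the split above is untouched, and 420 is an arbitrary seed):
Xtr,Xte,Ytr,Yte = train_test_split(wine.data, wine.target, test_size=0.3, random_state=420)  ## seeded split
tree.DecisionTreeClassifier(criterion="entropy", random_state=0).fit(Xtr, Ytr).score(Xte, Yte)  ## identical on every run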
doc_data=tree.export_graphviz(clf
                             ,feature_names=feature_name
                             ,class_names=["Gin","Sherry","Vermouth"]
                             ,filled=True   ## color nodes by predicted class
                             ,rounded=True  ## draw nodes as rounded rectangles
                             )
graph=graphviz.Source(doc_data)
graph
The splitter parameter
## splitter defaults to "best"; the alternative is "random"
## neither value is guaranteed to produce a higher final score
clf = tree.DecisionTreeClassifier(criterion="entropy"
,random_state=30
,splitter="random"
)
clf = clf.fit(Xtrain,Ytrain)
score = clf.score(Xtest,Ytest) ## returns the prediction accuracy on the test set
score
doc_data=tree.export_graphviz(clf
                             ,feature_names=feature_name
                             ,class_names=["Gin","Sherry","Vermouth"]
                             ,filled=True   ## color nodes by predicted class
                             ,rounded=True  ## draw nodes as rounded rectangles
                             )
graph=graphviz.Source(doc_data)
graph
## check how well the tree fits the training set
score_train=clf.score(Xtrain,Ytrain)
score_train
1.0
The training score (1.0) is higher than the test score, so the tree is overfitting and needs pruning.
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                 ,random_state=30
                                 ,splitter="random"
                                 # ,max_depth=3          ## maximum depth of 3
                                 # ,min_samples_leaf=5   ## minimum number of samples in a leaf
                                 # ,min_samples_split=5  ## minimum number of samples needed to split a node
                                 )
clf = clf.fit(Xtrain,Ytrain)
doc_data=tree.export_graphviz(clf
                             ,feature_names=feature_name
                             ,class_names=["Gin","Sherry","Vermouth"]
                             ,filled=True   ## color nodes by predicted class
                             ,rounded=True  ## draw nodes as rounded rectangles
                             )
graph=graphviz.Source(doc_data)
graph
score = clf.score(Xtest,Ytest) ## returns the prediction accuracy on the test set
score
0.9074074074074074
Pruning can be done in several ways, for example via
max_depth
min_samples_leaf
min_samples_split
Below, the tree is pruned by limiting the maximum depth.
clf = tree.DecisionTreeClassifier(criterion="entropy"
                                 ,random_state=30
                                 ,splitter="random"
                                 ,max_depth=3            ## limit the maximum depth to 3
                                 # ,min_samples_leaf=5   ## minimum number of samples in a leaf
                                 # ,min_samples_split=5  ## minimum number of samples needed to split a node
                                 )
clf = clf.fit(Xtrain,Ytrain)
doc_data=tree.export_graphviz(clf
                             ,feature_names=feature_name
                             ,class_names=["Gin","Sherry","Vermouth"]
                             ,filled=True   ## color nodes by predicted class
                             ,rounded=True  ## draw nodes as rounded rectangles
                             )
graph=graphviz.Source(doc_data)
graph
score = clf.score(Xtest,Ytest) ## returns the prediction accuracy on the test set
score
0.9444444444444444
But the best maximum depth is usually not known in advance, so it can be searched for:
a for loop tries depths 1 through 10, records the score for each, and plots the curve; the depth at the curve's peak is the best choice.
## plot the test score for max_depth from 1 to 10
import matplotlib.pyplot as plt
test=[]
for i in range(10):
    clf=tree.DecisionTreeClassifier(max_depth=i+1
                                   ,criterion="entropy"
                                   ,random_state=30
                                   ,splitter='random'
                                   )
    clf=clf.fit(Xtrain,Ytrain)
    score=clf.score(Xtest,Ytest)
    test.append(score)
plt.plot(range(1,11),test,color="red",label='max_depth')
plt.legend()
plt.show()
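Rather than reading the peak off the plot, the best depth can be pulled from the score list directly (a small sketch):
best_depth = test.index(max(test)) + 1   ## depths start at 1, list indices at 0
best_depth, max(test)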
Two other common interfaces of the sklearn decision tree
## apply returns the index of the leaf node that each test sample falls into
clf.apply(Xtest)
array([10, 8, 5, 26, 5, 5, 26, 16, 26, 16, 27, 27, 10, 8, 26, 16, 8,
26, 3, 8, 26, 8, 8, 16, 8, 8, 19, 16, 16, 16, 26, 16, 6, 3,
3, 18, 27, 16, 10, 12, 5, 26, 8, 26, 8, 3, 5, 8, 26, 26, 23,
5, 16, 16], dtype=int64)
## predict returns the predicted class for each test sample
clf.predict(Xtest)
array([2, 2, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 2, 2, 0, 1, 2, 0, 2, 2, 0, 2,
2, 1, 2, 2, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 1, 1, 2, 1, 1, 0, 2, 0,
2, 2, 1, 2, 0, 0, 0, 1, 1, 1])
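As a sanity check, comparing predict's output with the true labels reproduces the accuracy that clf.score reports (a minimal sketch):
import numpy as np
(clf.predict(Xtest) == Ytest).mean()   ## same value as clf.score(Xtest, Ytest)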
This concludes the walkthrough of decision-tree classification with sklearn.