Classifying and visualizing the Iris dataset with Random Forest, Naive Bayes, and Support Vector Machines
1. Import the required libraries (setup)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK (Chinese) labels correctly
plt.rcParams['axes.unicode_minus'] = False    # render the minus sign correctly
# Set the default font size; sns.set() resets seaborn's settings,
# so the font and scale are set in a single call
plt.rcParams['font.size'] = 16
sns.set(font_scale=2, font='SimHei')
# Display up to 60 columns of a DataFrame
pd.set_option('display.max_columns', 60)
from sklearn.datasets import load_iris
2. Load the dataset
data = load_iris()
species = data.target
iris_data = pd.DataFrame(data=data.data, columns=data.feature_names)  # convert to a pandas DataFrame
iris_data['species'] = species
iris_data
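Before plotting, a quick sanity check of the table is useful; a minimal sketch (not part of the original code):
print(iris_data.describe())                 # summary statistics of the four features
print(iris_data['species'].value_counts())  # the three classes are balanced, 50 samples each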
3. Visualization
This part visualizes how the four flower features are distributed, mainly by pairing kernel density plots with scatter plots, pairing violin plots with swarm plots, and adding a 3D scatter plot.
sns.set(font_scale=1)  # shrink fonts for the pair grid
g = sns.PairGrid(iris_data, hue="species")
g = g.map_upper(sns.scatterplot)         # bivariate scatter plots on the upper triangle
g = g.map_lower(sns.kdeplot, color='r')  # bivariate KDE plots on the lower triangle
g = g.map_diag(sns.kdeplot)              # univariate KDE plots on the diagonal
ax = sns.violinplot(x="species", y=iris_data.columns[0], data=iris_data, inner=None)  # (dropped whis=np.inf, a boxplot-only argument)
ax = sns.swarmplot(x="species", y=iris_data.columns[0], data=iris_data, color="c")
plt.figure()  # start a new figure so the second feature is not drawn over the first
ax = sns.violinplot(x="species", y=iris_data.columns[1], data=iris_data, inner=None)
ax = sns.swarmplot(x="species", y=iris_data.columns[1], data=iris_data, color="c")
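The same violin-plus-swarm overlay can be drawn for all four features at once; a sketch, assuming the iris_data frame built above:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
for ax, col in zip(axes.ravel(), iris_data.columns[:-1]):  # the four feature columns
    sns.violinplot(x='species', y=col, data=iris_data, inner=None, ax=ax)
    sns.swarmplot(x='species', y=col, data=iris_data, color='c', ax=ax)
plt.tight_layout()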
p0 = iris_data[iris_data['species'] == 0]
p1 = iris_data[iris_data['species'] == 1]
p2 = iris_data[iris_data['species'] == 2]
p0_x = p0.iloc[:,0]
p0_y = p0.iloc[:,1]
p0_z = p0.iloc[:,2]
p1_x = p1.iloc[:,0]
p1_y = p1.iloc[:,1]
p1_z = p1.iloc[:,2]
p2_x = p2.iloc[:,0]
p2_y = p2.iloc[:,1]
p2_z = p2.iloc[:,2]
sns.set(font='SimHei')  # keep the CJK-capable font for seaborn
fig = plt.figure(figsize=(16,10))
ax = plt.axes(projection = '3d')
plt.style.use('ggplot')
# ax.w_xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
plt.rcParams['font.size'] = 8
ax.scatter3D(p0_x,p0_y,p0_z,alpha=0.7,label = '0',marker='*',c='y')
ax.scatter3D(p1_x,p1_y,p1_z,alpha=0.7,label = '1',marker='p')
ax.scatter3D(p2_x,p2_y,p2_z,alpha=0.7,label = '2',marker='^')
# Label the axes to match the plotted columns (x: column 0, y: column 1, z: column 2)
ax.set_xlabel('sepal length (cm)', fontdict={'size': 15})
ax.set_ylabel('sepal width (cm)', fontdict={'size': 15})
ax.set_zlabel('petal length (cm)', fontdict={'size': 15})
plt.legend()
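If the default camera angle hides the class separation, the view can be rotated before rendering; a minimal sketch (the angles here are arbitrary examples):
ax.view_init(elev=20, azim=135)  # example viewing angles; adjust to taste
plt.show()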
4. Classification
- Imports
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import LeaveOneOut
# from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score
- Extract the features X and labels y
X = iris_data.iloc[:, :-1]
Y = iris_data.iloc[:, -1]
# X = StandardScaler().fit_transform(X)  # optional scaling (returns an ndarray)
X = pd.DataFrame(X)  # keep X as a DataFrame even if the scaler above is enabled
- Training with leave-one-out cross-validation
def leave_one_out(algr, X, y):
    loo = LeaveOneOut()
    count = 0
    preds = []  # collect one prediction per held-out sample
    for train, test in loo.split(X):
        X_train, X_test = X.iloc[train, :], X.iloc[test, :]
        y_train, y_test = y.iloc[train], y.iloc[test]
        model = algr.fit(X_train, y_train)
        predicted_y = model.predict(X_test)
        preds.append(predicted_y[0])
        # print('Actual:', np.array(y_test), 'Predicted:', predicted_y)
        if np.array(y_test) == predicted_y:
            count = count + 1
    print('Leave-one-out accuracy: {}'.format(count / X.shape[0]))
    return algr, preds
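The helper above re-implements leave-one-out by hand; for comparison, scikit-learn computes the same accuracy in a single call. A minimal sketch (equivalent, not part of the original code):
from sklearn.model_selection import cross_val_score

# Mean of 150 single-sample scores equals the leave-one-out accuracy
scores = cross_val_score(GaussianNB(), X, Y, cv=LeaveOneOut())
print('LOO accuracy:', scores.mean())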
- Running the classifiers
clf_gs = GaussianNB()
print("GaussianNB:")
clf_gs,l_gs = leave_one_out(clf_gs, X, Y)
clf_svm = SVC(kernel='linear', C=1E10)
print("SVM:")
clf_svm,l_svm = leave_one_out(clf_svm, X, Y)
clf_RF = RandomForestClassifier()
print("RandomForestClassifier:")
clf_RF,l_RF = leave_one_out(clf_RF, X, Y)
GaussianNB:
Leave-one-out accuracy: 0.9533333333333334
SVM:
Leave-one-out accuracy: 0.96
RandomForestClassifier:
Leave-one-out accuracy: 0.9533333333333334
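accuracy_score was imported above but never used; it should reproduce the printed numbers from the collected predictions, e.g.:
# Cross-check the GaussianNB result from its collected LOO predictions
print(accuracy_score(Y, l_gs))  # should match the printed GaussianNB accuracy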
- Confusion matrix
classes = ['0', '1', '2']
def plot_confusion_matrix(cm, title='Confusion Matrix'):
    plt.figure(figsize=(12, 8), dpi=100)
    np.set_printoptions(precision=2)
    ind_array = np.arange(len(classes))
    x, y = np.meshgrid(ind_array, ind_array)
    for x_val, y_val in zip(x.flatten(), y.flatten()):
        c = cm[y_val][x_val]
        if c > 0.001:
            plt.text(x_val, y_val, "%0.2f" % (c,), color='red', fontsize=15, va='center', ha='center')
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.binary)
    plt.title(title)
    plt.colorbar()
    xlocations = np.array(range(len(classes)))
    plt.xticks(xlocations, classes, rotation=90)
    plt.yticks(xlocations, classes)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # Offset the minor ticks so the grid lines fall between cells
    tick_marks = np.array(range(len(classes))) + 0.5
    plt.gca().set_xticks(tick_marks, minor=True)
    plt.gca().set_yticks(tick_marks, minor=True)
    plt.gca().xaxis.set_ticks_position('none')
    plt.gca().yaxis.set_ticks_position('none')
    plt.grid(True, which='minor', linestyle='-')
    plt.gcf().subplots_adjust(bottom=0.15)
    # show confusion matrix
    plt.show()
cm = confusion_matrix(Y,l_gs)
plot_confusion_matrix(cm,title='confusion matrix (GaussianNB)')
cm = confusion_matrix(Y,l_svm)
plot_confusion_matrix(cm,title='confusion matrix (SVM)')
cm = confusion_matrix(Y,l_RF)
plot_confusion_matrix(cm,title='confusion matrix (RandomForestClassifier)')
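For reference, scikit-learn (0.22 and later) ships a built-in confusion-matrix plotter that covers most of the custom function above; a minimal sketch:
from sklearn.metrics import ConfusionMatrixDisplay

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
disp.plot(cmap=plt.cm.binary)
plt.show()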