# Data mining: building a decision tree with the ID3 algorithm.
# (The source page repeated a collapsed one-line copy of the code here;
# it is redundant with the program below.)
from math import log
import operator
def calShang(dataSet):
    """Return the Shannon entropy of the class labels in dataSet.

    Each record is a sequence whose last element is the class label.
    Entropy is computed as -sum(p * log2(p)) over the label frequencies.
    """
    total = len(dataSet)
    # Tally how many times each class label appears.
    tally = {}
    for record in dataSet:
        label = record[-1]
        tally[label] = tally.get(label, 0) + 1
    # Accumulate -p*log2(p) for every label's relative frequency.
    entropy = 0.0
    for count in tally.values():
        p = count / float(total)
        entropy -= p * log(p, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Return the records whose feature at index `axis` equals `value`.

    The matched feature column is removed from each returned record,
    so the result has one fewer column than the input.
    """
    return [record[:axis] + record[axis + 1:]
            for record in dataSet
            if record[axis] == value]
def chooseBestFeatureTosplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Gain is the base entropy of dataSet minus the weighted entropy of the
    partitions induced by each feature; ties keep the lowest index.
    """
    featureCount = len(dataSet[0]) - 1  # last column is the class label
    baseEntropy = calShang(dataSet)
    bestGain, bestFeature = 0.0, 0
    for idx in range(featureCount):
        # Distinct values this feature takes across the dataset.
        values = {record[idx] for record in dataSet}
        # Weighted entropy of the partition induced by this feature.
        splitEntropy = 0.0
        for v in values:
            subset = splitDataSet(dataSet, idx, v)
            weight = len(subset) / float(len(dataSet))
            splitEntropy += weight * calShang(subset)
        gain = baseEntropy - splitEntropy
        if gain > bestGain:
            bestGain, bestFeature = gain, idx
    return bestFeature
def majorityCnt(classList):
    """Return the most frequent label in classList.

    Ties are resolved in favor of the label that appears first, matching
    a stable descending sort of the counts.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1
    # max() returns the first key (insertion order) with the top count.
    return max(tally, key=tally.get)
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree from dataSet.

    dataSet: list of records; each record's last element is the class label.
    labels:  feature names aligned with the record columns.

    Returns either a bare class label (leaf) or a nested dict of the form
    {featureName: {featureValue: subtree, ...}}.

    Fix: the original did `del(labels[bestFeat])` on the caller's list,
    destroying it as a side effect; we now work on a private copy so the
    caller's `labels` survives intact.
    """
    classList = [record[-1] for record in dataSet]
    # All records share one class: return it as a leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column remains: fall back to majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    labels = labels[:]  # copy so the caller's list is never mutated
    bestFeat = chooseBestFeatureTosplit(dataSet)  # index of best feature
    bestFeatLabel = labels[bestFeat]              # its human-readable name
    mytree = {bestFeatLabel: {}}
    del labels[bestFeat]  # remaining names align with the split records
    featValues = [record[bestFeat] for record in dataSet]
    for value in set(featValues):
        mytree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), labels[:])
    return mytree
def createDataSet():
    """Return the classic 14-row 'play tennis' dataset and feature names.

    Each row is [outlook, temperature, humidity, windy, play?]; the last
    column is the class label. Feature names are in Chinese, matching the
    original article.
    """
    rows = [
        ["sunny",    "hot",  "high",   "false", "no"],
        ["sunny",    "hot",  "high",   "true",  "no"],
        ["overcast", "hot",  "high",   "false", "yes"],
        ["rainy",    "mild", "high",   "false", "yes"],
        ["rainy",    "cool", "normal", "false", "yes"],
        ["rainy",    "cool", "normal", "true",  "no"],
        ["overcast", "cool", "normal", "true",  "yes"],
        ["sunny",    "mild", "high",   "false", "no"],
        ["sunny",    "cool", "normal", "false", "yes"],
        ["rainy",    "mild", "normal", "false", "yes"],
        ["sunny",    "mild", "normal", "true",  "yes"],
        ["overcast", "mild", "high",   "true",  "yes"],
        ["overcast", "hot",  "normal", "false", "yes"],
        ["rainy",    "mild", "high",   "true",  "no"],
    ]
    featureNames = ['天气', '温度', '湿度', '风', '是否出去玩']
    return rows, featureNames
# Fix: guard the demo so importing this module no longer builds and
# prints the tree as a side effect; behavior when run as a script is
# unchanged.
if __name__ == "__main__":
    myData, labels = createDataSet()
    print(createTree(myData, labels))
# (Blog page footer residue: "More recommendations", "All comments (0)".)