I have written my own decision tree implementation in Python, using the following information gain formulas:
Entropy of the class
P = Target Class Positive(True) Count
N = Target Class Negative(False) Count
Entropy(P, N) = -(P/(P+N)) * log2(P/(P+N)) - (N/(P+N)) * log2(N/(P+N))
Information gain for each value of an attribute, where Pi and Ni are the positive and negative counts among the rows that take the attribute's i-th value
IG(Pi, Ni) = -(Pi/(Pi+Ni)) * log2(Pi/(Pi+Ni)) - (Ni/(Pi+Ni)) * log2(Ni/(Pi+Ni))
Entropy of the attribute
Entropy(attribute) = Σ over i of ((Pi + Ni) / (P + N)) * IG(Pi, Ni)
Gain
Gain(attribute) = Entropy of class - Entropy of attribute
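For reference, here is a minimal, self-contained sketch of how I understand these formulas in pandas/numpy (entropy and info_gain are just placeholder names I made up for this sketch, and I assume the target class is the last column of the DataFrame):

from __future__ import division
import numpy as np
import pandas as pd

def entropy(counts):
    # counts: class counts such as [P, N]
    probs = np.array(counts, dtype=float) / np.sum(counts)
    probs = probs[probs > 0]  # skip zero counts so log2(0) never happens
    return -np.sum(probs * np.log2(probs))

def info_gain(df, attribute):
    target = df.columns[-1]  # assumption: target class is the last column
    class_entropy = entropy(df[target].value_counts().values)
    weighted = 0.0
    for value, subset in df.groupby(attribute):
        weight = len(subset) / len(df)  # (Pi + Ni) / (P + N)
        weighted += weight * entropy(subset[target].value_counts().values)
    return class_entropy - weighted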
It works for this small dataset (profit.csv); a worked gain calculation follows the table:
age competition type profit
old yes sw down
old no sw down
old no hw down
mid yes sw down
mid yes hw down
mid no hw up
mid no sw up
new yes sw up
new no hw up
new no sw up
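As a sanity check of the formulas, the gain for the age column on profit.csv works out by hand (counting 5 down and 5 up rows) to:

Entropy(class) = -(5/10)log2(5/10) - (5/10)log2(5/10) = 1.0
IG(old) = 0 (3 down, 0 up), IG(mid) = 1.0 (2 down, 2 up), IG(new) = 0 (0 down, 3 up)
Entropy(age) = (3/10)*0 + (4/10)*1.0 + (3/10)*0 = 0.4
Gain(age) = 1.0 - 0.4 = 0.6

so age becomes the root of the tree, which my code does produce.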
But it does not work for every dataset. For example, it fails on the following dataset (play.csv); the tree I expect for it is sketched after the table:
outlook temp humidity windy play
sunny hot high false no
sunny hot high true no
overcast hot high false yes
rainy mild high false yes
rainy cool normal false yes
rainy cool normal true no
overcast cool normal true yes
sunny mild high false no
sunny cool normal false yes
rainy mild normal false yes
sunny mild normal true yes
overcast mild high true yes
overcast hot normal false yes
rainy mild high true no
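If my understanding of ID3 is right, outlook has the highest gain here (about 0.247), so the tree for play.csv should look roughly like this:

outlook = sunny    -> split again on humidity (high -> no, normal -> yes)
outlook = overcast -> yes
outlook = rainy    -> split again on windy (false -> yes, true -> no)

but my code only handles one hard-coded sub-tree step, so it cannot produce this.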
My Python code is as follows:
from __future__ import division
import pandas as pd
import numpy as np
from graphviz import Digraph

# module-level placeholders (the functions below actually use their own local variables)
target_cls_pn_count = []
target_cls_pn_count = np.array(target_cls_pn_count)
target_cls_pn_count_sum = float()
cls_entropy_value = float()

def sum_target_cls_pn_count(df):
    # total number of rows, i.e. P + N, taken from the counts of the target (last) column
    target_cls_pn_count_sum = 0.0
    Count_Col = df.shape[1]
    target_cls_pn_count_vals, target_cls_pn_count = np.unique(df.iloc[:, [Count_Col - 1]], return_counts=True)
    for index in np.nditer(target_cls_pn_count):
        target_cls_pn_count_sum += index
    return target_cls_pn_count_sum

def entropy_cls(df):
    # entropy of the target class
    cls_entropy_value = 0.0
    Count_Col = df.shape[1]
    target_cls_pn_count_vals, target_cls_pn_count = np.unique(df.iloc[:, [Count_Col - 1]], return_counts=True)
    target_cls_pn_count_sum = sum_target_cls_pn_count(df)
    flag = -1
    for target_cls_pn_count_index, target_cls_pn_vals in enumerate(target_cls_pn_count):
        if flag < target_cls_pn_count_index:
            cls_entropy_value = cls_entropy_value + (-(target_cls_pn_vals / target_cls_pn_count_sum) * np.log2(target_cls_pn_vals / target_cls_pn_count_sum))
            flag = target_cls_pn_count_index
    return cls_entropy_value

def ig_atr(df_each_col_count):
    # IG(Pi, Ni): entropy of the subset of rows that take one attribute value
    cls_entropy_value = 0.0
    target_cls_pn_count_sum = 0
    flag = -1
    for i in np.nditer(df_each_col_count):
        target_cls_pn_count_sum += i
    for target_cls_pn_count_index, target_cls_pn_vals in enumerate(df_each_col_count):
        if flag < target_cls_pn_count_index:
            cls_entropy_value = cls_entropy_value + (-(target_cls_pn_vals / target_cls_pn_count_sum) * np.log2(target_cls_pn_vals / target_cls_pn_count_sum))
            flag = target_cls_pn_count_index
    return cls_entropy_value

def gain(df, df_col_names_index1, df_first_col_val):
    # gain of one attribute = entropy of class - weighted entropy of the attribute
    df_col_names = df.columns.tolist()
    Count_Col = df.shape[1]
    sum_etpy_atr = 0
    sum_etpy_cls = sum_target_cls_pn_count(df)
    cls_entropy_value = entropy_cls(df)
    for index2, m in enumerate(df_first_col_val):
        df_each_col = pd.DataFrame(df.loc[(df[df_col_names[df_col_names_index1]] == df_first_col_val[index2]), [df_col_names[Count_Col - 1]]])
        df_each_col_vals, df_each_col_count = np.unique(df_each_col, return_counts=True)
        sum_atr = 0
        for i in np.nditer(df_each_col_count):
            sum_atr += i
        sum_etpy_atr += ((sum_atr / sum_etpy_cls) * ig_atr(df_each_col_count))
    gain_s = str(df_col_names[df_col_names_index1]) + ":" + str(cls_entropy_value - sum_etpy_atr) + " "
    return gain_s

def gain_string(df):
    # builds the string "col1:gain1 col2:gain2 ..." for every attribute column
    gs_str = ""
    df_col_names = df.columns.tolist()
    df_col_names_count = df.shape[1]
    for df_col_names_index, c in enumerate(df_col_names):
        if df_col_names_index == df_col_names_count - 1:
            df_col_names_index = df_col_names_index - 1
            # raise SystemExit()
        df_first_col_val = np.unique(df[df_col_names[df_col_names_index]])
        gs_str += gain(df, df_col_names_index, df_first_col_val)
    gain_str = gs_str
    ls_gainstr = gain_str.split(" ")
    return ls_gainstr[0:len(ls_gainstr) - 2]

def max_gain_col(df):
    # name of the attribute with the maximum information gain
    g_gainstr = pd.Series(gain_string(df))
    unsorted_df = pd.DataFrame(g_gainstr.str.split(':', expand=True))
    sorted_df = unsorted_df.sort_values(1, ascending=False)
    max_gain_col, max_gain_val = sorted_df.loc[sorted_df[1].idxmax()]  # row with the max gain
    return max_gain_col

def tree_graphviz(df):
    # picks the max-gain attribute as root; the impure-branch check below filters on the first column (df_col_names[0])
    l1_tree_items_next = ""
    df_col_names = df.columns.tolist()
    df_col_names_count = df.shape[1]
    gain_max_col = max_gain_col(df)
    l1_tree_items = np.unique(df[[gain_max_col]].values)
    for l1_node_index, l1_node in enumerate(l1_tree_items):
        l1_tree_leaf_items = pd.DataFrame(df.loc[(df[df_col_names[0]] == l1_tree_items[l1_node_index]), [df_col_names[df_col_names_count - 1]]])
        leaf_vals, leaf_counts = np.unique(l1_tree_leaf_items, return_counts=True)
        for l1_leaf_index, l1_leaf_node in enumerate(leaf_vals):
            if len(leaf_vals) > 1:
                l1_tree_items_next = l1_tree_items[l1_node_index]
            else:
                print("")
    return gain_max_col, l1_tree_items, l1_tree_items_next

def DT_pre_final(g, df, root, l1, sub_root):
    # draws root -> branch edges; leaf edges are found by filtering on the first column (df_col_names[0])
    df_col_names = df.columns.tolist()
    df_col_names_count = df.shape[1]
    for l1_node_index, l1_node in enumerate(l1):
        g.edge(root, l1[l1_node_index])
        l1_tree_leaf_items = pd.DataFrame(df.loc[(df[df_col_names[0]] == l1[l1_node_index]), [df_col_names[df_col_names_count - 1]]])
        leaf_vals, leaf_counts = np.unique(l1_tree_leaf_items, return_counts=True)
        for l1_leaf_index, l1_leaf_node in enumerate(leaf_vals):
            if len(leaf_vals) > 1:
                print("")
            else:
                g.edge(l1[l1_node_index], leaf_vals[l1_leaf_index])

def DT_final():
    g = Digraph('G', filename='hello.gv')
    data = pd.read_csv('profit.csv')
    # data = pd.read_csv('play.csv')
    df = pd.DataFrame(data)
    l1 = []
    root, l1, sub_root = tree_graphviz(df)
    DT_pre_final(g, df, root, l1, sub_root)
    # --------- code for sub tree -----------
    data = df.loc[df[root] == sub_root]
    df = pd.DataFrame(data)
    df.drop(df.columns[[0]], axis=1, inplace=True)
    link = ""
    link = sub_root
    root, l1, sub_root = tree_graphviz(df)
    g.edge(link, root)
    # --------- code for sub tree -----------
    DT_pre_final(g, df, root, l1, sub_root)
    g.view()

DT_final()
I know the sub-tree code in the DT_final() function needs to be generalized, but I don't know how to do it. Please guide me in solving this so that my code works for any dataset of this kind (like my two example datasets). Thanks in advance for your valuable support.
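To make clearer what I mean by generalizing the sub-tree part, this is the kind of recursive structure I have in mind, written as a rough, self-contained sketch (build_tree and best_attribute are placeholder names I made up; it returns a nested dict instead of drawing with graphviz, and it assumes the target class is the last column):

from __future__ import division
import numpy as np
import pandas as pd

def entropy(counts):
    probs = np.array(counts, dtype=float) / np.sum(counts)
    probs = probs[probs > 0]
    return -np.sum(probs * np.log2(probs))

def best_attribute(df):
    # attribute with the maximum information gain; the target class is the last column
    target = df.columns[-1]
    base = entropy(df[target].value_counts().values)
    gains = {}
    for col in df.columns[:-1]:
        weighted = 0.0
        for value, subset in df.groupby(col):
            weighted += (len(subset) / len(df)) * entropy(subset[target].value_counts().values)
        gains[col] = base - weighted
    return max(gains, key=gains.get)

def build_tree(df):
    target = df.columns[-1]
    classes = df[target].unique()
    if len(classes) == 1:        # pure branch -> leaf with that class
        return classes[0]
    if df.shape[1] == 1:         # no attributes left -> majority class
        return df[target].mode()[0]
    root = best_attribute(df)
    tree = {root: {}}
    for value, subset in df.groupby(root):
        # recurse on each branch, dropping only the attribute that was just used
        tree[root][value] = build_tree(subset.drop(root, axis=1))
    return tree

# print(build_tree(pd.read_csv('profit.csv')))
# print(build_tree(pd.read_csv('play.csv')))

Is this the right direction, and how would I connect something like this to my existing graphviz drawing code?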