My own decision tree Python code does not work for all datasets .. need guidance

Time: 2018-06-13 08:11:25

Tags: python graphviz

I have written my own Python implementation of a decision tree, using the following formulas for information gain.

Entropy of the class

        P = Target Class Positive(True) Count
        N = Target Class Negative(False) Count

= -(P/(P+N)) * log2(P/(P+N)) - (N/(P+N)) * log2(N/(P+N))

Information gain IG(Pi, Ni) for each attribute value, where Pi and Ni are the positive and negative counts within that value

IG(Pi, Ni) = -(Pi/(Pi+Ni)) * log2(Pi/(Pi+Ni)) - (Ni/(Pi+Ni)) * log2(Ni/(Pi+Ni))

Entropy of the attribute

= ∑ ((Pi + Ni)/(P + N)) * IG(Pi, Ni)   (summed over the values of the attribute)

Gain

= Entropy of Class - Entropy of Attribute
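
For reference, the same formulas can be written compactly with pandas/numpy. This is only a small sketch for checking numbers by hand; the helper names class_entropy, attribute_entropy and info_gain are mine and are separate from the code further down.

from __future__ import division
import numpy as np
import pandas as pd

def class_entropy(labels):
    # -(P/(P+N))*log2(P/(P+N)) - (N/(P+N))*log2(N/(P+N)), for any number of classes
    p = labels.value_counts() / len(labels)
    return float(-(p * np.log2(p)).sum())

def attribute_entropy(df, attribute, target):
    # sum over the attribute's values of ((Pi+Ni)/(P+N)) * IG(Pi, Ni)
    return sum((len(sub) / len(df)) * class_entropy(sub[target])
               for _, sub in df.groupby(attribute))

def info_gain(df, attribute, target):
    # Gain = entropy of class - entropy of attribute
    return class_entropy(df[target]) - attribute_entropy(df, attribute, target)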

It works for a small dataset (profit.csv); a quick hand check follows the table:

age competition type    profit
old yes         sw      down
old no          sw      down
old no          hw      down
mid yes         sw      down
mid yes         hw      down
mid no          hw      up
mid no          sw      up
new yes         sw      up
new no          hw      up
new no          sw      up
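
As a quick hand check with the sketch above: the target has 5 'up' and 5 'down' rows, so the class entropy is 1.0; the age branches old (3 down), mid (2 up, 2 down) and new (3 up) give an attribute entropy of 0.3*0 + 0.4*1 + 0.3*0 = 0.4, so the gain of age is 0.6, the largest of the three attributes, which is why age becomes the root.

df = pd.read_csv('profit.csv')
print(class_entropy(df['profit']))     # expected 1.0
print(info_gain(df, 'age', 'profit'))  # expected about 0.6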

But it does not work for all datasets. For example, it fails on the following dataset (play.csv); a hand check of the gains follows the table:

outlook    temp humidity    windy   play
sunny      hot  high        false   no
sunny      hot  high        true    no
overcast   hot  high        false   yes
rainy      mild high        false   yes
rainy      cool normal      false   yes
rainy      cool normal      true    no
overcast   cool normal      true    yes
sunny      mild high        false   no
sunny      cool normal      false   yes
rainy      mild normal      false   yes
sunny      mild normal      true    yes
overcast   mild high        true    yes
overcast   hot  normal      false   yes
rainy      mild high        true    no
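
The same check on play.csv (again with the sketch above) gives a class entropy of about 0.94 for 9 yes / 5 no rows, and outlook should come out with the highest gain, so a correct tree for this dataset would start from outlook:

df = pd.read_csv('play.csv')
for col in ['outlook', 'temp', 'humidity', 'windy']:
    print(col, info_gain(df, col, 'play'))
# expected roughly: outlook 0.25, humidity 0.15, windy 0.05, temp 0.03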

My Python code is as follows:

from __future__ import division
import pandas as pd
import numpy as np
from graphviz import Digraph

target_cls_pn_count = []
target_cls_pn_count = np.array(target_cls_pn_count)
target_cls_pn_count_sum = float()
cls_entropy_value = float()


def sum_target_cls_pn_count(df):
    # Total number of records (P + N), taken from the counts of the target-class values
    target_cls_pn_count_sum = 0.0
    Count_Col = df.shape[1]
    target_cls_pn_count_vals, target_cls_pn_count = np.unique(
        df.iloc[:, [Count_Col - 1]], return_counts=True)

    for index in np.nditer(target_cls_pn_count):
        target_cls_pn_count_sum += index
    return target_cls_pn_count_sum


def entropy_cls(df):
    # Entropy of the target class over the whole DataFrame
    cls_entropy_value = 0.0
    Count_Col = df.shape[1]
    target_cls_pn_count_vals, target_cls_pn_count = np.unique(df.iloc[:, [Count_Col - 1]], return_counts=True)
    target_cls_pn_count_sum = sum_target_cls_pn_count(df)
    flag = -1  # index of the last count already added
    for target_cls_pn_count_index, target_cls_pn_vals in enumerate(target_cls_pn_count):
        if (flag < target_cls_pn_count_index):
            cls_entropy_value = cls_entropy_value + (-(target_cls_pn_vals / target_cls_pn_count_sum) * np.log2(target_cls_pn_vals / target_cls_pn_count_sum))
            flag = target_cls_pn_count_index
    return cls_entropy_value

def ig_atr(df_each_col_count):
    # Entropy IG(Pi, Ni) of a single attribute value, computed from its class counts
    cls_entropy_value = 0.0
    target_cls_pn_count_sum = 0
    flag = -1  # index of the last count already added
    for i in np.nditer(df_each_col_count):
        target_cls_pn_count_sum += i
    for target_cls_pn_count_index, target_cls_pn_vals in enumerate(df_each_col_count):
        if (flag < target_cls_pn_count_index):
            cls_entropy_value = cls_entropy_value + (-(target_cls_pn_vals / target_cls_pn_count_sum) * np.log2(target_cls_pn_vals / target_cls_pn_count_sum))
            flag = target_cls_pn_count_index
    return cls_entropy_value


def gain(df, df_col_names_index1, df_first_col_val):
    # Gain of one attribute column: class entropy minus the weighted attribute entropy
    df_col_names = df.columns.tolist()
    Count_Col = df.shape[1]
    sum_etpy_atr = 0
    sum_etpy_cls = sum_target_cls_pn_count(df)
    cls_entropy_value = entropy_cls(df)

    for index2, m in enumerate(df_first_col_val):
        df_each_col = pd.DataFrame(df.loc[(df[df_col_names[df_col_names_index1]] == df_first_col_val[index2]), [df_col_names[Count_Col - 1]]])
        df_each_col_vals, df_each_col_count = np.unique(df_each_col, return_counts=True)

        sum_atr = 0
        for i in np.nditer(df_each_col_count):
            sum_atr += i

        sum_etpy_atr += ((sum_atr / sum_etpy_cls) * ig_atr(df_each_col_count))

    gain_s = str(df_col_names[df_col_names_index1]) + ":" + str(cls_entropy_value - sum_etpy_atr) + " "

    return gain_s


def gain_string(df):
    # Build "column:gain" entries for every attribute column and return them as a list
    gs_str = ""
    df_col_names = df.columns.tolist()
    df_col_names_count = df.shape[1]
    for df_col_names_index, c in enumerate(df_col_names):
        if (df_col_names_index == df_col_names_count - 1):
            df_col_names_index = df_col_names_index - 1
        # raise SystemExit()

        df_first_col_val = np.unique(df[df_col_names[df_col_names_index]])
        gs_str += gain(df, df_col_names_index, df_first_col_val)
        gain_str = gs_str
        ls_gainstr = gain_str.split(" ")
    return ls_gainstr[0:len(ls_gainstr) - 2]


def max_gain_col(df):
    # Return the attribute column with the maximum gain
    g_gainstr = pd.Series(gain_string(df))
    unsorted_df = pd.DataFrame(g_gainstr.str.split(':', expand=True))
    sorted_df = unsorted_df.sort_values(1, ascending=False)
    max_gain_col, max_gain_val = sorted_df.loc[sorted_df[1].idxmax()]  # Max gain values
    return max_gain_col


def tree_graphviz(df):
    # Choose the attribute with maximum gain as the node and collect its values for one level
    l1_tree_items_next = ""

    df_col_names = df.columns.tolist()
    df_col_names_count = df.shape[1]

    gain_max_col = max_gain_col(df)

    l1_tree_items = np.unique(df[[gain_max_col]].values)

    for l1_node_index, l1_node in enumerate(l1_tree_items):
        l1_tree_leaf_items = pd.DataFrame(df.loc[(df[df_col_names[0]] == l1_tree_items[l1_node_index]), [df_col_names[df_col_names_count - 1]]])
        leaf_vals, leaf_counts = np.unique(l1_tree_leaf_items, return_counts=True)
        for l1_leaf_index, l1_leaf_node in enumerate(leaf_vals):
            if (len(leaf_vals) > 1):
                l1_tree_items_next = l1_tree_items[l1_node_index]
            else:
                print("")
    return gain_max_col, l1_tree_items, l1_tree_items_next


def DT_pre_final(g, df, root, l1, sub_root):
    # Draw edges from the chosen node to each of its values, and from pure values to their leaf label
    df_col_names = df.columns.tolist()
    df_col_names_count = df.shape[1]

    for l1_node_index, l1_node in enumerate(l1):
        g.edge(root, l1[l1_node_index])
        l1_tree_leaf_items = pd.DataFrame(df.loc[(df[df_col_names[0]] == l1[l1_node_index]), [df_col_names[df_col_names_count - 1]]])
        leaf_vals, leaf_counts = np.unique(l1_tree_leaf_items, return_counts=True)
        for l1_leaf_index, l1_leaf_node in enumerate(leaf_vals):
            if len(leaf_vals) > 1:
                print("")
            else:
                g.edge(l1[l1_node_index], leaf_vals[l1_leaf_index])

def DT_final():
    # Build the first level of the tree, then handle one sub-tree for the branch that is not pure
    g = Digraph('G', filename='hello.gv')
    data = pd.read_csv('profit.csv')
    # data = pd.read_csv('play.csv')

    df = pd.DataFrame(data)
    l1 = []
    root, l1, sub_root = tree_graphviz(df)

    DT_pre_final(g, df, root, l1, sub_root)


    # --------- code for sub tree -----------
    data = df.loc[df[root] == sub_root]
    df = pd.DataFrame(data)
    df.drop(df.columns[[0]], axis=1, inplace=True)

    link = ""
    link = sub_root
    root, l1, sub_root = tree_graphviz(df)
    g.edge(link, root)
#--------- code for sub tree -----------
    DT_pre_final(g, df, root, l1, sub_root)

    g.view()

DT_final()

I know the sub-tree code in the DT_final() function needs to be generalized, but I do not know how to do it. Please guide me on this so that my code works for all datasets (of the same kind as my two example datasets). Thanks in advance for your valuable support.
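
From what I have read, the generalization probably needs a recursive structure, roughly like the sketch below. This is only an outline for discussion, not working code of mine: the names entropy, best_attribute and build_tree are placeholders, and it returns a nested dict instead of drawing edges with graphviz.

from __future__ import division
import numpy as np
import pandas as pd

def entropy(labels):
    # Shannon entropy of a label column
    p = labels.value_counts() / len(labels)
    return float(-(p * np.log2(p)).sum())

def best_attribute(df, target):
    # Attribute with the highest gain = class entropy - weighted attribute entropy
    base = entropy(df[target])
    gains = {}
    for col in df.columns.drop(target):
        weighted = sum((len(sub) / len(df)) * entropy(sub[target])
                       for _, sub in df.groupby(col))
        gains[col] = base - weighted
    return max(gains, key=gains.get)

def build_tree(df, target):
    labels = df[target].unique()
    if len(labels) == 1:          # pure subset -> leaf
        return labels[0]
    if df.shape[1] == 1:          # no attributes left -> majority leaf
        return df[target].mode()[0]
    col = best_attribute(df, target)
    node = {col: {}}
    for val, sub in df.groupby(col):
        # recurse on the subset, dropping the attribute that was just used
        node[col][val] = build_tree(sub.drop(columns=col), target)
    return node

# e.g. build_tree(pd.read_csv('play.csv'), 'play')

If this shape is right, the graphviz part would only need to walk the returned dict and add one edge per key and value.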

0 Answers:

No answers yet