通过一组函数创建一类对象

时间:2018-11-27 13:19:32

标签: python-3.x pandas oop dataframe pandas-groupby

我是OOP的新手,并且想开始养成编写模块化,可重用代码的习惯。很抱歉,代码和文本的墙,只是试图提供尽可能多的上下文并且尽可能清晰。

我已经定义了一系列功能,所有这些功能都作用于pandas DataFrame,并且我希望从这一系列功能中创建一个类。我认为创建一个类是一个好主意,因为我将在以后非常标准的DataFrames上使用此代码。

我的问题是双重的: 1.我应该使用面向对象的概念和设计来解决这个问题吗 2.到底是什么导致我的织补代码损坏?!

提前感谢大家!

以下是我定义的功能:

def __describe(df, col):
    df_desc = df[col].describe()
    return df_desc

def __sort_by_character_study_day(df):        
    new_index_values = df.index.levels[0].str.split().str[-1].astype(int)
    df.index = df.index.set_levels(new_index_values, level='study_day')
    df = df.sort_index()
    return df

def change_from_baseline(df, col):
    df_sorted = sort_df(df = df, by = ["Animal_id", "ord"])
    groupedf = grouper(df=df_sorted, by='Animal_id')
    df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
    df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
    return df_sorted

def grouper(df, by):
    df = df.groupby(by)
    return df

def sns_box_plot(df, col, y_lab):   
    plt.subplots(figsize=(12,8))
    sns.set_style("whitegrid")
    g = sns.boxplot(x="study_day", y = col, hue="group_c", 
                    data = df, palette='rainbow', 
                    hue_order=["Group 1", "Group 2", "Group 3"])
    plt.ylabel(y_lab, fontsize=20)
    plt.xlabel("Study Day", fontsize=20)
    plt.tick_params('both', labelsize='14')
    plt.show()

def sns_bar_plot(df, col, y_lab):       
    plt.subplots(figsize=(12,8))
    sns.set_style("whitegrid")
    g = sns.barplot(x="study_day", y = col, hue="group_c", 
                    data = df, palette='rainbow')
    plt.ylabel(y_lab, fontsize=20)
    plt.xlabel("Study Day", fontsize=20)
    plt.tick_params('both', labelsize='14')
    plt.show()

def sort_df(df, by):
    df = df.sort_values(by)
    return df   

def summary_stats(df, by, col):
    dfgrouped = grouper(df, by)
    df_desc = __describe(dfgrouped, col)
    df_summ = pd.DataFrame()
    df_summ["count"] = df_desc['count']
    df_summ["mean"] = df_desc['mean'].round(2).astype(str)
    df_summ["std"] = df_desc['std'].round(2).astype(str)
    df_summ["25%"] = df_desc['25%'].round(2).astype(str)
    df_summ["50%"] = df_desc['50%'].round(2).astype(str)
    df_summ["75%"] = df_desc['75%'].round(2).astype(str)
    df_summ["min"] = df_desc['min'].round(2).astype(str)
    df_summ["max"] = df_desc['max'].round(2).astype(str)
    return __sort_by_character_study_day(df_summ)

def summary_stat_formatted(df, by, col):
    data_summ = summary_stats(df, by, col)
    formatted_summ = pd.DataFrame()
    formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
    formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
    formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  
    formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
    return formatted_summ

def write_to_excel(df, outfile, sheetname):
    writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
    df.to_excel(writer, sheet_name=sheetname)

当我调用其中一个函数时,说:

summary_stats(df=bw, by= ["study_day", "group_c"], col='Body_Weight')

一切正常,我得到以下输出: enter image description here

现在,由于我将在几乎相同的DataFrames上使用这些相同的函数(唯一不同的是dataframe的名称和分析列),所以我认为创建一个类可能是一个好主意(请让我知道这是否是正确的方法)。我的课看起来像这样:

class PreClinicalData(object):

    def __init__(self, df):
        self.df = df

    def __describe(self, col):
        df_desc = self.df[col].describe()
        return df_desc

    def __sort_by_character_study_day(self):
        new_index_values = self.df.index.levels[0].str.split().str[-1].astype(int)
        self.df.index = self.df.index.set_levels(new_index_values, level='study_day')
        self.df = self.df.sort_index()
        return self.df

    def change_from_baseline(self,  col):
        df_sorted = self.sort_df(df = self.df, by = ["Animal_id", "ord"])
        groupedf = self.grouper(df=df_sorted, by='Animal_id')
        df_sorted['Change From Baseline'] = df_sorted[col] - groupedf[col].transform('first')
        df_sorted['Percent Change From Baseline'] = (df_sorted['Change From Baseline'] / groupedf[col].transform('first')).round(4) * 100
        return df_sorted

    def summary_stats(self, by, col):
        dfgrouped = self.grouper(df=self.df, by=by)
        df_desc = self.__describe(dfgrouped, col=col)
        df_summ = pd.DataFrame()
        df_summ["count"] = df_desc['count']
        df_summ["mean"] = df_desc['mean'].round(2).astype(str)
        df_summ["std"] = df_desc['std'].round(2).astype(str)
        df_summ["25%"] = df_desc['25%'].round(2).astype(str)
        df_summ["50%"] = df_desc['50%'].round(2).astype(str)
        df_summ["75%"] = df_desc['75%'].round(2).astype(str)
        df_summ["min"] = df_desc['min'].round(2).astype(str)
        df_summ["max"] = df_desc['max'].round(2).astype(str)
        return self.__sort_by_character_study_day(df_summ)

    def summary_stat_formatted(self, by, col):
        data_summ = self.summary_stats(self, by, col)
        formatted_summ = pd.DataFrame()
        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  
        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
        return formatted_summ

    def write_to_excel(self, outfile, sheetname):
        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
        self.df.to_excel(writer, sheet_name=sheetname)    

    def grouper(self, by):
        return self.df.groupby(by)

    def sns_box_plot(self, col, y_lab):   
        plt.subplots(figsize=(12,8))
        sns.set_style("whitegrid")
        g = sns.boxplot(x="study_day", y = col, hue="group_c", 
                        data = self.df, palette='rainbow', 
                        hue_order=["Group 1", "Group 2", "Group 3"])
        plt.ylabel(y_lab, fontsize=20)
        plt.xlabel("Study Day", fontsize=20)
        plt.tick_params('both', labelsize='14')
        plt.show()

    def sns_bar_plot(self, col, y_lab):       
        plt.subplots(figsize=(12,8))
        sns.set_style("whitegrid")
        g = sns.barplot(x="study_day", y = col, hue="group_c", 
                        data = self.df, palette='rainbow')
        plt.ylabel(y_lab, fontsize=20)
        plt.xlabel("Study Day", fontsize=20)
        plt.tick_params('both', labelsize='14')
        plt.show()

    def sort_df(self, by):
        return self.df.sort_values(by)   

    def summary_stats(self, by, col):
        dfgrouped = self.grouper(self, by)
        df_desc = self.__describe(dfgrouped, col)
        df_summ = pd.DataFrame()
        df_summ["count"] = df_desc['count']
        df_summ["mean"] = df_desc['mean'].round(2).astype(str)
        df_summ["std"] = df_desc['std'].round(2).astype(str)
        df_summ["25%"] = df_desc['25%'].round(2).astype(str)
        df_summ["50%"] = df_desc['50%'].round(2).astype(str)
        df_summ["75%"] = df_desc['75%'].round(2).astype(str)
        df_summ["min"] = df_desc['min'].round(2).astype(str)
        df_summ["max"] = df_desc['max'].round(2).astype(str)
        return self.__sort_by_character_study_day(df_summ)

    def summary_stat_formatted(self, by, col):
        data_summ = self.summary_stats(self, by, col)
        formatted_summ = pd.DataFrame()
        formatted_summ["Number of Observations"] = data_summ['count'].astype(int)
        formatted_summ["Mean (SD)"] = data_summ["mean"] +' (' + data_summ["std"] +')'
        formatted_summ["Median (25th - 75th %ile)"] = data_summ["50%"] + ' (' + data_summ["25%"] +' ,' + data_summ["75%"] +')'  
        formatted_summ["Min, Max"] = data_summ["min"] +' ,' + data_summ["max"]
        return formatted_summ

    def write_to_excel(self, outfile, sheetname):
        writer = pd.ExcelWriter(outfile, engine='xlsxwriter')
        self.df.to_excel(writer, sheet_name=sheetname)

我可以实例化该类的实例,如下所示:

bodyweight = PreClinicalData(chickv_data['bodyweight'])

而且我能够调用DataFrame并使用head()方法就好了。

bodyweight.df.head()

现在,当我调用与上图所示相同的方法时……我得到以下信息:

bodyweight.summary_stats(by = ["study_day", "group_c"], col = "Body_Weight")

会产生以下错误:

enter image description here

我不知道这些所谓的三个论点从何而来?

0 个答案:

没有答案