根据数据框中逗号分隔列中的文本进行分组

时间:2018-08-31 10:05:14

标签: python json pandas dataframe

我有一个数据表,其中某些列的值是以逗号分隔的多个条目。我想根据这些列中的每个条目对数据进行分组。

Date        Investment Type                                    Medium
1/1/2000    Mutual Fund, Stocks, Fixed Deposit, Real Estate    Own, Online,Through Agent
1/2/2000    Mutual Fund, Stocks, Real Estate                   Own
1/3/2000    Fixed Deposit                                      Online
1/3/2000    Mutual Fund, Fixed Deposit, Real Estate            Through Agent
1/2/2000    Stocks                                             Own, Online,Through Agent

我需要按如下所示的媒介(Medium)和投资类型(Investment Type)进行分组。媒介是我正在编写的软件所接收的输入。

Medium        Investment Type   Date

Online        Stocks            1/2/2000,1/1/2000
Own           Mutual Fund       1/1/2000,1/3/2000

我已经使用收到的输入进行了搜索,并且得到了结果,但是我无法把结果整理成所需的汇总格式。我是 Python 和 Pandas 的新手。感谢您的帮助。谢谢

1 个答案:

答案 0 :(得分:1)

首先使用 Series.str.findall 和带单词边界的正则表达式,按列表 L 提取 Medium 列中的值:

L = ['Online','Own']
pat = '|'.join(r"\b{}\b".format(x) for x in L)
df['New_Medium'] = df.pop('Medium').str.findall('('+ pat + ')').str.join(', ')
#remove rows with empty values
df = df[df['New_Medium'].astype(bool)]
print (df)
   Date      Investment Type                                  New_Medium
0  1/1/2000  Mutual Fund, Stocks, Fixed Deposit, Real Estate  Own, Online
1  1/2/2000  Mutual Fund, Stocks, Real Estate                 Own
2  1/3/2000  Fixed Deposit                                    Online
4  1/2/2000  Stocks                                           Own, Online

最后使用 product 获得所有组合,再用 join 进行聚合:

from itertools import product
df1 = pd.DataFrame([j for i in df.apply(lambda x: x.str.split(',\s*')).values for j in product(*i)], columns=df.columns)
df = df1.groupby(['Investment Type','New_Medium'])['Date'].agg(', '.join).reset_index()
print (df)
   Investment Type  New_Medium  Date
0  Fixed Deposit    Online      1/1/2000, 1/3/2000
1  Fixed Deposit    Own         1/1/2000
2  Mutual Fund      Online      1/1/2000
3  Mutual Fund      Own         1/1/2000, 1/2/2000
4  Real Estate      Online      1/1/2000
5  Real Estate      Own         1/1/2000, 1/2/2000
6  Stocks           Online      1/1/2000, 1/2/2000
7  Stocks           Own         1/1/2000, 1/2/2000, 1/2/2000

/**
 * Reports whether the range [ST, ET] and the range [PST, PET] touch or
 * overlap, i.e. whether any endpoint of one range lies on or between the two
 * endpoints of the other.
 *
 * @param {number} ST - First endpoint of the candidate range.
 * @param {number} ET - Second endpoint of the candidate range.
 * @param {number} PST - First endpoint of the existing range.
 * @param {number} PET - Second endpoint of the existing range.
 * @returns {boolean} true when at least one endpoint falls inside the other
 *     range (boundaries included), otherwise false.
 */
function isBetween(ST, ET, PST, PET) {
    // A point lies on or between two edges exactly when its signed distances
    // to them have opposite signs or one of them is zero, so their product is
    // non-positive. This holds regardless of the order of the edges.
    var liesWithin = function (point, edgeA, edgeB) {
        return (point - edgeA) * (point - edgeB) <= 0;
    };

    return liesWithin(ST, PST, PET) ||
        liesWithin(ET, PST, PET) ||
        liesWithin(PST, ST, ET) ||
        liesWithin(PET, ST, ET);
}

/**
 * Checks a proposed start/end range against every row of the global
 * `KitchenHourList` and alerts the user about the first conflicting row.
 *
 * Each row is read through its `KitchenFromDate`, `KitchenFromTime`,
 * `KitchenToDate` and `KitchenToTime` properties, which are joined with a
 * space and handed to `new Date(...)`.
 * NOTE(review): this relies on the engine's parsing of non-ISO date strings;
 * confirm the format the callers actually supply.
 *
 * A row conflicts when one of the following holds (checked in this order):
 *   1. its start and end are identical to the proposed range;
 *   2. the proposed end falls inside the row and the time-of-day ranges
 *      overlap according to `isBetween`;
 *   3. the proposed start falls inside the row, or the proposed range starts
 *      before the row and ends after the row's start, and the time-of-day
 *      ranges overlap according to `isBetween`.
 *
 * @param {*} start_time - Value passed to `new Date()` for the range start.
 * @param {*} end_time - Value passed to `new Date()` for the range end.
 * @returns {boolean} false when a conflict was found (after an `alert`),
 *     true otherwise, including when `KitchenHourList` is empty.
 */
function disabletime(start_time, end_time) {
    var start_date = new Date(start_time);
    var end_date = new Date(end_time);

    // Minutes since midnight of the proposed range; these do not depend on
    // the row being inspected, so compute them once.
    var STMinut = (start_date.getHours() * 60) + start_date.getMinutes();
    var ETMinut = (end_date.getHours() * 60) + end_date.getMinutes();

    // 1-based row numbers of the first conflict found; -1 means "none".
    var startTimeOverlapIndex = -1;
    var endTimeOverlapIndex = -1;
    var sameDateIndex = -1;

    for (var i = 0; i < KitchenHourList.length; i++) {
        var row = KitchenHourList[i];
        var prev_s_date = new Date(row.KitchenFromDate + " " + row.KitchenFromTime);
        var prev_e_date = new Date(row.KitchenToDate + " " + row.KitchenToTime);

        // Identical ranges are tested first: they also satisfy the overlap
        // tests below, which would otherwise hide the more specific message.
        if (start_date.toString() === prev_s_date.toString() && end_date.toString() === prev_e_date.toString()) {
            sameDateIndex = i + 1;
            break;
        }

        var PSTMinut = (prev_s_date.getHours() * 60) + prev_s_date.getMinutes();
        var PETMinut = (prev_e_date.getHours() * 60) + prev_e_date.getMinutes();

        // Proposed end lands inside the existing row.
        if (end_date <= prev_e_date && end_date > prev_s_date &&
                isBetween(STMinut, ETMinut, PSTMinut, PETMinut)) {
            endTimeOverlapIndex = i + 1;
            break;
        }

        if (start_date < prev_e_date) {
            if (start_date >= prev_s_date) {
                // Proposed start lands inside the existing row.
                if (isBetween(STMinut, ETMinut, PSTMinut, PETMinut)) {
                    startTimeOverlapIndex = i + 1;
                    break;
                }
            } else if (end_date > prev_s_date &&
                    isBetween(STMinut, ETMinut, PSTMinut, PETMinut)) {
                // Proposed range starts before the row and runs into it.
                endTimeOverlapIndex = i + 1;
                break;
            }
        }
    }

    if (sameDateIndex > 0) {
        alert("Sorry! your time cannot be same as row (" + sameDateIndex + "), please check again!");
        return false;
    }
    if (startTimeOverlapIndex > 0) {
        alert("Sorry! your START time is overlaping with row (" + startTimeOverlapIndex + "), please check again!");
        return false;
    }
    if (endTimeOverlapIndex > 0) {
        alert("Sorry! your END time is overlaping with row (" + endTimeOverlapIndex + "), please check again!");
        return false;
    }
    return true;
}