我有一个数据表,其中某些列的值是用逗号分隔的。我想根据这些列中的每个单独的值对数据进行分组。
Date Investment Type Medium
1/1/2000 Mutual Fund, Stocks, Fixed Deposit, Real Estate Own, Online,Through Agent
1/2/2000 Mutual Fund, Stocks, Real Estate Own
1/3/2000 Fixed Deposit Online
1/3/2000 Mutual Fund, Fixed Deposit, Real Estate Through Agent
1/2/2000 Stocks Own, Online, Through Agent
我必须按媒介(Medium)和投资类型(Investment Type)进行分组,如下所示。媒介是我正在编写的软件接收的输入。
Medium Investment Type Date
Online Stocks 1/2/2000,1/1/2000
Own Mutual Fund 1/1/2000,1/3/2000
我已经使用收到的输入进行了搜索,并且得到了结果,但是我无法把结果转换成所需的汇总格式。我是 Python 和 Pandas 的新手。感谢您的帮助,谢谢。
答案 0 :(得分:1)
首先使用 Series.str.findall 和带单词边界的正则表达式,按列表 L 提取匹配的值:
L = ['Online','Own']
import re

# Build a single alternation of whole-word patterns from the requested media
# in L. The media are input to the program, so re.escape keeps any regex
# metacharacters in them literal instead of corrupting the pattern.
pat = '|'.join(r"\b{}\b".format(re.escape(x)) for x in L)
# Replace the 'Medium' column with only the requested media found in each row,
# joined back into one comma-separated string.
df['New_Medium'] = df.pop('Medium').str.findall('('+ pat + ')').str.join(', ')
#remove rows with empty values
df = df[df['New_Medium'].astype(bool)]
print (df)
Date Investment Type New_Medium
0 1/1/2000 Mutual Fund, Stocks, Fixed Deposit, Real Estate Own, Online
1 1/2/2000 Mutual Fund, Stocks, Real Estate Own
2 1/3/2000 Fixed Deposit Online
4 1/2/2000 Stocks Own, Online
最后使用 product 获得所有组合,再用 join 进行聚合:
from itertools import product
# Split every cell on commas (plus any following whitespace) so each cell
# becomes a list of individual values. The pattern is a raw string so that
# "\s" reaches the regex engine instead of being an invalid string escape.
split_rows = df.apply(lambda col: col.str.split(r',\s*')).values
# Expand each row into the Cartesian product of its per-column lists: one
# output row per combination of the row's column values.
df1 = pd.DataFrame([combo for row in split_rows for combo in product(*row)],
                   columns=df.columns)
# Collect the dates of every (Investment Type, New_Medium) pair into one string.
df = df1.groupby(['Investment Type','New_Medium'])['Date'].agg(', '.join).reset_index()
print (df)
Investment Type New_Medium Date
0 Fixed Deposit Online 1/1/2000, 1/3/2000
1 Fixed Deposit Own 1/1/2000
2 Mutual Fund Online 1/1/2000
3 Mutual Fund Own 1/1/2000, 1/2/2000
4 Real Estate Online 1/1/2000
5 Real Estate Own 1/1/2000, 1/2/2000
6 Stocks Online 1/1/2000, 1/2/2000
7 Stocks Own 1/1/2000, 1/2/2000, 1/2/2000
:
/**
 * Report whether any endpoint of one span lies within the other span.
 *
 * @param {number} ST  First span, one endpoint.
 * @param {number} ET  First span, other endpoint.
 * @param {number} PST Second span, one endpoint.
 * @param {number} PET Second span, other endpoint.
 * @returns {boolean} true when ST or ET lies within [PST, PET], or PST or PET
 *     lies within [ST, ET] (bounds inclusive); false otherwise.
 */
function isBetween(ST, ET, PST, PET) {
    // A value sits between two bounds (inclusive, in either order) exactly
    // when its differences to the bounds do not share a strict sign.
    var liesWithin = function (value, boundA, boundB) {
        return (value - boundA) * (value - boundB) <= 0;
    };
    return liesWithin(ST, PST, PET) ||
        liesWithin(ET, PST, PET) ||
        liesWithin(PST, ST, ET) ||
        liesWithin(PET, ST, ET);
}
/**
 * Validate a proposed time range against the rows of the global
 * KitchenHourList and alert the user about the first conflict found.
 *
 * Each row is read through its KitchenFromDate/KitchenFromTime and
 * KitchenToDate/KitchenToTime fields, joined with a space and parsed by
 * `new Date()`. Rows are scanned in order and the scan stops at the first
 * conflicting row; row numbers in the alerts are 1-based.
 *
 * @param {*} start_time Value handed to `new Date()` for the range start.
 * @param {*} end_time   Value handed to `new Date()` for the range end.
 * @returns {boolean} false (after showing an alert) when the range equals or
 *     overlaps an existing row; true otherwise, including for an empty list.
 */
function disabletime(start_time, end_time) {
    var start_date = new Date(start_time);
    var end_date = new Date(end_time);

    // 1-based number of the first conflicting row per category; -1 = none.
    var startTimeOverlapIndex = -1;
    var endTimeOverlapIndex = -1;
    var sameDateIndex = -1;

    // Minutes since midnight of the proposed range; identical for every row,
    // so computed once outside the loop.
    var STMinut = (start_date.getHours() * 60) + start_date.getMinutes();
    var ETMinut = (end_date.getHours() * 60) + end_date.getMinutes();

    for (var i = 0; i < KitchenHourList.length; i++) {
        var row = KitchenHourList[i];
        var prev_s_date = new Date(row.KitchenFromDate + " " + row.KitchenFromTime);
        var prev_e_date = new Date(row.KitchenToDate + " " + row.KitchenToTime);

        // Check for an exact duplicate first. If the overlap tests ran first
        // they would claim an identical range and the more specific
        // "same as row" message would never be shown.
        if (start_date.toString() === prev_s_date.toString() &&
                end_date.toString() === prev_e_date.toString()) {
            sameDateIndex = i + 1;
            break;
        }

        var PSTMinut = (prev_s_date.getHours() * 60) + prev_s_date.getMinutes();
        var PETMinut = (prev_e_date.getHours() * 60) + prev_e_date.getMinutes();

        // Every overlap category additionally requires the time-of-day
        // windows to intersect, so a row failing that test cannot conflict.
        if (!isBetween(STMinut, ETMinut, PSTMinut, PETMinut)) {
            continue;
        }

        // The proposed end falls inside the row.
        if (end_date <= prev_e_date && end_date > prev_s_date) {
            endTimeOverlapIndex = i + 1;
            break;
        }

        if (start_date < prev_e_date) {
            // The proposed start falls inside the row.
            if (start_date >= prev_s_date) {
                startTimeOverlapIndex = i + 1;
                break;
            }
            // The proposed range starts before the row and runs into it.
            if (end_date > prev_s_date) {
                endTimeOverlapIndex = i + 1;
                break;
            }
        }
    }

    if (sameDateIndex > 0) {
        // Report the duplicate row's own number.
        alert("Sorry! your time cannot be same as row (" + sameDateIndex + "), please check again!");
        return false;
    }
    if (startTimeOverlapIndex > 0) {
        alert("Sorry! your START time is overlaping with row (" + startTimeOverlapIndex + "), please check again!");
        return false;
    }
    if (endTimeOverlapIndex > 0) {
        alert("Sorry! your END time is overlaping with row (" + endTimeOverlapIndex + "), please check again!");
        return false;
    }
    return true;
}