我有两个 DataFrame。DF1 的每一行包含两个日期和一个 ID;对于 DF1 中的每一行,我想从 DF2 中选出日期介于这两个日期之间且 ID 相同的所有行。
这是我的原始代码:
# Flatten each sampled conversion into a plain list:
# [account_id, calendar_date, purchase_date, p1_purchased].
c_list = conversion_samples[['account_id', 'calendar_date', 'purchase_date', 'p1_purchased']].values.tolist()
for conversion in c_list:
    # NOTE(review): calendar_date is unpacked as end_date and purchase_date as
    # start_date here, which is the reverse of get_rows_prior_to_conversion --
    # confirm which order is intended.
    acct, end_date, start_date, score = conversion
    # Inclusive date window, restricted to the conversion's account.
    btwn_dates = (df['calendar_date'] >= start_date) & (df['calendar_date'] <= end_date)
    specified_acct = df['account_id'] == acct
    # Filter first and copy only the matching rows; copying the entire frame
    # on every iteration is unnecessary and dominates the runtime.
    matches = df.loc[btwn_dates & specified_acct].copy()
    # Key that identifies which conversion these rows belong to.
    matches['groupby_key'] = str(acct) + str(end_date)
    # Difference between the score at purchase and each row's p1 score.
    matches['score_diff'] = score - matches['p1']
    df_list.append(matches)
然后我尝试使用 multiprocessing 包将其并行化:
def get_rows_prior_to_conversion(conversion, df=df):
    """Select the rows of *df* that fall inside one conversion's date window.

    Args:
        conversion: Sequence of ``(acct, start_date, end_date, score)``.
        df: Frame to search; it is read through the ``calendar_date``,
            ``account_id`` and ``p1`` columns. Defaults to the module-level
            ``df`` bound when this function is defined.

    Returns:
        A new DataFrame with the rows whose ``account_id`` equals ``acct`` and
        whose ``calendar_date`` lies in ``[start_date, end_date]`` (inclusive),
        with ``groupby_key``, ``score_diff`` and ``purchase_date`` columns
        set. The same frame is also appended to the module-level ``df_list``.
    """
    acct, start_date, end_date, score = conversion
    btwn_dates = (df['calendar_date'] >= start_date) & (df['calendar_date'] <= end_date)
    specified_acct = df['account_id'] == acct
    # Filter first and copy only the matching rows. Copying the whole frame
    # on every call is the expensive step and is repeated for every task
    # submitted to the pool.
    matches = df.loc[btwn_dates & specified_acct].copy()
    # Key that identifies which conversion these rows belong to.
    matches['groupby_key'] = str(acct) + str(end_date)
    matches['score_diff'] = score - matches['p1']
    matches['purchase_date'] = end_date
    # Kept for existing callers that collect results through df_list.
    df_list.append(matches)
    # Returned as well so that pool.map(...) yields the frames directly.
    return matches
# Draw a reproducible sample of 1000 conversions.
conversion_samples = conversions_only_df.sample(n=1000, random_state=1)
# Flatten the sample into a list of
# [account_id, calendar_date, purchase_date, p1_purchased] lists
# (purchase date, t-90 days date, and account id, plus the purchase score).
c_sample_list = conversion_samples[['account_id', 'calendar_date', 'purchase_date', 'p1_purchased']].values.tolist()
# Filled by get_rows_prior_to_conversion with one DataFrame per conversion.
df_list = []
# NOTE(review): presumably this is multiprocessing.pool.ThreadPool -- confirm
# against the imports. If so, the workers are threads in a single process and
# pure-Python work is serialized by the GIL, so little speed-up is expected.
# The context manager releases the worker threads once map() has returned;
# previously the pool was never closed.
with ThreadPool(mp.cpu_count()) as pool:
    pool.map(get_rows_prior_to_conversion, c_sample_list)
但这似乎并没有真正并行执行?我不确定问题出在哪里。