标签: python pandas dataframe

我有两个pandas DataFrames,并希望根据以下内容生成结果:

* DataFrame 1(df1)中的值是浮点数,第二个DataFrame(df2)中的具体值无关紧要。两个DataFrame的列数相同,但df1多出一些行:它的索引包含更多条目,这些额外的条目分散在整个索引范围内。


如何得到一个与df2形状相同、但取值来自df1的DataFrame?约束条件是:如果df1中某个值所在的索引不在df2中,则必须把该值加到df2同一列中前一个有效(即非NaN)索引处的值上。res_df展示了由df1和df2得到的期望结果。*


DataFrame 1

import pandas as pd

# First column of df1: floats on an irregular float index.
df1_col1 = pd.Series(
    data=[2.5, 0.5, 1, 1, 0.5, 0.5, 2],
    index=[0.0, 2.5, 3.0, 4.0, 5.0, 5.5, 6.0],
)
# Second column of df1: defined on a different, partly overlapping index.
df1_col2 = pd.Series(
    data=[2, 2, 2, 1, 1],
    index=[0.0, 2.0, 4.0, 6.0, 7.0],
)
# Place the two series side by side; labels missing from one side become NaN.
df1 = pd.concat([df1_col1, df1_col2], axis=1)

>>> df1
       0   1
0.0  2.5   2
2.0  NaN   2
2.5  0.5 NaN
3.0  1.0 NaN
4.0  1.0   2
5.0  0.5 NaN
5.5  0.5 NaN
6.0  2.0   1
7.0  NaN   1

DataFrame 2

# df2 only matters for where it has entries; the payload is a placeholder string.
df2_col1 = pd.Series(
    data=['val'] * 6,
    index=[0.0, 2.5, 3.0, 5.0, 5.5, 6.0],
)
df2_col2 = pd.Series(
    data=['val'] * 4,
    index=[0.0, 2.0, 6.0, 7.0],
)
# Place the two series side by side; labels missing from one side become NaN.
df2 = pd.concat([df2_col1, df2_col2], axis=1)

>>> df2
       0    1
0.0  val  val
2.0  NaN  val
2.5  val  NaN
3.0  val  NaN
5.0  val  NaN
5.5  val  NaN
6.0  val  val
7.0  NaN  val


# Expected result, column 0: df1's value at 4.0 has been folded into 3.0.
res_col1 = pd.Series(
    data=[2.5, 0.5, 2, 0.5, 0.5, 2],
    index=[0.0, 2.5, 3.0, 5.0, 5.5, 6.0],
)
# Expected result, column 1: df1's value at 4.0 has been folded into 2.0.
res_col2 = pd.Series(
    data=[2, 4, 1, 1],
    index=[0.0, 2.0, 6.0, 7.0],
)
res_df = pd.concat([res_col1, res_col2], axis=1)

>>> res_df
       0   1
0.0  2.5   2
2.0  NaN   4
2.5  0.5 NaN
3.0  2.0 NaN
5.0  0.5 NaN
5.5  0.5 NaN
6.0  2.0   1
7.0  NaN   1

我在Linux Ubuntu上使用pandas 0.18.0,解决方案需要适用于python 2.7.6和python 3.5.1。谢谢。

6 个答案:

答案 0 :(得分:1)

# Track what's missing, we'll loop over these
isin = df1.index.isin(df2.index)
missidx = df1.index[~isin]

# Base case in preparation for back-add
res_df = df1.reindex_like(df2)

# For each missing index
for i in missidx:
    # iterate over df2 columns
    # because we need to capture
    # its last valid index prior
    # the missing index we've found.
    # Column labels are walked directly because
    # DataFrame.iteritems() was removed in pandas 2.0.
    for j in df2.columns:
        col = df2[j]
        # look for last valid index prior to i
        # (slicing up to a label that is absent needs a sorted index)
        lvi = col.loc[:i].last_valid_index()
        extra = df1.at[i, j]
        # Skip when df2 has no valid entry at or before i, or when df1
        # holds NaN here (NaN is the only value unequal to itself):
        # adding NaN would wipe out the total accumulated at lvi.
        if lvi is None or extra != extra:
            continue
        # take value in df1 (now in res_df)
        # at last valid index from df2
        # and add to it the value in df1
        # at the missing index i
        res_df.at[lvi, j] += extra

def pir_back_add(df1, df2):
    """Return df1 reshaped like df2, folding dropped rows into earlier ones.

    Every row label of ``df1`` that is absent from ``df2.index`` is removed
    from the result, and its value in each column is added to the cell at
    the last label where that column of ``df2`` is non-null at or before
    the removed label.

    Parameters
    ----------
    df1 : DataFrame
        Numeric values to redistribute.
    df2 : DataFrame
        Template: supplies the row/column labels of the result, and its
        non-null cells mark where values may be accumulated. Its index
        must be sorted, because label slicing up to a label that is not
        in the index is only possible on a monotonic index.

    Returns
    -------
    DataFrame
        A new frame with ``df2``'s index and columns; ``df1`` and ``df2``
        are left untouched.

    Notes
    -----
    A dropped value is discarded when it is NaN (so it cannot turn an
    accumulated total into NaN) or when the column has no valid ``df2``
    label at or before it (there is nowhere to add it).
    """
    missidx = df1.index[~df1.index.isin(df2.index)]

    # Start from df1's values at df2's labels, then back-add the rest.
    res_df = df1.reindex_like(df2)

    # Iterate column labels directly: DataFrame.iteritems() no longer
    # exists as of pandas 2.0.
    for j in df2.columns:
        col = df2[j]
        for i in missidx:
            extra = df1.at[i, j]
            if extra != extra:  # NaN is the only value unequal to itself
                continue
            lvi = col.loc[:i].last_valid_index()
            if lvi is None:
                continue
            res_df.at[lvi, j] += extra

    return res_df



1000 loops, best of 3: 677 µs per loop
100 loops, best of 3: 3.06 ms per loop
100 loops, best of 3: 4.55 ms per loop
Alberto Garcia-Raboso
100 loops, best of 3: 2.81 ms per loop
100 loops, best of 3: 2.28 ms per loop

答案 1 :(得分:1)

重新考虑了我的另一个答案之后,我意识到有一个更好的方法来处理这个问题。您仍然需要使用pd.cut()对df1.index进行分箱,但要为每一列分别确定分箱边界——取df2.index与df1中该列不为NaN的行索引的交集。代码如下。

from __future__ import print_function
import pandas as pd

df1_col1 = pd.Series([2.5, .5, 1, 1, .5, .5, 2],
                     index=[0.0, 2.5, 3.0, 4.0, 5.0, 5.5, 6])
df1_col2 = pd.Series([2, 2, 2, 1, 1],
                     index=[0.0, 2.0, 4.0, 6.0, 7.0])
# Sort explicitly: the binning below needs increasing row labels, and the
# row order produced by concat(axis=1) is not relied upon here.
df1 = pd.concat([df1_col1, df1_col2], axis=1).sort_index()

df2_col1 = pd.Series(['val', 'val', 'val', 'val', 'val', 'val'],
                     index=[0.0, 2.5, 3.0, 5.0, 5.5, 6])
df2_col2 = pd.Series(['val', 'val', 'val', 'val'],
                     index=[0.0, 2.0, 6.0, 7.0])
df2 = pd.concat([df2_col1, df2_col2], axis=1).sort_index()

res_df = pd.DataFrame(index=df2.index)
# Right edge of the last bin: one past the largest label of either frame, so
# trailing df1 rows beyond df2's last label still fall into the last bin.
upper_edge = max(df1.index[-1], df2.index[-1]) + 1
# Walk the column labels directly (DataFrame.iteritems() was removed in
# pandas 2.0, and the per-column values it yielded were not used anyway).
for col in df1.columns:
    # Bin boundaries for this column: labels present in df2 at which df1
    # actually holds a value. pd.cut requires them in increasing order.
    bin_bdrys = sorted(df1[col].dropna().index.intersection(df2.index))
    bin_bdrys.append(upper_edge)
    # Left-closed bins [b_k, b_k+1), each labelled by its left boundary.
    bins = pd.cut(df1.index, bin_bdrys, right=False, labels=bin_bdrys[:-1])
    totals = df1[col].groupby(bins).sum()
    # Replace the categorical group labels by plain floats before aligning
    # the totals with df2's float index.
    totals.index = totals.index.astype(float)
    res_df[col] = totals.reindex(df2.index)


       0    1
0.0  2.5  2.0
2.0  NaN  4.0
2.5  0.5  NaN
3.0  2.0  NaN
5.0  0.5  NaN
5.5  0.5  NaN
6.0  2.0  1.0
7.0  NaN  1.0

答案 2 :(得分:1)




# Values to be summed, on the finer index.
s1 = pd.Series(list(range(7)), index=[1.0, 1.5, 1.6, 2.0, 3.0, 3.5, 4.0])
# Only s2's index matters. A scalar is broadcast over the index, whereas a
# one-element list is rejected by current pandas as a length mismatch.
s2 = pd.Series(1, index=[1.0, 2.0, 3.0, 4.0])
1.0    0
1.5    1
1.6    2
2.0    3
3.0    4
3.5    5
4.0    6
dtype: int64

创建一个用于分组的临时Series s。对于s1的每个条目,s的值是它所对应的有效s2索引(即不大于该条目索引的最近一个s2索引)。

# Start with NaN everywhere on s1's index. A scalar is broadcast over the
# index; a one-element list is a length mismatch on current pandas.
s = pd.Series(np.nan, index=s1.index)
# Mark each label that also exists in s2 with itself ...
s.loc[s2.index] = s2.index
# ... and carry it forward, so every s1 label maps to the closest s2 label
# at or before it. (fillna(method='ffill') is deprecated; ffill() is the
# equivalent spelling.)
s = s.ffill()

1.0    1.0
1.5    1.0
1.6    1.0
2.0    2.0
3.0    3.0
3.5    3.0
4.0    4.0
dtype: float64


1.0    3
2.0    3
3.0    9
4.0    6
dtype: int64




# Filling nan's that may interfere with the results
# (only on the rows of df1 that df2 does not have; this modifies df1 in place)
extra_idx = df1.index.difference(df2.index)
df1.loc[extra_idx] = df1.loc[extra_idx].fillna(0)
# If nan's in df1 and df2 coincide, the following would also work:
# df1 = df1.fillna(0)

result_cols = []

for col in df1.columns:
    c1 = df1[col]
    c2 = df2[col].dropna()

    # Grouping key for this column: every df1 label is mapped to the closest
    # label at or before it where df2[col] is valid. Built from scratch each
    # time with an explicit float dtype (np.NaN no longer exists in NumPy 2.0,
    # and fillna(method='ffill') is deprecated in favour of ffill()).
    s = pd.Series(np.nan, index=df1.index)
    s.loc[c2.index] = c2.index
    s = s.ffill()

    out_col = c1.groupby(s).sum()
    # Collect the column; without this the list stays empty and the
    # concat below has nothing to concatenate.
    result_cols.append(out_col)

result = pd.concat(result_cols, axis=1)


答案 3 :(得分:0)


def back_add(df1, df2):
    """Fold df1 values whose labels are missing from df2 into earlier rows.

    Columns are paired by position. For each column, NaNs are dropped from
    both frames; every remaining ``df1`` entry whose label is not among the
    valid labels of the matching ``df2`` column is added to the closest
    earlier entry of that column that is still kept, and then removed.

    Parameters
    ----------
    df1 : DataFrame
        Numeric values to redistribute. Not modified.
    df2 : DataFrame
        Template whose non-null cells decide which labels survive.

    Returns
    -------
    DataFrame
        The processed columns side by side, without rows that ended up
        empty in every column.

    Notes
    -----
    An entry with no kept entry before it in its column has nowhere to go
    and is discarded.
    """
    # dropna() returns new Series, so the in-place edits below never touch
    # the caller's frames.
    cols1 = [df1.iloc[:, x].dropna() for x in range(df1.shape[1])]
    cols2 = [df2.iloc[:, x].dropna() for x in range(df2.shape[1])]

    for i, ser in enumerate(cols1):
        valid_labels = cols2[i].index
        for j, label in enumerate(ser.index):
            if label in valid_labels:
                continue
            # Entries already folded away are NaN, so this finds the last
            # entry before position j that is still kept.
            prev = ser.iloc[:j].last_valid_index()
            if prev is not None:
                ser.at[prev] += ser.iat[j]
            ser.iat[j] = float('nan')
    # Rows that are NaN in every column (the folded-away labels) are dropped.
    return pd.concat(cols1, axis=1).dropna(how='all')


答案 4 :(得分:0)


from __future__ import print_function
import numpy as np
import pandas as pd

df1_col1 = pd.Series([2.5, .5, 1, 1, .5, .5, 2],
                     index=[0.0, 2.5, 3.0, 4.0, 5.0, 5.5, 6])
df1_col2 = pd.Series([2, 2, 2, 1, 1],
                     index=[0.0, 2.0, 4.0, 6.0, 7.0])
# Sort explicitly: the bin edges below are taken from the row labels and
# must be increasing; concat's row order is not relied upon.
df1 = pd.concat([df1_col1, df1_col2], axis=1).sort_index()

df2_col1 = pd.Series(['val', 'val', 'val', 'val', 'val', 'val'],
                     index=[0.0, 2.5, 3.0, 5.0, 5.5, 6])
df2_col2 = pd.Series(['val', 'val', 'val', 'val'],
                     index=[0.0, 2.0, 6.0, 7.0])
df2 = pd.concat([df2_col1, df2_col2], axis=1).sort_index()

# One left-closed bin per df2 label: [label, next label). The last bin ends
# one past the largest label of either frame so no trailing df1 row is lost.
bin_bdrys = list(df2.index)
bin_bdrys.append(max(df1.index[-1], df2.index[-1]) + 1)
bins = pd.cut(df1.index, bin_bdrys, right=False, labels=list(df2.index))

grouped = df1.groupby(bins)
# Sum each bin, but keep NaN where a bin holds no value at all for a column:
# on pandas 0.22 and later, sum() alone reports 0 for such bins.
res_df = grouped.sum().where(grouped.count() > 0)



#        0    1
# 0.0  2.5  2.0
# 2.0  NaN  2.0
# 2.5  0.5  NaN
# 3.0  2.0  2.0
# 5.0  0.5  NaN
# 5.5  0.5  NaN
# 6.0  2.0  1.0
# 7.0  NaN  1.0

问题是df1.loc[4.0, 1]被加到了res_df.loc[3.0, 1]上,但是df1.loc[3.0, 1]是NaN……您可以很容易地找出发生这种情况的位置:

# A cell is suspect when the binned sum holds a value although df1 has no
# value at that same label and column.
# NOTE(review): the trailing dropna() presumably discards the rows that exist
# in only one of the two frames after alignment - confirm on your pandas version.
incorrect = res_df.notnull().__and__(df1.isnull()).dropna()

#          0      1
# 0.0  False  False
# 2.0  False  False
# 2.5  False  False
# 3.0  False   True
# 5.0  False  False
# 5.5  False  False
# 6.0  False  False
# 7.0  False  False


# Iterate over columns
for col, values in incorrect.iteritems():
    # Get the indices of the entries that are wrong
    old_idx = values.nonzero()[0]
    # Get the valid indices
    valid_idx = df1[col].notnull().nonzero()[0]
    # Get the previous valid index for each wrong entry
    new_idx = np.searchsorted(valid_idx, old_idx) - 1
    # Add the wrong entry to the correct position, and `NaN` the former
    for i, j in zip(old_idx, new_idx):
        res_df.iloc[j, col] += res_df.iloc[i, col]
        res_df.iloc[i, col] = np.nan


#        0    1
# 0.0  2.5  2.0
# 2.0  NaN  4.0
# 2.5  0.5  NaN
# 3.0  2.0  NaN
# 5.0  0.5  NaN
# 5.5  0.5  NaN
# 6.0  2.0  1.0
# 7.0  NaN  1.0


old_idx = []
valid_idx = [0, 2, 3, 4, 5, 6, 7]
new_idx = []


old_idx = [3]
valid_idx = [0, 1, 4, 7, 8]
new_idx = [1]

因此res_df.iloc[3, 1]被加到了res_df.iloc[1, 1]上,并且res_df.iloc[3, 1]被重置为NaN。

答案 5 :(得分:0)


# Step 1: Merge df1 and df2 on df2 (to make the shape the same):
df_merge = df2.join(df1, lsuffix='_x', rsuffix='_y')

# Step 2: Bit of indexing elbow grease:
for col in df2.columns:
    non_nan = df_merge[str(col)+'_x'].notnull()
    df_merge.loc[non_nan,str(col)+'_x'] = df_merge.loc[non_nan,str(col)+'_y']

# Step 3: Drop the columns from df1:
df1_cols = [str(col)+'_x' for col in df1.columns]
df_merge.drop(df1_cols, axis=1, inplace=True)
df_merge.columns = df2.columns
