Question

我有这两个数据框：

df = pd.DataFrame({'Points' : ['A','B','C','D','E'],'ColY' : [1,2,3,4,5]})
df
    Points  ColY
0       A      1
1       B      2
2       C      3
3       D      4
4       E      5

df2 = pd.DataFrame({'Points' : ['A','D'],'ColX' : [2,9]})
df2
    Points  ColX
0       A      2
1       D      9

以及这两个函数：

# Excel-style VLOOKUP applied to a dataframe
def vlookup(df, ref, col_ref, col_goal):
    """Return the `col_goal` value of the first row of `df` whose `col_ref` equals `ref`.

    The match is evaluated row by row; the result is read positionally from
    the first matching row, so a lookup with no matching row fails at the
    `.iloc[0, 0]` access.
    """
    # Boolean mask: True for every row whose reference column equals `ref`.
    row_matches = df.apply(lambda row: ref == row[col_ref], axis=1)
    # Keep only the matching rows, then narrow to the target column.
    selected = df[row_matches][col_goal]
    # Wrap in a DataFrame so the first hit can be addressed as cell (0, 0).
    return pd.DataFrame(selected).iloc[0, 0]

# if the membership test on df2['Points'] succeeds for x, return the ColX value
# of the matching row; otherwise hand x back unchanged
def update_if_belong_to_df2(x):
    """Return the df2 'ColX' value looked up for `x`, or `x` itself when the check fails.

    NOTE: the `in` operator applied to a pandas Series tests the Series'
    index labels, not its stored values, so this check is made against the
    row labels of df2 rather than the contents of its 'Points' column.
    """
    points = df2['Points']
    if x not in points:
        return x
    return vlookup(df2, x, 'Points', 'ColX')

我想将函数 update_if_belong_to_df2 应用于 df 的 ColY 列。我尝试了以下写法，但不起作用：

df['ColY'] = df['ColY'].apply(lambda x : update_if_belong_to_df2(x))

我想得到：

df
    Points  ColY
0       A      2
1       B      2
2       C      3
3       D      9
4       E      5

能请您帮我了解原因吗？谢谢

Answer 1

我会使用 merge：

df=df.merge(df2,how='left')
df.ColX=df.ColX.fillna(df.ColY)
df
  Points  ColY  ColX
0      A     1   2.0
1      B     2   2.0
2      C     3   3.0
3      D     4   9.0
4      E     5   5.0

Answer 2

IIUC，使用map和fillna可以更轻松地解决问题：

df['ColY'] = (df['Points'].map(df2.set_index('Points')['ColX'])
                   .fillna(df['ColY'])
              )

输出：

  Points  ColY
0      A   2.0
1      B   2.0
2      C   3.0
3      D   9.0
4      E   5.0

Answer 3

改为使用 pandas：

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt

tf.reset_default_graph()
tf.set_random_seed(777)  # reproducibility


def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    # noise term prevents the zero division
    return numerator / (denominator + 1e-7)


# train Parameters
seq_length = 6
data_dim = 5
hidden_dim = 10
output_dim = 1
learning_rate = 0.01
iterations = 500

# Open, High, Low, Volume, Close
#df = pd.read_csv("precipitation_post.csv", quotechar='"', decimal=".")
#df = df.interpolate(method ='linear', limit_direction ='forward')
#xy = df.reindex(index=df.index[::-1])
xy = np.loadtxt('df.txt', dtype='double', delimiter=' ', skiprows=1)
#xy = xy[::-1]

# train/test split
train_size = int(len(xy) * 0.7)
train_set = xy[0:train_size]
test_set = xy[train_size - seq_length:]  # Index from [train_size - seq_length] to utilize past sequence

# Scale each
train_set = MinMaxScaler(train_set)
test_set = MinMaxScaler(test_set)

x = xy
y = xy[:, [-1]]  # close as label


# build datasets
def build_dataset(time_series, seq_length):
    dataX = []
    dataY = []
    for i in range(0, len(time_series) - seq_length):
        _x = time_series[i:i + seq_length]
        _y = time_series[i + seq_length]
        print(_x, "->", _y)
        dataX.append(_x)
        dataY.append(_y)
    return np.array(dataX), np.array(dataY)


trainX, trainY = build_dataset(train_set, seq_length)
testX, testY = build_dataset(test_set, seq_length)

# input place holders
X = tf.placeholder(tf.float32, shape=[None, seq_length, data_dim])
Y = tf.placeholder(tf.float32, shape=[None, 1])

# build a LSTM network
cell = tf.contrib.rnn.BasicLSTMCell(
    num_units=hidden_dim, state_is_tuple=True, activation=tf.tanh)
outputs, _states = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
Y_pred = tf.contrib.layers.fully_connected(
    outputs[:, -1], output_dim, activation_fn=None)  # We use the last cell's output

# cost/loss
loss = tf.reduce_sum(tf.square(Y_pred - Y))  # sum of the squares
# optimizer
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)

# RMSE
targets = tf.placeholder(tf.float32, [None, 1])
predictions = tf.placeholder(tf.float32, [None, 1])
rmse = tf.sqrt(tf.reduce_mean(tf.square(targets - predictions)))

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    # Training step
    for i in range(iterations):
        _, step_loss = sess.run([train, loss], feed_dict={
            X: trainX, Y: trainY})
        print("[step: {}] loss: {}".format(i, step_loss))

    # Test step
    test_predict = sess.run(Y_pred, feed_dict={X: testX})
    rmse_val = sess.run(rmse, feed_dict={
        targets: testY, predictions: test_predict})
    print("RMSE: {}".format(rmse_val))

    # Plot predictions
    plt.plot(testY)
    plt.plot(test_predict)
    plt.xlabel("Time Period")
    plt.ylabel("Precipitation")
    plt.show()

update

在 pandas 数据框的列上应用函数时出现问题

3 个答案: