I have these two dataframes:
import pandas as pd

df = pd.DataFrame({'Points': ['A', 'B', 'C', 'D', 'E'], 'ColY': [1, 2, 3, 4, 5]})
df
  Points  ColY
0      A     1
1      B     2
2      C     3
3      D     4
4      E     5
df2 = pd.DataFrame({'Points': ['A', 'D'], 'ColX': [2, 9]})
df2
  Points  ColX
0      A     2
1      D     9
And these two functions:
# equivalent of the Excel vlookup function applied to a dataframe
def vlookup(df, ref, col_ref, col_goal):
    return pd.DataFrame(df[df.apply(lambda x: ref == x[col_ref], axis=1)][col_goal]).iloc[0, 0]

# if x is in column Points of df2, return what is in column ColX in the same row
def update_if_belong_to_df2(x):
    if x in df2['Points']:
        return vlookup(df2, x, 'Points', 'ColX')
    return x
I want to apply the function update_if_belong_to_df2 to the ColY column of df. I tried the following, but it doesn't work:
df['ColY'] = df['ColY'].apply(lambda x : update_if_belong_to_df2(x))
I would like to get:
df
  Points  ColY
0      A     2
1      B     2
2      C     3
3      D     9
4      E     5
Could you please help me understand why? Thank you.
Answer 0 (score: 3)
I would do a merge:
df = df.merge(df2, how='left')
df.ColX = df.ColX.fillna(df.ColY)
df
  Points  ColY  ColX
0      A     1   2.0
1      B     2   2.0
2      C     3   3.0
3      D     4   9.0
4      E     5   5.0
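If the goal is the exact expected frame from the question (only Points and ColY), here is one possible follow-up sketch of the same merge idea, starting again from the original df and df2 (names as in the question; not the only way to finish):
# left-merge on Points, fill the missing ColX from the original ColY, then restore the original shape
out = df.merge(df2, how='left', on='Points')
out['ColY'] = out['ColX'].fillna(out['ColY']).astype(int)  # fillna leaves floats, so cast back to int
out = out.drop(columns='ColX')
This should reproduce the expected ColY of [2, 2, 3, 9, 5]; the astype(int) is only needed because ColX picks up NaN (and therefore a float dtype) during the left merge.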
Answer 1 (score: 2)
IIUC, this can be solved more easily with map and fillna:
df['ColY'] = (df['Points'].map(df2.set_index('Points')['ColX'])
                          .fillna(df['ColY'])
              )
Output:
  Points  ColY
0      A   2.0
1      B   2.0
2      C   3.0
3      D   9.0
4      E   5.0
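As to the "why" in the question: two things go wrong in the original attempt. The in operator on a Series (x in df2['Points']) tests membership against the index labels (0 and 1), not the values, and the function receives ColY values (1 to 5) rather than the Points letters, so 'A' and 'D' are never looked up. Below is a minimal sketch of a fix that keeps the question's vlookup helper, starting again from the original df and df2 (one possibility among several):
# isin() compares against the values, unlike `x in series`, which checks the index
mask = df['Points'].isin(df2['Points'])
# reuse the question's vlookup helper only for the rows that actually need updating
df.loc[mask, 'ColY'] = df.loc[mask, 'Points'].apply(lambda p: vlookup(df2, p, 'Points', 'ColX'))
This should give the expected ColY of [2, 2, 3, 9, 5] while leaving the other rows untouched.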