I have two csv files in a folder. The first looks like this:
DF1
Value Count
1 10
2 1
3 2
4 15
5 8
and the second like this:
DF2
Value Count
1 5
2 22
3 13
4 16
5 11
6 18
I want the Value entries to match across the files. Basically, I want to turn df1 into this:
Value Count
1 10
2 1
3 2
4 15
5 8
6 0
Even though a Value of 6 does not exist in df1, I want it filled in with a zero.
I am reading the files like this:
import os
import pandas as pd

pth = r'C:\pathway'
for f in os.listdir(pth):
    df = pd.read_csv(os.path.join(pth, f))
I tried adding this:
df = df.append({'Value': '6', 'Count': '0'}, ignore_index=True)
but that appends the new row to df2 as well, which I want to avoid.
I have around 20 csv files in the folder, and I just want the Value entries to match across all of them, filling in a 0 wherever a particular Value does not exist.
Answer 0 (score: 2)
from glob import glob
import pandas as pd

filenames = glob('C:/pathway/*.csv')

def rfile(fn):
    return pd.read_csv(fn, index_col=0)

dfs = [rfile(fn) for fn in filenames]

# Build the union of the indices across all files
idx = dfs[0].index
for i in range(1, len(dfs)):
    idx = idx.union(dfs[i].index)
idx is now your index: the union of the indices from all the files. reindex any one of the individual dataframes in the following way to get what you want:
df = dfs[0].reindex(idx, fill_value=0)
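If you want this applied to every file, a minimal sketch reusing the dfs list and the union index idx from above:

dfs = [df.reindex(idx, fill_value=0) for df in dfs]

Every dataframe now shares the same index, with Count filled in as 0 for any Value it was missing.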
Answer 1 (score: 1)
This is easy using reindex. First, set each df's index equal to its own Value column. Then reindex against the larger one (df2).
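A minimal sketch of that approach, assuming Value is an ordinary column in each file and the filenames are illustrative:

import pandas as pd

df1 = pd.read_csv('df1.csv').set_index('Value')
df2 = pd.read_csv('df2.csv').set_index('Value')

# Align df1 to df2's larger index; Values missing from df1 get Count = 0
df1 = df1.reindex(df2.index, fill_value=0).reset_index()

Note that this assumes df2 really does contain every Value; if it might not, the union-of-indices approach from Answer 0 is more general.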
'''
A Recurrent Neural Network implementation example using TensorFlow Library.
Author: *********
'''
import numpy as np
import tensorflow as tf
# import matplotlib.pyplot as plt
# from mpl_toolkits.mplot3d import Axes3D
# Parameters
training_iters = 1000
n_epochs = 1000
batch_size = 128
display_step = 100
learning_rate = 0.001
n_observations = 100
n_input = 2 # Input data (Num + Num)
n_steps = 28 # timesteps
n_hidden_1 = 256 # 1st layer number of features
n_hidden_2 = 256 # 2nd layer number of features
n_classes = 1 # Output
X = tf.placeholder(tf.float32, [None, n_input])
Y = tf.placeholder(tf.float32, [None, n_classes])

# Random input data: pairs of numbers; the target is their sum
x1 = 100 * np.random.random_sample([n_observations, 1])
x2 = 100 * np.random.random_sample([n_observations, 1])
xs = np.hstack([x1, x2])  # network inputs, shape (n_observations, 2)
ys = x1 + x2              # targets, shape (n_observations, 1)
weights = {
    'hidden1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    # 'hidden2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
    'hidden1': tf.Variable(tf.random_normal([n_hidden_1])),
    # 'hidden2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}
def RNN(_X, _weights, _biases):
    # Note: despite the name, this builds a plain feed-forward network
    # Layer 1: hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(_X, _weights['hidden1']), _biases['hidden1'])
    layer_1 = tf.nn.relu(layer_1)
    # Layer 1.2 (optional second hidden layer, currently disabled)
    # layer_1_2 = tf.add(tf.matmul(layer_1, _weights['hidden2']), _biases['hidden2'])
    # layer_1_2 = tf.nn.relu(layer_1_2)
    # Output layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, _weights['out']), _biases['out'])
    output = tf.nn.relu(layer_2)
    return output
pred = RNN(X, weights, biases)
cost = tf.reduce_sum(tf.pow(pred - Y, 2)) / (n_observations - 1)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # Adam Optimizer
init = tf.global_variables_initializer()
with tf.Session() as sess:
    # Here we tell tensorflow that we want to initialize all
    # the variables in the graph so we can use them
    sess.run(init)

    # Fit all training data
    prev_training_cost = 0.0
    for epoch_i in range(n_epochs):
        # One full-batch gradient step on the (x1, x2) -> x1 + x2 data
        sess.run(optimizer, feed_dict={X: xs, Y: ys})
        training_cost = sess.run(cost, feed_dict={X: xs, Y: ys})

        if epoch_i % display_step == 0:
            print(epoch_i, training_cost)
            # Plotting stays disabled while the matplotlib imports above
            # are commented out:
            # ax.plot(x1, x2, pred.eval(feed_dict={X: xs}, session=sess),
            #         'k', alpha=epoch_i / n_epochs)
            # fig.show()
            # plt.draw()

        # Allow the training to quit if we've reached a minimum
        if np.abs(prev_training_cost - training_cost) < 0.000001:
            break
        prev_training_cost = training_cost