Padding dataframes so rows are filled consistently

Time: 2016-06-04 00:39:10

Tags: python pandas

I have two csv files in a folder; the first looks like this:

DF1

Value  Count
1      10
2      1
3      2
4      15
5      8

and the second like this:

DF2

Value  Count
1      5 
2      22
3      13
4      16
5      11
6      18

I want every Value to match. Basically, I want to turn df1 into this:

Value  Count
1      10
2      1
3      2
4      15
5      8
6      0

Even though a Value of 6 does not exist in df1, I want it filled in with a zero.

I'm reading the files like this:

import os
import pandas as pd

pth = r'C:\pathway'
for f in os.listdir(pth):
    df = pd.read_csv(os.path.join(pth, f))

I tried adding this:

df = df.append({'Value': '6', 'Count': '0'}, ignore_index=True)

But that appends the new row to df2 as well, which I want to avoid.

I have about 20 csv files in the folder, and I just want the Values to match across all of them, filling in a 0 wherever a particular Value is missing.

2 Answers:

Answer 0 (score: 2)

from glob import glob
import pandas as pd

filenames = glob('C:/pathway/*.csv')

def rfile(fn):
    # Read each csv with Value as the index
    return pd.read_csv(fn, index_col=0)

dfs = [rfile(fn) for fn in filenames]

# Build the union of the Value indices across all files
idx = dfs[0].index
for i in range(1, len(dfs)):
    idx = idx.union(dfs[i].index)

idx is now your index: the union of the indices from every file. reindex any individual dataframe as follows to get what you want.

df = dfs[0].reindex(idx, fill_value=0)
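
To bring all ~20 files in line at once, a minimal sketch building on the above (assuming you want to overwrite the originals; point to_csv at a different folder to keep them):

# Reindex every dataframe against the common index and write it back out
for fn, df in zip(filenames, dfs):
    df.reindex(idx, fill_value=0).to_csv(fn)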

Answer 1 (score: 1)

This is easy using reindex. First, set each df's index equal to its own Value column. Then reindex against the larger one (df2).
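
A minimal sketch of that approach, assuming df1 and df2 are already loaded as in the question:

df1 = df1.set_index('Value')
df2 = df2.set_index('Value')

# Reindex df1 against df2's larger Value index; missing rows get Count 0
df1 = df1.reindex(df2.index, fill_value=0).reset_index()

The reset_index at the end turns Value back into a regular column, matching the original layout.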
