Training an RNN with LSTM nodes

Date: 2017-10-26 21:26:00

Tags: python scikit-learn keras keras-layer

Here is my code for training an RNN with LSTM nodes:

# LSTM RNN with dropout for sequence classification
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import pickle, numpy, pandas as pd

###################################### CONSTANTS #############################################

SEED = 7                        # Fixes random seed for reproducibility.
URL = 'ibcData.tsv'             # Specified dataset to gather data from.
SEPERATOR = '\t'                # Separator the dataset uses to divide data.
RANDOM_STATE = 1                # Pseudo-random number generator state used for random sampling.
TOP_WORDS = 5000                # Most used words in the dataset.
MAX_REVIEW_LENGTH = 500         # Length of each sentence being sent in (necessary).
EMBEDDING_VECTOR_LENGTH = 32    # The Embedding layer will use vectors of length 32 to
                                # represent each word.
BATCH_SIZE = 64                 # Takes 64 sentences at a time and continually retrains RNN.
NUMBER_OF_EPOCHS = 3            # Fits RNN to more accurately guess the data's political bias.
DROPOUT = 0.2                   # Helps slow down overfitting of data (slower convergence rate)
RECURRENT_DROPOUT = 0.2         # Helps slow down overfitting of data when recurrently training

##############################################################################################

# fix random seed for reproducibility
numpy.random.seed(SEED)


readData = pd.read_csv(URL, header=None, names=['label', 'message'], sep=SEPERATOR)

# convert label to a numerical variable
readData['label_num'] = readData.label.map({'Liberal' : 0, 'Neutral': 0.5, 'Conservative' : 1})
X = readData.message    # Contains the dataset's actual sentences that were labeled
Y = readData.label_num  # Either 0.0, 0.5, or 1.0 depending on label mapped to

# load the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)

# truncate and pad input sequences
for sentence in X_train:
    sentence.zfill(MAX_REVIEW_LENGTH)
for sentence in X_test:
    sentence.zfill(MAX_REVIEW_LENGTH)

# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100, recurrent_dropout=RECURRENT_DROPOUT, dropout=DROPOUT))    # Dropouts help prevent overfitting

model.add(Dense(2, activation='sigmoid'))                   # Layers deal with a 2D tensor, and output a 2D tensor
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

It is trained on a .tsv file containing data like the following:

"Liberal","Forcing middle-class workers to bear a greater share of the cost of government weakens their support for needed investments and stirs resentment toward those who depend on public services the most ."

"Liberal", "Because it would not be worthwhile to bring a case for $ 30.22 , the arbitration clause would , as a practical matter , deny the Concepcions any relief and , more important , eliminate a class action that might punish AT&T for its pattern of fraudulent behavior ."

When I try to run it I get the following console output, and I don't know how to fix it, and neither does the professor who is helping me with this research:

Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 500, 32)           160000
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               53200
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 202
=================================================================
Total params: 213,402
Trainable params: 213,402
Non-trainable params: 0
_________________________________________________________________
None
Traceback (most recent call last):
  File "LSTM-RNN.py", line 55, in <module>
    model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\models.py", line 871, in fit
    initial_epoch=initial_epoch)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line 1525, in fit
    batch_size=batch_size)
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line 1379, in _standardize_user_data
    exception_prefix='input')
  File "C:\Users\Hydur\Anaconda3\lib\site-packages\keras\keras\engine\training.py", line 144, in _standardize_input_data
    str(array.shape))
ValueError: Error when checking input: expected embedding_1_input to have shape (None, 500) but got array with shape (3244, 1)

1 Answer:

Answer 0 (score: 1)

The main problem seems to be that X contains the raw strings, while the Embedding layer expects data that has already been encoded numerically. The Keras text preprocessing utilities will take care of this:

from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=MAX_REVIEW_LENGTH)
tokenizer.fit_on_texts(readData.message)
X = numpy.array(tokenizer.texts_to_matrix(readData.message)) # shape (None, 500)

This will encode each message as a vector of 500 numbers, with a unique integer index assigned to each word.
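As an aside (not part of the original fix), an Embedding layer is more commonly fed padded sequences of word indices rather than a document-term matrix. A minimal sketch of that alternative, reusing the fitted Tokenizer above, might look like:

from keras.preprocessing.sequence import pad_sequences

# Hedged alternative: convert each message to a list of word indices,
# then pad/truncate every list to exactly MAX_REVIEW_LENGTH entries.
sequences = tokenizer.texts_to_sequences(readData.message)
X = pad_sequences(sequences, maxlen=MAX_REVIEW_LENGTH)  # shape (n_samples, 500)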

Once that was fixed, I also got an error on the "dense_1" layer. The last layer in your network was specified to have two output nodes, but the loss function you used (binary_crossentropy) expects a single column coded as 0/1. I edited it so that the layer has just one output node and the process completes, but I doubt that using 0, 0.5, 1 with binary cross-entropy will do what you want. I think you are probably after a 3-level one-hot encoding with categorical_crossentropy, but that is beyond the scope of this question.
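For reference, a minimal sketch of that three-class variant (a hedged addition, not part of the fix below; it assumes the labels map to integer classes 0/1/2):

from keras.utils import to_categorical

# Map the three labels to integer classes instead of 0 / 0.5 / 1.
readData['label_num'] = readData.label.map({'Liberal': 0, 'Neutral': 1, 'Conservative': 2})
Y = to_categorical(readData.label_num.values)  # one-hot, shape (n_samples, 3)

# Matching output layer and loss:
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])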

Here is the full edited script that ran for me. I could only run it on the two observations you provided, but it did complete:


from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import os, pickle, numpy, pandas as pd
from keras.preprocessing.text import Tokenizer
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

################################### CONSTANTS ################################################
SEED = 7                        # Fixes random seed for reproducibility.
URL = 'ibcData.tsv'             # Specified dataset to gather data from.
SEPERATOR = '\t'                # Separator the dataset uses to divide data.
RANDOM_STATE = 1                # Pseudo-random number generator state used for random sampling.
TOP_WORDS = 5000                # Most used words in the dataset.
MAX_REVIEW_LENGTH = 500         # Length of each sentence being sent in (necessary).
EMBEDDING_VECTOR_LENGTH = 32    # The Embedding layer will use vectors of length 32 to
                                # represent each word.
BATCH_SIZE = 64                 # Takes 64 sentences at a time and continually retrains RNN.
NUMBER_OF_EPOCHS = 3            # Fits RNN to more accurately guess the data's political bias.

# fix random seed for reproducibility
numpy.random.seed(SEED)


readData = pd.read_csv(URL, header=None, names=['label', 'message'], sep=SEPERATOR)

# encode each message as a fixed-length numeric vector
tokenizer = Tokenizer(num_words=MAX_REVIEW_LENGTH)
tokenizer.fit_on_texts(readData.message)
X = numpy.array(tokenizer.texts_to_matrix(readData.message)) # shape (None, 500)

# convert label to a numerical variable
readData['label_num'] = readData.label.map({'Liberal' : 0, 'Neutral': 0.5, 'Conservative' : 1})
Y = numpy.array(readData.label_num)  # Either 0.0, 0.5, or 1.0 depending on label mapped to


# load the dataset into training and testing datasets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=RANDOM_STATE)

# create the model
model = Sequential()
model.add(Embedding(TOP_WORDS, EMBEDDING_VECTOR_LENGTH, input_length=MAX_REVIEW_LENGTH))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))   # Single output node to match binary_crossentropy
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=NUMBER_OF_EPOCHS, batch_size=BATCH_SIZE)

# Final evaluation of the model
scores = model.evaluate(X_test, Y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Hope that helps.