Question

我已经看过一些处理类似情况的问题，但它们都没有解答我的具体问题。因此，对于解决下面这个问题的任何建议，我都将不胜感激。

我正在尝试为一个文本分类问题实现RNN模型。我有一个csv文件（triple.csv），其中包含句子（triple列）和类标签（truth列，取值为0或1）。

Triple.csv文件的示例

triple,truth
sportsteam hawks teamplaysincity city atlanta,1
stadiumoreventvenue hondacenter stadiumlocatedincity city anaheim,1
sportsteam ducks teamplaysincity city anaheim,1
sportsteam n1985chicagobears teamplaysincity city chicago,1
...

我正在尝试使用RNN及其word2vec嵌入来训练句子（三元组）。但是，我不断收到以下错误。

ValueError: could not convert string to float: 'sportsleague nfl leaguestadiums stadiumoreventvenue heinzfield'

我的主要代码

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import gensim
import pandas as pd
import os
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec, KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from termcolor import colored
from keras.utils import to_categorical

# Fetch the NLTK stop-word corpus used by the filtering step below.
nltk.download('stopwords')

# Load the CSV; the 'triple' column holds the raw sentences.
df = pd.read_csv('data/triple.csv')
lines = df['triple'].values.tolist()

# Loop invariants: build the punctuation-stripping table and the stop-word
# set once instead of once per sentence.
table = str.maketrans('', '', string.punctuation)
stop_words = set(stopwords.words('english'))

# Tokenise each sentence: lower-case, strip punctuation, keep only purely
# alphabetic tokens, and drop English stop words.
triple_lines = list()
for line in lines:
    tokens = word_tokenize(line)
    tokens = [w.lower() for w in tokens]
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha()]
    words = [w for w in words if w not in stop_words]
    triple_lines.append(words)

# Report how many tokenised sentences were collected.
print(colored(len(triple_lines),'green'))
EMBEDDING_DIM = 100  # dimensionality of each learned word vector
# Train a word2vec model on the tokenised sentences.
# NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x spellings
# (gensim 4 uses `vector_size` and `wv.key_to_index`) - confirm against the
# installed gensim version.
model = gensim.models.Word2Vec(sentences=triple_lines, size=EMBEDDING_DIM, window =5, workers=4, min_count=1)
words = list(model.wv.vocab)
print(colored('Vocabulary size: %d' % len(words),'green'))

# Save the learned vectors to disk in the plain-text word2vec format.
filename = 'embedding_word2vec.txt'
model.wv.save_word2vec_format(filename,binary=False)

# Read the saved word2vec text file back into a {word: vector} lookup.
embedding_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
# NOTE(review): the word2vec text format presumably starts with a
# "<count> <dim>" header line, which ends up as one extra, unused entry
# in the lookup - confirm.
with open(filename, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        # Parse the coefficients as floats up front rather than keeping an
        # array of strings around.
        coefs = np.asarray(values[1:], dtype=float)
        embedding_index[word] = coefs

# Vectorize the text samples into a 2D integer tensor
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(triple_lines)
sequences = tokenizer_obj.texts_to_sequences(triple_lines)

# Token -> integer index mapping learned by the tokenizer
word_index = tokenizer_obj.word_index
print(colored('Found %s unique tokens.'% len(word_index),'magenta'))

# Length passed to pad_sequences as maxlen
max_length = 9

# Pad sequences
triple_pad = pad_sequences(sequences, maxlen=max_length)
# NOTE(review): this selects the 'triple' column (the sentences) a second
# time, so `truth` holds strings rather than the values of the 'truth'
# column listed in the CSV header - confirm which column is intended.
truth = df['triple'].values
print('Shape of triple tensor: ', triple_pad.shape)
print('Shape of truth tensor: ', truth.shape)

# Map each word in tokenizer_obj.word_index to its loaded word2vec vector,
# producing the weight matrix handed to the embedding layer.

# One extra row beyond the vocabulary size
# (presumably because tokenizer indices start at 1 - confirm).
num_words = len(word_index)+1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Valid rows are 0..num_words-1, so index num_words itself is already
    # out of range and must be skipped as well.
    if i >= num_words:
        continue
    embedding_vector = embedding_index.get(word)
    # Words not found in the embedding index keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

print(colored(num_words,'cyan'))

# Define Model: frozen pre-trained embedding -> GRU -> sigmoid output.
model = Sequential()
# trainable=False keeps the word2vec weights fixed during training.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=max_length,
                            trainable=False)

model.add(embedding_layer)
model.add(GRU(units=32, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in
# print(colored(...)) only added a stray "None" line to the output.
model.summary()

# Split the data into training set and validation set
VALIDATION_SPLIT = 0.2

# Shuffle samples and labels with the same permutation so they stay aligned.
indices = np.arange(triple_pad.shape[0])
np.random.shuffle(indices)
triple_pad = triple_pad[indices]
truth = truth[indices]
num_validation_samples = int(VALIDATION_SPLIT * triple_pad.shape[0])

# Slice at an explicit positive index. With negative offsets, a validation
# count of 0 would turn `[:-0]` into an empty training set and `[-0:]` into
# the whole data set.
split_at = triple_pad.shape[0] - num_validation_samples
X_train_pad = triple_pad[:split_at]
y_train = truth[:split_at]
X_test_pad = triple_pad[split_at:]
y_test = truth[split_at:]

print('Shape of X_train_pad tensor: ',X_train_pad.shape)
print('Shape of y_train tensor: ',y_train.shape)
print('Shape of X_test_pad tensor: ',X_test_pad.shape)
print('Shape of y_test tensor: ',y_test.shape)

print(colored('Training...','green'))
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

在解决此问题方面的任何帮助将不胜感激。

Answer 1

出现此错误是因为我把字符串类型的y_train传给了model.fit()。

我没有把布尔真值（truth列）定义为目标类，而是把三元组（triple列）定义成了目标类，于是字符串被传进了model.fit()，如下所示。

truth = df['triple'].values

因此，只需按如下所示修改上面的行即可解决此问题。

truth = df['truth'].values

我竟然会忽略这种琐碎的细节，真是太离谱了。我真蠢！

Keras错误-ValueError：无法将字符串转换为float

Triple.csv文件的示例

我的主要代码

1 个答案: