有人可以协助解决以下代码错误吗?我正在准备数据以使用深度学习模型进行训练,但是由于numpy ValueError而无法完成此操作。
这是我的原始数据:https://drive.google.com/file/d/1skaoLARqjrEeLOf4R-9Ulh89M8KWOTYD/view?usp=sharing。清理之后,这是用于训练模型的最终输出:https://drive.google.com/file/d/1i_OOkuSTQ7Y6iQJALbGUtJ5Fs10POuBY/view?usp=sharing。
下面是用于训练模型的WordEmbedding
类:
from gensim.models import Word2Vec
from sklearn.decomposition import PCA
from matplotlib import pyplot
import string
import re
import numpy as np
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
class WordEmbedding:
def __init__(self):
print(" ")
def load_dataset(self, filename):
file = open(filename, 'rt')
dataset = file.read()
file.close()
return dataset
def createSequence(self, tokens):
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
seq = tokens[i-length:i]
line = ''.join(seq)
sequences.append(line)
data = '\n'.join(sequences)
return data
def encode_words(self, dataset):
data = dataset.split('\n')
newShape = 2, -1
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)
sequences = tokenizer.texts_to_sequences(data)
vocab_size = len(tokenizer.word_index) + 1
sequences = array(sequences)
#sequences = np.array2string(sequences)
sequences = np.reshape(sequences, newShape)
#sequences = np.array2string(sequences)
print(sequences.dtype)
print(sequences.shape)
X, y = sequences[:,:-1], sequences[:,-1]
print(y.dtype)
#y = np.array2string(y)
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]
return X, y, vocab_size, seq_length, tokenizer
以下代码用于测试类WordEmbedding
:
from WordEmbedding import WordEmbedding
emb = WordEmbedding()
data = emb.load_dataset('trecis2018-test.parisAttacks2015.txt')
seq_data = emb.createSequence(data)
X,y,vocab_size,seq_length,tokenizer = emb.encode_words(seq_data)
model = emb.define_model(vocab_size, seq_length)
model.fit(X, y, batch_size=128, epochs=100)
model.save('model.h5')
emb.dump(tokenizer, open('tokenizer.pkl', 'wb'))
print("successful")
下面是运行代码时的错误消息:
Reloaded modules: WordEmbedding
object
(2, 104309)
object
Traceback (most recent call last):
File "<ipython-input-18-9db02c6b1f06>", line 1, in <module>
runfile('/home/asifa/anaconda3/deep_learning_project/processor.py', wdir='/home/asifa/anaconda3/deep_learning_project')
File "/home/asifa/anaconda3/envs/researchProject/lib/python3.6/site-packages/spyder_kernels/customize/spydercustomize.py", line 827, in runfile
execfile(filename, namespace)
File "/home/asifa/anaconda3/envs/researchProject/lib/python3.6/site-packages/spyder_kernels/customize/spydercustomize.py", line 110, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "/home/asifa/anaconda3/deep_learning_project/processor.py", line 15, in <module>
X,y,vocab_size,seq_length,tokenizer = emb.encode_words(seq_data)
File "/home/asifa/anaconda3/deep_learning_project/WordEmbedding.py", line 77, in encode_words
y = to_categorical(y, num_classes=vocab_size)
File "/home/asifa/anaconda3/envs/researchProject/lib/python3.6/site-packages/keras/utils/np_utils.py", line 25, in to_categorical
y = np.array(y, dtype='int')
ValueError: setting an array element with a sequence.
答案 0 :(得分:0)
return data
中的方法:createSequence,然后重试该流(如果可行)。