我有很多带注释的照片(2个注释/ Foto)。在脚本中,我使用word2vec表示向量中的单词。现在,我想将单词(属于每张图片)连同矢量一起写在csv中。 每行应代表一个Foto,其每列中都有一个注释,然后是该列中每个注释的向量。
错误信息:AssertionError:传递了8列,传递的数据有1列
错误应该在第89行 datalist.append([pd.DataFrame(rowlist,columns = header)])
我已经进行了很多更改和尝试(将行列表放入数据框),等等,但是仍然出现相同的错误
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd
import warnings
warnings.filterwarnings(action = 'ignore')
import gensim
from gensim.models import Word2Vec
# csv einlesen
testdata=pd.read_csv("./Test1.csv", sep=";", header=None)
# Reads ‘alice.txt’ file
sample = open("G:/Dropbox/_Uni/1. Dissertation/Scripts/word2vec/input.txt", "r", encoding="ISO-8859-1")
s = sample.read()
# Replaces escape character with space
f = s.replace("\n", " ")
data = []
# iterate through each sentence in the file
for i in sent_tokenize(f):
temp = []
# tokenize the sentence into words
for j in word_tokenize(i):
temp.append(j.lower())
data.append(temp)
# Create CBOW model
model1 = gensim.models.Word2Vec(data, min_count = 1,
size = 100, window = 5)
datalist=[]
for i, row in testdata.iterrows():
rowlist= []
fotocounter="Foto "+ str(i+1)
for x in row:
try:
x=str(x)
#i+1 = Fotonummer
rowlist.append([model1[x]])
except KeyError:
rowlist.append(["False"])
print(len(rowlist))
header= ["1","2","3","4","5","6","7","8"]
print(header)
datalist.append([fotocounter, pd.DataFrame(rowlist, columns=header)])
print(datalist)
df=pd.DataFrame(datalist)
#transponiert zeilen und spalten
#df_transposed = df.T
df.to_csv("vector-output.csv", index=False)
# Create Skip Gram model
#model2 = gensim.models.Word2Vec(data, min_count = 1, size = 100,
#
# window = 5, sg = 1)