我正在尝试制作一个分类器，用来判断电影评论的内容是正面还是负面。我使用了几个相关文件：一个包含全部词汇表的文件（每行一个单词）；两个 CSV（一个用于训练集，一个用于测试集），按文档顺序给出每个文档的得分；以及另外两个 CSV（结构同上），其中每一行是该条评论中出现的每个单词在词汇表中的索引组成的列表。例如，对于第 34 条评论"我喜欢这部电影"，得分文件中对应的行是 1（0 表示不喜欢，1 表示喜欢），单词文件中对应的行是 [2, 13, 64, 33]。我使用的是 DNNClassifier，目前只用了一个特征，即包裹在 categorical_column_with_identity 外层的嵌入列（embedding column）。我的代码可以运行，但得到的结果非常糟糕，我不知道为什么。也许对 TensorFlow 更了解的人可以帮助我。另外，我平时不常在这里提问，但老实说，我尝试搜索过，却找不到能直接帮到我的帖子。
import tensorflow as tf
import pandas as pd
import numpy as np
import os
embedding_d = 18
label_name = ['Label']
col_name = ["Words"]
hidden_unit = [10]*5
BATCH = 50
STEP = 5000
#Ignore some warning messages but an optional compiler
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
##Function to feed into training
def train_input_fn(features, labels, batch_size):
# Convert the inputs to a Dataset.
dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
# Shuffle, repeat, and batch the examples.
dataset = dataset.shuffle(1000).repeat().batch(batch_size)
# Return the dataset.
return dataset
##Orignal Eval. Untouched so far. Mostly likely will need to be changed.
def eval_input_fn(features, labels, batch_size):
"""An input function for evaluation or prediction"""
features=dict(features)
if labels is None:
# No labels, use only features.
inputs = features
else:
inputs = (features, labels)
# Convert the inputs to a Dataset.
dataset = tf.data.Dataset.from_tensor_slices(inputs)
# Batch the examples
assert batch_size is not None, "batch_size must not be None"
dataset = dataset.batch(batch_size)
# Return the dataset.
return dataset
## Produces dataframe for labels and features(words) using pandas
def loadData():
train_label =pd.read_csv("aclImdb/train/yaynay.csv",names=label_name)
test_label =pd.read_csv("aclImdb/test/yaynay.csv",names=label_name)
train_feat = pd.read_csv("aclImdb/train/set.csv", names = col_name)
test_feat = pd.read_csv("aclImdb/test/set.csv", names = col_name)
train_feat[col_name] =train_feat[col_name].astype(np.int64)
test_feat[col_name] =test_feat[col_name].astype(np.int64)
return (train_feat,train_label),(test_feat,test_label)
## Stuff that I believe is somewhat working
# Get labels for test and training data
(train_x,train_y), (test_x,test_y) = loadData()
## Get the features for each document
train_feature = []
#Currently only one key but this could change in the future
for key in train_x.keys():
#Create a categorical_column column
idCol = tf.feature_column.categorical_column_with_identity(
key= key,
num_buckets=89528)
embedding_column = tf.feature_column.embedding_column(
categorical_column= idCol,
dimension=embedding_d)
train_feature.append(embedding_column)
##Create the neural network
classifier = tf.estimator.DNNClassifier(
feature_columns=train_feature,
# Species no. of layers and no. of neurons in each layer
hidden_units=hidden_unit,
# Number of output options(here there are 11 for scores 0-10 inclusive)
n_classes= 2)
# Train the Model
#First numerical value is batch size, second is total steps to take.
classifier.train(input_fn= lambda: train_input_fn(train_x, train_y, BATCH),steps=STEP)
#Evaluate the model
eval_result = classifier.evaluate(
input_fn=lambda:eval_input_fn(test_x, test_y,
BATCH), steps = STEP)
print('\nTest set accuracy: {accuracy:0.3f}\n'.format(**eval_result))