New to machine learning, so please bear with me.
I'm trying to train a convolutional neural network to classify Rotten Tomatoes reviews as positive or negative. The raw dataframe itself isn't very complicated. I've added a column indicating whether each row belongs to the training, validation, or test split, and cleaned the text, but beyond that it looks more or less like the following:
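Something like this, with made-up placeholder rows (the column names match what the code below expects):

import pandas as pd

review_df = pd.DataFrame({
    "freshness": ["fresh", "rotten", "fresh"],  # the label
    "review":    ["a great movie", "a terrible movie", "loved it"],
    "split":     ["train", "train", "val"],     # train / validation / test flag
})

First, the Vocabulary class I use to map tokens to indices: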
class Vocabulary(object):
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self.token_to_idx_ = token_to_idx
        self.idx_to_token_ = {
            idx: token
            for token, idx in self.token_to_idx_.items()
        }
        self.add_unk_ = add_unk
        self.unk_token_ = unk_token
        self.unk_index_ = -1  # stays -1 only when no <UNK> token is added
        if self.add_unk_:
            self.unk_index_ = self.add_token(unk_token)

    def to_serializable(self):
        return {
            "token_to_idx": self.token_to_idx_,
            "add_unk": self.add_unk_,
            "unk_token": self.unk_token_
        }

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        # return the existing index, or append the token with a new index
        if token in self.token_to_idx_:
            index = self.token_to_idx_[token]
        else:
            index = len(self.token_to_idx_)
            self.token_to_idx_[token] = index
            self.idx_to_token_[index] = token
        return index

    def lookup_token(self, token):
        # unknown tokens fall back to the <UNK> index when add_unk is on
        if self.add_unk_:
            return self.token_to_idx_.get(token, self.unk_index_)
        else:
            return self.token_to_idx_[token]

    def lookup_index(self, index):
        if index not in self.idx_to_token_:
            raise KeyError("index (%d) is not in the Vocabulary" % index)
        return self.idx_to_token_[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self.token_to_idx_)
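A quick round trip to show how I expect it to behave (just an illustration, not part of the pipeline):

vocab = Vocabulary()
idx = vocab.add_token("fresh")
print(vocab.lookup_token("fresh"))   # the same index add_token returned
print(vocab.lookup_token("unseen"))  # falls back to the <UNK> index (0 here)
print(vocab.lookup_index(idx))       # "fresh"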
What I'm having trouble understanding is how to determine the correct dimensions for the input fed into my classifier. The relevant code follows.
Class definitions:
import string
from collections import Counter

import numpy as np

class Vectorizer(object):
    def __init__(self, review_vocab, rating_vocab):
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        # create a collapsed one-hot vector for new observations
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float64)
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1
        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff=10):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        # the ratings become the class vocabulary
        for rating in sorted(set(review_df["freshness"])):
            rating_vocab.add_token(rating)
        # only words seen more than `cutoff` times enter the review vocabulary
        word_counts = Counter()
        for review in review_df["review"]:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)
        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        review_vocab = Vocabulary.from_serializable(contents["review_vocab"])
        rating_vocab = Vocabulary.from_serializable(contents["rating_vocab"])
        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        return {
            "review_vocab": self.review_vocab.to_serializable(),
            "rating_vocab": self.rating_vocab.to_serializable()
        }
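And this is how I understand the vectorizer behaving (tiny made-up dataframe, with the cutoff lowered so every word survives):

import pandas as pd

df = pd.DataFrame({
    "review": ["a great movie", "a terrible movie"],
    "freshness": ["fresh", "rotten"],
})
v = Vectorizer.from_dataframe(df, cutoff=0)
one_hot = v.vectorize("a great film")
print(one_hot.shape)  # (len(v.review_vocab),) -- one slot per vocabulary word
print(one_hot.sum())  # 3.0: "a", "great", and <UNK> (standing in for "film")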
import torch
import torch.nn as nn

class Classifier(nn.Module):
    def __init__(self, initial_n_channels, n_classes, network_n_channels):
        super(Classifier, self).__init__()
        self.network = nn.Sequential(
            nn.Conv1d(in_channels=initial_n_channels,
                      out_channels=network_n_channels,
                      kernel_size=args["kernel_size"]),
            nn.ReLU(),
            nn.Conv1d(in_channels=network_n_channels,
                      out_channels=network_n_channels,
                      kernel_size=args["kernel_size"],
                      stride=args["stride"]),
            nn.ReLU(),
            nn.Conv1d(in_channels=network_n_channels,
                      out_channels=network_n_channels,
                      kernel_size=args["kernel_size"],
                      stride=args["stride"]),
            nn.ReLU(),
            nn.Conv1d(in_channels=network_n_channels,
                      out_channels=network_n_channels,
                      kernel_size=args["kernel_size"],
                      stride=args["stride"]),
            nn.ReLU()
        )
        self.fc = nn.Linear(network_n_channels, n_classes)

    def forward(self, x_in, apply_sigmoid=False):
        features = self.network(x_in)  # squeeze?
        prediction_vector = self.fc(features)
        if apply_sigmoid:
            prediction_vector = torch.sigmoid(prediction_vector)
        return prediction_vector
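Part of my confusion is the "# squeeze?" comment above: as far as I can tell, the conv stack outputs (batch, channels, remaining_length), which nn.Linear can't consume directly as (batch, features). A toy shape check with made-up sizes:

import torch
import torch.nn as nn

net = nn.Sequential(nn.Conv1d(4, 8, kernel_size=3), nn.ReLU())
out = net(torch.randn(2, 4, 10))
print(out.shape)                 # torch.Size([2, 8, 8])
print(out.squeeze(dim=2).shape)  # unchanged: squeeze(2) only drops the dim when remaining_length == 1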
import torch.optim as optim

# dataset and vectorizer
dataset = ReviewDataset.load_and_vectorize(args["review_csv"])
vectorizer = dataset.get_vectorizer()

# model
classifier = Classifier(initial_n_channels=len(vectorizer.review_vocab),
                        n_classes=len(vectorizer.rating_vocab),
                        network_n_channels=args["num_channels"])

# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args["learning_rate"])
Running the above code results in:
RuntimeError: Expected 3-dimensional input for 3-dimensional weight 128 7882, but got 2-dimensional input of size [128, 7882] instead
My input instances are two-dimensional tensors of shape:
(batch_size, len(review_vocab))
I thought a two-dimensional tensor of that shape would be fine, but apparently my CNN needs a third dimension in the data? What can I do to add the extra dimension, or to change what the classifier itself expects?
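For reference, a minimal reproduction of the shape requirement (made-up sizes, unrelated to my actual vocabulary):

import torch
import torch.nn as nn

conv = nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3)
x2d = torch.randn(128, 4)      # (batch, features): what my collapsed one-hot gives
x3d = torch.randn(128, 4, 10)  # (batch, channels, seq_len): what Conv1d wants
# conv(x2d)                    # raises the same "Expected 3-dimensional input" error
print(conv(x3d).shape)         # torch.Size([128, 8, 8])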