我正在尝试使用BERT进行培训 这是创建遮罩的代码
def bert_encode(texts, tokenizer, max_len=512):
all_tokens = []
all_masks = []
all_segments = []
for text in texts:
text = tokenizer.tokenize(text)
text = text[:max_len-2]
input_sequence = ["[CLS]"] + text + ["[SEP]"]
pad_len = max_len - len(input_sequence)
tokens = tokenizer.convert_tokens_to_ids(input_sequence)
tokens += [0] * pad_len
pad_masks = [1] * len(input_sequence) + [0] * pad_len
segment_ids = [0] * max_len
all_tokens.append(tokens)
all_masks.append(pad_masks)
all_segments.append(segment_ids)
return np.array(all_tokens), np.array(all_masks), np.array(all_segments)
当我运行它
train_input = bert_encode(train.text.values, tokenizer, max_len=160)
test_input = bert_encode(test.tweet.values, tokenizer, max_len=160)
train_labels = train.average.values
由于ram用完了,我的Google colab崩溃了。任何解决方案