我尝试运行一个 Python 脚本,但出现错误,提示 'charmap' 编解码器无法解码某个字节,因为该字符映射到 `<undefined>`。我想这与 Unicode 有关,但我缺乏解决这类问题的经验。
def _read_labeled_reviews(split_path):
    """Read every ``.txt`` file under ``split_path/pos`` and ``split_path/neg``.

    Args:
        split_path: Directory that contains the ``pos`` and ``neg`` folders.

    Returns:
        A ``(texts, labels)`` pair of equal-length lists. ``pos`` files come
        first, then ``neg`` files, each group in sorted filename order.
        Labels are 1 for ``pos`` and 0 for ``neg``.

    Raises:
        FileNotFoundError: If a category folder does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    texts = []
    labels = []
    for category in ['pos', 'neg']:
        category_path = os.path.join(split_path, category)
        for fname in sorted(os.listdir(category_path)):
            if not fname.endswith('.txt'):
                continue
            # Decode explicitly as UTF-8. Relying on the platform default
            # (the cp1252 "charmap" codec on Windows) is what raised
            # UnicodeDecodeError on bytes such as 0xaa.
            file_path = os.path.join(category_path, fname)
            with open(file_path, encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if category == 'neg' else 1)
    return texts, labels


def load_imdb_sentiment_analysis_dataset(data_path="C:/Users/name/Desktop",
                                         seed=123):
    """Load the IMDb sentiment dataset from ``<data_path>/aclImdb``.

    Args:
        data_path: Directory that contains the ``aclImdb`` folder.
        seed: Seed used to shuffle the training texts and labels.

    Returns:
        ``((train_texts, train_labels), (test_texts, test_labels))`` where the
        texts are lists of strings and the labels are NumPy arrays holding
        1 for positive and 0 for negative reviews. Only the training split
        is shuffled.

    Raises:
        FileNotFoundError: If an expected dataset folder is missing.
        UnicodeDecodeError: If a review file is not valid UTF-8.
    """
    imdb_data_path = os.path.join(data_path, 'aclImdb')

    # Load the training data.
    train_texts, train_labels = _read_labeled_reviews(
        os.path.join(imdb_data_path, 'train'))

    # Load the validation data.
    test_texts, test_labels = _read_labeled_reviews(
        os.path.join(imdb_data_path, 'test'))

    # Shuffle the training data and labels. Re-seeding with the same value
    # before each shuffle applies the same permutation to both equal-length
    # lists, so every text stays paired with its label.
    random.seed(seed)
    random.shuffle(train_texts)
    random.seed(seed)
    random.shuffle(train_labels)

    return ((train_texts, np.array(train_labels)),
            (test_texts, np.array(test_labels)))
我收到以下错误:`UnicodeDecodeError: 'charmap' codec can't decode byte 0xaa in position 489: character maps to <undefined>`
答案 0 :(得分:1)
您需要弄清楚要打开的文件所使用的编码,并在 open() 函数中指定它。
例如 UTF-8:open(filename, encoding='utf8')
因此,您可以把
with open(os.path.join(train_path, fname))
改为
with open(os.path.join(train_path, fname), encoding='utf8')
如果您不在意那些无法解码的字符,也可以直接跳过它们(使用这种方法要小心,因为它会悄悄丢弃数据):open(filename, errors='ignore')
with open(os.path.join(train_path, fname), errors='ignore')