我遇到了一个文本分类问题:我需要对俄语文本进行分类。在特征提取阶段,我使用了 scikit-learn 的 TfidfTransformer 和 CountVectorizer,但运行代码后出现如下错误:
'UnicodeDecodeError: 'utf8' codec can't decode byte 0xc2 in position 0:
invalid continuation byte'.
我该如何纠正这个错误?这是Python中的代码:
# -*- coding: utf-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords
import numpy as np
import numpy.linalg as LA
import os
import nltk
import re
import sys
from nltk import NaiveBayesClassifier
import nltk.classify
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import re
# Absolute location of the corpus root; every register has its own sub-folder.
data_path = os.path.abspath(os.path.join('/home/lena/', 'corpus'))

# Resolve all register folders in one pass so their names sit side by side.
# (A generator expression is used so no loop variable leaks into the module.)
official_path, official2_path, talk_path, talk2_path = (
    os.path.join(data_path, folder)
    for folder in ('official', 'official_2', 'talk', 'talk_2')
)
# The 'fiction' and 'fiction_2' folders are currently disabled.
def get_text(path, encoding=None):
    """Return the complete content of the file at *path*.

    Args:
        path: Location of the file to read.
        encoding: Name of the codec used to decode the file (for example
            ``'koi8-r'`` or ``'cp1251'``).  When ``None`` (the default) the
            content is returned undecoded, as a byte string, so the caller
            can choose how to decode it.

    Returns:
        The file content with every line ending normalised to ``'\\n'``:
        decoded text when *encoding* is given, a byte string otherwise.

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If *encoding* is given and does not match the
            bytes stored in the file.
    """
    import io

    if encoding is None:
        # The deprecated 'rU' mode (rejected by Python 3.11+) is replaced by
        # a binary read plus an explicit newline normalisation.
        with io.open(path, 'rb') as handle:
            raw = handle.read()
        return raw.replace(b'\r\n', b'\n').replace(b'\r', b'\n')

    # io.open() in text mode translates line endings to '\n' by default.
    with io.open(path, 'r', encoding=encoding) as handle:
        return handle.read()
def get_textdir(path):
    """Return the content of every regular file directly inside *path*.

    Args:
        path: Directory whose files are read (not recursive).

    Returns:
        A list with one entry per file, as produced by ``get_text``, ordered
        by file name.

    Raises:
        OSError: If *path* cannot be listed.
    """
    all_text = []
    # os.listdir() yields names in arbitrary order; sorting keeps the
    # document order reproducible between runs and machines.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Sub-directories cannot be read as documents; skip them instead of
        # failing inside open().
        if os.path.isfile(full_path):
            all_text.append(get_text(full_path))
    return all_text
all_talk = get_textdir(talk_path)
all_official = get_textdir(official_path)
official_2 = get_textdir(official2_path)
talk_2 = get_textdir(talk2_path)
train_set = all_talk
test_set = talk_2
stopWords = stopwords.words('russian')
vectorizer = CountVectorizer(stop_words = stopWords)
print vectorizer
train = vectorizer.fit_transform(train_set).toarray()
test = vectorizer.transform(test_set).toarray()
print 'train set', train
print 'test set', test
transformer.fit(train)
print transformer.transform(train).toarray()
transformer.fit(test)
tfidf = transformer.transform(test)
print tfidf.todense()
答案 0 :(得分:1)
在向量化器(vectorizer)上设置 charset 参数(在 0.14 及之后的版本中,该参数名为 encoding)。对于俄语文本,这可能是
CountVectorizer(charset='koi8r', stop_words=stopWords)
(但不要只听我的一面之词,请先在文本文件上运行 chardet 或 file 之类的工具,确认文件的实际编码)。