Python - sklearn - ValueError: empty vocabulary

Asked: 2014-03-31 19:37:52

Tags: python scikit-learn

I am trying to replicate a project that was done before, and I have run into a problem with the CountVectorizer function. Below is the code relevant to the issue.

from __future__ import division
import nltk, textmining, pprint, re, os.path 
#import numpy as np
from nltk.corpus import gutenberg
import fileinput

list = ["carmilla.txt", "pirate-caribbee.txt", "rider-sage.txt"]

for l in list:
    f = open(l)
    raw1 = f.read()
    print "<-----Here goes nothing"
    head = raw1[:680]
    foot = raw1[157560:176380]
    content = raw1[680:157560]
    print "Done---->"

content = [re.sub(r'[\']', '', text) for text in content]
content = [re.sub(r'[^\w\s\.]', ' ', text) for text in content]

print content

propernouns = []
for story in content:
    propernouns = propernouns+re.findall(r'Mr.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Mrs.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Ms.[\s][\w]+', story)
    propernouns = propernouns+re.findall(r'Miss.[\s][\w]+', story)

propernouns = set(propernouns)
print "\nNumber of proper nouns: " + str(len(propernouns))
print "\nExamples from our list of proper nouns: "+str(sorted(propernouns))

#Strip all of the above out of text
for word in propernouns:
    content = [re.sub(" "+word+" "," ",story) for story in content]

import string
content = [story.translate(string.maketrans("",""), "_.0123456789")]

print "\n[2] -----Carmilla Text-----"
print content

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
    f1.close()
    f2.close()

stopfile = open('stopwords2.txt')

print "Examples of stopwords: "
print stopfile.read()

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stopfile , min_df=1)
stories_tdm = cv.fit_transform(content).toarray()

Running this never completes, and I get the following error:

Traceback (most recent call last):
  File "C:\Users\mnate_000\workspace\de.vogella.python.third\src\TestFile_EDIT.py", line 84, in <module>
    stories_tdm = cv.fit_transform(content).toarray()
  File "C:\Users\mnate_000\Anaconda\lib\site-packages\sklearn\feature_extraction\text.py", line 780, in fit_transform
    vocabulary, X = self._count_vocab(raw_documents, self.fixed_vocabulary)
  File "C:\Users\mnate_000\Anaconda\lib\site-packages\sklearn\feature_extraction\text.py", line 727, in _count_vocab
    raise ValueError("empty vocabulary; perhaps the documents only"
ValueError: empty vocabulary; perhaps the documents only contain stop words

I am not sure where to go from here. As a test I have already tried swapping "content" out for another file, and I have confirmed the stopword file is not the cause, but I cannot get it to run properly. Has anyone else run into this problem? Am I missing something simple?
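
Edit: for what it's worth, the error itself is easy to reproduce in isolation. CountVectorizer's default token pattern only keeps tokens of two or more word characters, so if every document boils down to single characters (or only stop words), the vocabulary comes out empty. A minimal sketch, independent of my files:

from sklearn.feature_extraction.text import CountVectorizer

# Every "document" here is a single character, so the default token
# pattern (which requires at least two word characters per token)
# matches nothing and the vocabulary ends up empty.
cv_demo = CountVectorizer(min_df=1)
cv_demo.fit_transform(["a", "b", "c"])  # ValueError: empty vocabulary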

1 Answer:

Answer 0 (score: 0)

Remember to close your files properly: there is no f.close() anywhere, and neither f1.close() nor f2.close() should be indented inside the loop.

I think this may fix your problem.

for l in list:
    f = open(l)
    raw1 = f.read()
    print "<-----Here goes nothing"
    head = raw1[:680]
    foot = raw1[157560:176380]
    content = raw1[680:157560]
    print "Done---->"
    f.close()

...

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
f1.close()
f2.close()
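
As a side note, with blocks would avoid this class of bug entirely, since the files are closed automatically when the block exits (even on an exception). A minimal sketch of the same stopword rewrite:

# Context managers close both files automatically when the block exits.
with open('stopwords.txt', 'r') as f1, open('stopwords2.txt', 'w') as f2:
    for line in f1:
        f2.write(line.replace('\n', ' '))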

Edit: I also see two other problems.

One is this line:

content = [story.translate(string.maketrans("",""), "_.0123456789")]

The story variable does not exist at this indentation level, so please clarify what you intended.
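
If the intent was to strip those characters from every story, a hedged guess at the fix would be a list comprehension at the same level (this assumes content is still a list of strings at that point):

import string

# Assumed fix: delete underscores, periods and digits from each story.
# In Python 2, str.translate takes an optional deletechars argument.
content = [story.translate(string.maketrans("", ""), "_.0123456789")
           for story in content]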

The other problem is that stop_words can be a string, a list, or None. For a string, the only supported value is 'english'. In your case, however, you are passing a file handle:

stopfile = open('stopwords2.txt')
#...
cv = CountVectorizer(stop_words = stopfile , min_df=1)

What you should do is turn all the words in stopfile into a list of strings. Replace this:

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
f2 = open('stopwords2.txt', 'w')
for line in f1:
    f2.write(line.replace('\n', ' '))
    f1.close()
    f2.close()

stopfile = open('stopwords2.txt')

print "Examples of stopwords: "
print stopfile.read()

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stopfile , min_df=1)

With this:

#Prepare a list of stopwords
f1 = open('stopwords.txt', 'r')
stoplist = []
for line in f1:
    nextlist = line.replace('\n', ' ').split()
    stoplist.extend(nextlist)
f1.close()

print "Examples of stopwords: "
print stoplist


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(stop_words = stoplist, min_df=1)
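
Once fit_transform succeeds, a quick sanity check confirms the vocabulary is no longer empty (this assumes the cv and content from above):

stories_tdm = cv.fit_transform(content).toarray()

# If everything worked, the vocabulary is non-empty and the matrix has
# one row per document and one column per learned term.
print "Vocabulary size: %d" % len(cv.get_feature_names())
print "Term-document matrix shape: %s x %s" % stories_tdm.shape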