Question

我想从句子中删除停用词。我用的是这段代码：

# Split the sentence into whitespace-separated tokens.
splitted = text.split()

# NOTE(review): deleting from `splitted` while enumerate() is iterating over it
# shifts the remaining items one position to the left, so the token that
# directly follows a deleted one is never examined -- the second of two
# adjacent stop words therefore survives.
for index, word in enumerate(splitted):
    if word in self.stopWords:
        del splitted[index]

# Re-assemble the remaining tokens into a single space-separated string.
text = " ".join(splitted)

我用这条语句 self.stopWords.update(['.', ',', "\"", "\'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '),', '],', '},', '",', "',", '")', '"]', '"}', "-", "--", '\".', "\'.", '/', ').', '-', '--', '%', '°\'', '(-', '("', '."', '.),', ');', '–', '$', 'a']) 更新了 stopWords，但是其中一些，例如字母 'a'，以及 '.' 或 ';'，并没有从句子中删除。

我该怎么办？

Answer 1

我认为使用列表推导式（或者像我这里用的生成器表达式）更容易：

# Keep only the whitespace-separated tokens of `text` that are not in
# `stop_words`, then join the survivors with single spaces.
' '.join(w for w in text.split() if w not in stop_words)

Answer 2

你能试试我的代码吗？如果您对代码有任何疑问，请向我询问。

def splitFile(lines, splitvalue):
    """Group consecutive lines into documents terminated by a marker.

    Lines are accumulated until one of them contains ``splitvalue``; the
    accumulated text (including the line holding the marker) is then stored
    as one document and accumulation starts over.

    Args:
        lines: Iterable of strings, e.g. the result of ``file.readlines()``.
        splitvalue: Substring that marks the last line of a document.

    Returns:
        A dict mapping ``"documents1"``, ``"documents2"``, ... (numbered in
        order of appearance) to the concatenated text of each document.
        Lines that follow the last marker are not part of any document and
        are dropped.
    """
    documents = {}
    pending = []  # lines of the document currently being collected
    for line in lines:
        pending.append(line)
        if splitvalue in line:
            # Keys are numbered from 1, so the next index is len + 1.
            key = "documents" + str(len(documents) + 1)
            # Join once per document instead of repeated string `+=`.
            documents[key] = "".join(pending)
            pending = []
    return documents

# Read the corpus once; both the word list and the per-document split are
# derived from the same lines instead of opening the file twice.
with open('reuter10.txt', 'r') as f:
    lines = f.readlines()

# Every whitespace-separated token of the corpus, in file order.
documentswords = [word for line in lines for word in line.split()]

# Every whitespace-separated token of the stop-word file.
with open('stopwords.txt', 'r') as f:
    stopwords = [word for line in f for word in line.split()]

alldocuments = splitFile(lines, "</reuters>")

# A set gives constant-time membership tests, replacing the nested index scan.
# It also fixes the empty stop-word list case: the old inner loop never ran
# then, so no word was ever kept.
stopword_set = set(stopwords)
temp = [word for word in documentswords if word not in stopword_set]

print("")
print("*****PRINTING WORDS WITHOUT STOPWORDS*****")
print("")
for word in temp:  # printing words without stopwords
    print(word)

从句子

2 个答案: