我想从句子中删除停用词。我写了下面这段代码:
# Break the sentence into whitespace-separated tokens.
splitted = text.split()
for index, word in enumerate(splitted):
    if word in self.stopWords:
        # NOTE(review): deleting from `splitted` while enumerating it shifts
        # the remaining items one position left, so the token that follows
        # each deleted one is never examined by this loop.
        del splitted[index]
# Rebuild the sentence from whatever tokens are left.
text = " ".join(splitted)
我用下面这条语句更新了 stopWords:self.stopWords.update(['.', ',', "\"", "\'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '),', '],', '},', '",', "',", '")', '"]', '"}', "-", "--", '\".', "\'.", '/', ').', '-', '--', '%', '°\'', '(-', '("', '."', '.),', ');', '–', '$', 'a'])
但是其中一些停用词,例如字母 'a',以及 '.' 或 ';',并没有从句子中删除。
我该怎么办?
答案 0 :(得分:1)
我认为使用列表推导式(或者像我这里用的生成器表达式)会更容易:
# Keep only the whitespace-separated tokens of `text` that are not in
# `stop_words`, and join the survivors back together with single spaces.
' '.join(w for w in text.split() if w not in stop_words)
答案 1 :(得分:-1)
你能试试我的代码吗?如果您对代码有任何疑问,请向我询问。
def splitFile(lines, splitvalue):
    """Group consecutive lines into documents delimited by a marker.

    Lines are accumulated until one contains ``splitvalue``; that line closes
    the current document, which is stored under the key ``"documents<N>"``
    with N counting up from 1.

    Args:
        lines: Iterable of text lines; each line is kept exactly as given.
        splitvalue: Substring that marks the last line of a document.

    Returns:
        Dict mapping ``"documents1"``, ``"documents2"``, ... to the
        concatenated text of each document. Lines that come after the final
        marker do not belong to any document and are not returned.
    """
    documents = {}
    pending = []  # lines of the document currently being assembled
    for line in lines:
        pending.append(line)
        if splitvalue in line:
            # The marker line closes the document; number keys from 1.
            key = "documents" + str(len(documents) + 1)
            documents[key] = "".join(pending)
            pending = []
    return documents
# Load the corpus once: the raw lines feed splitFile(), and the
# whitespace-separated tokens of those same lines are the words to filter.
with open('reuter10.txt', 'r') as f:
    lines = f.readlines()
documentswords = [word for line in lines for word in line.split()]

# Load the stop-word list as whitespace-separated tokens.
with open('stopwords.txt', 'r') as f:
    stopwords = [word for line in f for word in line.split()]

# Split the corpus into documents, each ending at a line that contains
# "</reuters>".
alldocuments = splitFile(lines, "</reuters>")

# A set makes each membership test constant-time instead of a scan over the
# whole stop-word list, and an empty stop-word list correctly keeps every word.
stopword_set = set(stopwords)
temp = [word for word in documentswords if word not in stopword_set]

print("")
print("*****PRINTING WORDS WITHOUT STOPWORDS*****")
print("")
for word in temp:  # printing words without stopwords
    print(word)