import re, codecs
import string
import sys
# Print the fifth tab-separated column of every corpus line when that value
# is a stop word.
#
# The stop words are read into a set up front. Testing `token in <file object>`
# only iterates the file's remaining raw lines (each still carrying its line
# ending) and leaves the handle exhausted after the first test, so a bare
# token would never match.
with codecs.open('stopwords_harkat1.txt', 'r', 'utf_8') as stopword_file:
    # assumes one stop word per line -- TODO confirm against the data file
    stopwords = set(entry.strip() for entry in stopword_file)

with codecs.open('Corpus_v2.txt', 'r', 'utf_8') as lines:
    for line in lines:
        # Trim surrounding whitespace before splitting into columns.
        tokens = line.strip().split('\t')
        # Rows with fewer than five columns (e.g. blank lines) have no
        # token to test; skip them instead of raising IndexError.
        if len(tokens) < 5:
            continue
        token = tokens[4]
        if token in stopwords:
            # Parenthesised single-argument form works on Python 2 and 3.
            print(token)
这段代码运行时不报错,但无法匹配来自两个不同文件的字符串。有人能帮帮我吗?
我也试过用 match 方法进行匹配,但同样不起作用。
答案 0 :(得分:0)
您需要把文件的内容读取进来,而不仅仅是打开文件。
替换以下行:
stopwords = codecs.open('stopwords_harkat1.txt','r','utf_8')
替换为:
# Read the stop-word file into a set so membership tests compare against
# stripped words rather than against the open file handle.
with codecs.open('stopwords_harkat1.txt', 'r', 'utf_8') as f:
    # Assuming one stop word per line.
    stopwords = set(line.strip() for line in f)
    # Otherwise (several whitespace-separated words per line), use:
    # stopwords = set(word for line in f for word in line.split())