在Python中使用jieba删除停用词

时间:2018-07-27 07:35:40

标签: python stop-words

运行以下代码时遇到错误。
我想删除停用词,但是不起作用!

def cut_txt(old_file):
from string import punctuation
import jieba
jieba.load_userdict("user_dictionary.csv") 

stopwords = [line.strip().decode('utf-8') for line in open('stop_words.txt').readlines() ]

global cut_file     # 分词之后保存的文件名
cut_file = old_file + '_cut.txt'

try:
    fi = open(old_file, 'r', encoding='utf-8')
except BaseException as e:  # 因BaseException是所有错误的基类,用它可以获得所有错误类型
    print(Exception, ":", e)    # 追踪错误详细信息

text = fi.read()  # 获取文本内容
new_text = jieba.cut(text, cut_all=False)  # 精确模式
str_out = ' '.join(new_text).replace(',', '').replace('。', '').replace('?', '').replace('!', '') \
    .replace('“', '').replace('”', '').replace(':', '').replace('…', '').replace('(', '').replace(')', '') \
    .replace('—', '').replace('《', '').replace('》', '').replace('、', '').replace('‘', '') \
    .replace('’', '').replace(',', '').replace('【', '').replace('】', '').replace('"', '')\
    .replace('#','').replace('...','').replace('?','').replace('『','')                                     # 去掉标点符号

#去除停用词
final = ''
for seg in str_out:
    seg = seg.encode('gbk')
    if seg not in stopwords:
           final += seg      
fo = open(cut_file, 'w', encoding='utf-8')
fo.write(final)

cut_txt('你的天空')

0 个答案:

没有答案