我正在尝试在python中清理文本文件。我想取出停用词,数字和新行字符。但我一直在强制使用Unicode python文本。这是我的代码:
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from string import digits
def cleanupDoc(s):
s = s.translate(None,digits)
s = s.rstrip('\n')
stopset = set(stopwords.words('english'))
tokens = nltk.word_tokenize(s)
cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
return cleanup
flist=glob.glob('/home/uiucinfo/Desktop/*txt')
mylist=[]
for fname in flist:
tfile = open(fname, 'r+')
line = tfile.readlines()
#line = cleanupDoc(line)
mylist.append(line)
for fdoc in mylist:
doc = open(fdoc)
newDoc = cleanupDoc(doc)
doc.close()
我的错误
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
TypeError: coercing to Unicode: need string or buffer, list found
答案 0 :(得分:1)
tfile.readlines()
为您提供了一个行列表,您将其附加到另一个列表中:
for fname in flist:
tfile = open(fname, 'r+')
line = tfile.readlines()
mylist.append(line)
结果,您在mylist
中有一个列表列表。
以下应解决问题:
for fname in flist:
tfile = open(fname, 'r+')
line = tfile.readlines()
mylist += line
这将为您提供mylist
中的字符串列表。
答案 1 :(得分:0)
import nltk
form nltk import word_tokenize
from nltk.corpus import stopwords
#nltk.download()
import string
from string import digits
import glob
import re
def cleanupDoc(s):
#s = s.translate(None,digits)
#s = s.rstrip('\n')
stopset = set(stopwords.words('english'))
tokens = nltk.word_tokenize(s)
cleanup = " ".join(filter(lambda word: word not in stopset, s.split()))
return cleanup
flist=glob.glob('/home/uiucinfo/Desktop/*txt')
mylist=[]
for fname in flist:
tfile = open(fname, 'r+')
line = tfile.readlines()
#line = cleanupDoc(line)
mylist.append(line)
for fdoc in mylist:
# remove \n or digit from fdoc
fdoc = [re.sub(r'[\"\n]|\d', '', x) for x in fdoc]
# convert list to string
fdoc = ''.join(fdoc)
print fdoc
newDoc = cleanupDoc(fdoc)
print " newDoc: " , newDoc