我正在尝试在Python中清理文本文件:去掉停用词、数字和换行符。但我一直遇到 "coercing to Unicode" 的类型错误。这是我的代码:
# Standard library
import glob  # needed by glob.glob() below -- missing in the original, caused NameError
import string
from string import digits

# Third-party: NLTK supplies the tokenizer and the English stop-word list.
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
def cleanupDoc(s):
    """Strip digits, trailing newlines and English stop words from *s*.

    Parameters: s -- the document text as a single string.
    Returns the cleaned text, with surviving words re-joined by single spaces.
    """
    # Remove digit characters portably. The original used
    # s.translate(None, digits), which is the Python-2 byte-string signature
    # and raises TypeError on unicode input -- the family of error the
    # question reports.
    s = ''.join(ch for ch in s if ch not in digits)
    s = s.rstrip('\n')
    stopset = set(stopwords.words('english'))
    # NOTE: the original also called nltk.word_tokenize(s) but never used
    # the result; the filter below operates on a plain whitespace split.
    return " ".join(word for word in s.split() if word not in stopset)
import glob  # local import so this snippet is runnable on its own

# Collect every .txt file on the Desktop.
flist = glob.glob('/home/uiucinfo/Desktop/*txt')

# Read each file into ONE string per file. The original stored
# tfile.readlines() (a list) and later called open() on that list,
# which is exactly the "coercing to Unicode: ... list found" TypeError.
mylist = []
for fname in flist:
    with open(fname, 'r') as tfile:  # 'with' guarantees the handle is closed
        mylist.append(tfile.read())

# Clean each document string directly -- no second open() needed.
for fdoc in mylist:
    newDoc = cleanupDoc(fdoc)
我的错误
Traceback (most recent call last):
File "<stdin>", line 3, in <module>
TypeError: coercing to Unicode: need string or buffer, list found
答案 0 :(得分:1)
tfile.readlines()
为您提供了一个行列表,您将其附加到另一个列表中:
# NOTE(review): this snippet intentionally reproduces the question's bug.
# readlines() returns a LIST of lines, and append() stores that whole list
# as a single element -- so mylist becomes a list of lists.
# (The loop body's indentation was lost in the page extraction; the open()
# call also never closes its file handle.)
for fname in flist:
tfile = open(fname, 'r+')
line = tfile.readlines()
mylist.append(line)
结果,您在 mylist 中得到的是一个列表的列表(每个元素本身是一个行列表)。
以下应解决问题:
# Flatten all files' lines into one list of strings: += extends mylist
# with each line individually instead of appending the whole list.
for fname in flist:
    with open(fname, 'r+') as tfile:  # context manager closes the handle
        mylist += tfile.readlines()
这将为您提供 mylist 中的一个扁平的字符串列表(每个元素是一行)。
答案 1 :(得分:0)
# Standard library
import glob
import re
import string
from string import digits

# Third-party: NLTK. Run nltk.download() once to fetch the stopwords corpus.
import nltk
from nltk import word_tokenize  # typo fixed: the original read "form nltk import ..."
from nltk.corpus import stopwords
def cleanupDoc(s):
    """Remove English stop words from *s* and re-join with single spaces.

    Digit/newline stripping is assumed to have been done by the caller
    (the re.sub() pass in the loop further down handles it).
    """
    stopset = set(stopwords.words('english'))
    # The original also called nltk.word_tokenize(s) but discarded the
    # result; the actual filtering works on a plain whitespace split.
    return " ".join(word for word in s.split() if word not in stopset)
# Gather the .txt files and read each one as a list of raw lines.
flist = glob.glob('/home/uiucinfo/Desktop/*txt')

mylist = []
for fname in flist:
    with open(fname, 'r') as tfile:  # close the handle deterministically
        mylist.append(tfile.readlines())

for fdoc in mylist:
    # Drop double quotes, newlines and digits from every line.
    cleaned_lines = [re.sub(r'[\"\n]|\d', '', x) for x in fdoc]
    # Re-assemble the lines into a single document string
    # (bind a new name instead of rebinding the loop variable fdoc).
    doc_text = ''.join(cleaned_lines)
    print(doc_text)  # print() call form: valid on both Python 2 and 3
    newDoc = cleanupDoc(doc_text)
    print(" newDoc:  " + newDoc)