I am trying to clean text data from text files and I'm running into this error: TypeError: expected string or bytes-like object.
```python
filenames = os.listdir("/input")
raw_files = []
for filename in filenames:
    with open('/input') as myfile:
        raw_files.append(myfile.read().split())

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
global stopwords
import gensim
import re

stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")

def clean_sentences(text):
    tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)
    sent_list = []
    for sent in tokens:
        sent_str = ''
        for i, word in enumerate(nltk.word_tokenize(sent)):
            # nltk doesn't handle apostrophes correctly
            if word[0] == "'":
                sent_str = sent_str[:-1]
            # only adds words and digits
            if re.sub('[a-zA-Z0-9]', "", str(word)):
                sent_str += str(word.lower() + ' ')
        sent_list.append(sent_str.strip()).apply(str)
    return str(sent_list)

# takes list of clean sentences and converts to list of tokens
def tokens_only(text):
    tokens = []
    for sentence in text:
        tokens.extend(sentence.split(" "))
    return tokens

# takes in text, cleans it, and returns lemma only
def lemma_tokens(text):
    import gensim
    tokens = tokens_only(str(clean_sentences(text)))
    return [stemmer.stem(token) for token in tokens]

all_lemma = []
all_tokens = []
all_sentences = []
all_sentences_label = []
for i, doc in enumerate(raw_files):
    # clean sentences
    tmp_list = str(clean_sentences(doc))
    all_sentences.extend(tmp_list)
    for j in range(len(tmp_list)):
        all_sentences_label.append(filenames[i])
    # convert list of clean sentences to tokens
    tmp_list = tokens_only(tmp_list)
    all_tokens.extend(tmp_list)
    # gets root word for tokens in document
    all_lemma.extend(lemma_tokens(doc))
```
I get the errors below:

```
Traceback (most recent call last):
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\exception.py" in inner
34. response = get_response(request)
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
115. response = self.process_exception_by_middleware(e, request)
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
113. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "C:\Users\User\waqaf\waqaf\views.py" in output4
572. tmp_list= str(clean_sentences(doc))
File "C:\Users\User\waqaf\waqaf\views.py" in clean_sentences
531. tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py" in sent_tokenize
106. return tokenizer.tokenize(text)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in tokenize
1277. return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in sentences_from_text
1331. return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in <listcomp>
1331. return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in span_tokenize
1321. for sl in slices:
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _realign_boundaries
1362. for sl1, sl2 in _pair_iter(slices):
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _pair_iter
318. prev = next(it)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _slices_from_text
1335. for match in self._lang_vars.period_context_re().finditer(text):
Exception Type: TypeError at /output4
Exception Value: expected string or bytes-like object
```
I have looked at many similar posts, but none of them helped me solve the problem. I have already tried str() and apply(str), but it still doesn't work; I keep getting the same error.
Answer 0 (score: 1)
What ultimately gets passed to sent_tokenize is an item of raw_files, i.e. the output of myfile.read().split(), which is a list of strings. But sent_tokenize expects a single string. I suggest omitting .split().
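A minimal sketch of the adjusted reading loop with that change (assuming the intent is to read every file listed under /input as one raw string; joining the directory with each filename is my assumption, since the original opens '/input' directly):

```python
import os

filenames = os.listdir("/input")
raw_files = []
for filename in filenames:
    # Read the whole file as a single string; without .split(),
    # nltk.sent_tokenize later receives a str instead of a list.
    with open(os.path.join("/input", filename)) as myfile:  # path join is assumed intent
        raw_files.append(myfile.read())
```

With raw_files holding plain strings, clean_sentences(doc) then passes a str to nltk.sent_tokenize, which is the type it expects.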