I am trying to clean text data from text files and I'm running into this error: TypeError: expected string or bytes-like object.
```python
filenames = os.listdir("/input")
raw_files = []
for filename in filenames:
    with open('/input') as myfile:
        raw_files.append(myfile.read().split())

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
global stopwords
import gensim
import re

stopwords = stopwords.words('english')
stemmer = SnowballStemmer("english")

def clean_sentences(text):
    tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)
    sent_list = []
    for sent in tokens:
        sent_str = ''
        for i, word in enumerate(nltk.word_tokenize(sent)):
            # nltk doesn't handle apostrophes correctly
            if word[0] == "'":
                sent_str = sent_str[:-1]
            # only adds words and digits
            if re.sub('[a-zA-Z0-9]', "", str(word)):
                sent_str += str(word.lower() + ' ')
        sent_list.append(sent_str.strip()).apply(str)
    return str(sent_list)

# takes list of clean sentences and converts to list of tokens
def tokens_only(text):
    tokens = []
    for sentence in text:
        tokens.extend(sentence.split(" "))
    return tokens

# takes in text, cleans it, and returns lemma only
def lemma_tokens(text):
    import gensim
    tokens = tokens_only(str(clean_sentences(text)))
    return [stemmer.stem(token) for token in tokens]

all_lemma = []
all_tokens = []
all_sentences = []
all_sentences_label = []
for i, doc in enumerate(raw_files):
    # clean sentences
    tmp_list = str(clean_sentences(doc))
    all_sentences.extend(tmp_list)
    for j in range(len(tmp_list)):
        all_sentences_label.append(filenames[i])
    # convert list of clean sentences to tokens
    tmp_list = tokens_only(tmp_list)
    all_tokens.extend(tmp_list)
    # gets root word for tokens in document
    all_lemma.extend(lemma_tokens(doc))
```
I get the errors below:

```
Traceback (most recent call last):
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\exception.py" in inner
34. response = get_response(request)
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
115. response = self.process_exception_by_middleware(e, request)
File "C:\Users\User\Anaconda3\lib\site-packages\django\core\handlers\base.py" in _get_response
113. response = wrapped_callback(request, *callback_args, **callback_kwargs)
File "C:\Users\User\waqaf\waqaf\views.py" in output4
572. tmp_list= str(clean_sentences(doc))
File "C:\Users\User\waqaf\waqaf\views.py" in clean_sentences
531. tokens = [sent for sent in nltk.sent_tokenize(text)].apply(str)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\__init__.py" in sent_tokenize
106. return tokenizer.tokenize(text)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in tokenize
1277. return list(self.sentences_from_text(text, realign_boundaries))
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in sentences_from_text
1331. return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in <listcomp>
1331. return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in span_tokenize
1321. for sl in slices:
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _realign_boundaries
1362. for sl1, sl2 in _pair_iter(slices):
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _pair_iter
318. prev = next(it)
File "C:\Users\User\Anaconda3\lib\site-packages\nltk\tokenize\punkt.py" in _slices_from_text
1335. for match in self._lang_vars.period_context_re().finditer(text):
Exception Type: TypeError at /output4
Exception Value: expected string or bytes-like object
```
I have looked at many similar posts, but none of them helped me solve the problem. I have already tried str() and apply(str), but it still doesn't work; I keep getting the same error.
Answer 0 (score: 1)
What ultimately gets passed to sent_tokenize is an item of raw_files, i.e. the output of myfile.read().split(), which is a list of strings. But sent_tokenize expects a single string. I suggest omitting .split().
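A minimal sketch of the adjusted reading loop with that change (assuming the intent is to read every file listed under /input as one raw string; joining the directory with each filename is my assumption, since the original opens '/input' directly):

```python
import os

filenames = os.listdir("/input")
raw_files = []
for filename in filenames:
    # Read the whole file as a single string; without .split(),
    # nltk.sent_tokenize later receives a str instead of a list.
    with open(os.path.join("/input", filename)) as myfile:  # path join is assumed intent
        raw_files.append(myfile.read())
```

With raw_files holding plain strings, clean_sentences(doc) then passes a str to nltk.sent_tokenize, which is the type it expects.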