通过python将长文本拆分为较小的样本

时间:2018-02-28 12:42:18

标签: python python-3.x text-chunking

我已经尝试了https://de.dariah.eu/tatom/preprocessing.html中的示例代码,将58个文本文件分块到分隔的n号分块文本文件中。代码如下。但最后我在输出目录中得到一个.txt0001而不是chunked文件。请问有谁请告诉我哪里出错了?

import os
import numpy as np
# speeches are in the directory corpus/CorpusTopicModeling
# gather all the filenames, sorted alphabetically
corpus_path = os.path.join('c:/','corpus', 'CorpusTopicModeling')
# look at the first few filenames
# (we are sorting because different operating systems may list files in different orders)
sorted(os.listdir(path=corpus_path))[0:5]
speeches_filenames = [os.path.join(corpus_path, fn) for fn in sorted(os.listdir(corpus_path))]
#One way to split a text is to read through it and create a chunk every n words
#where n is a number such as 500, 1,000 or 10,000
def split_text(filename, n_words):
#Split a text into chunks approximately `n_words` words in length."""
    input = open(filename, 'r')
    words = input.read().split(' ')
    input.close()
    chunks = []
    current_chunk_words = []
    current_chunk_word_count = 0
    for word in words:
        current_chunk_words.append(word)
        current_chunk_word_count += 1
        if current_chunk_word_count == n_words:
            chunks.append(' '.join(current_chunk_words))
            current_chunk_words = []
            current_chunk_word_count = 0
            chunks.append(' '.join(current_chunk_words) )
            return chunks
speeches_filenames = [os.path.join(corpus_path, fn) for fn in sorted(os.listdir(corpus_path))]
#for consistency across platforms (Linux, OS X, Windows) we must sort the filenames  
speeches_filenames.sort()
chunk_length = 1000
chunks = []
for filename in speeches_filenames: 
         chunk_counter = 0
         texts = split_text(filename, chunk_length)
         if texts is not None: #You need to add check for None prior iteration
             for text in texts:
                 chunk = {'text': text, 'number': chunk_counter, 'filename': filename}
                 chunks.append(chunk)
                 chunk_counter += 1

# we started with this many files
len(speeches_filenames)
# ... and now we have this many
len(chunks)
# from the triples we can create a document-term matrix
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=5, max_df=.95)
dtm = vectorizer.fit_transform([c['text'] for c in chunks])
vocab = np.array(vectorizer.get_feature_names())
#These chunks may be saved in a directory for reference or for analysis in another program
output_dir = 'C:/corpus/ChunkedOutput'
for chunk in chunks:
    basename = os.path.basename(chunk['filename'])
    fn = os.path.join(output_dir,
                          "{}{:04d}".format(basename, chunk['number']))
with open(fn, 'w') as f:
    f.write(chunk['text'])

0 个答案:

没有答案