读取子文件夹中的文件

时间:2017-08-01 01:22:31

标签: python gensim word2vec

嗨,我的 bbc 文件夹结构如下:Files present in parent folder

bbc文件夹中的每个子文件夹都包含文本文件 Text files in sub folders

下面的代码可以访问单个文件夹中的文件:
class MySentences(object):
    """Re-iterable stream of tokenised lines from the files in one directory.

    Every iteration lists ``dirname`` again and yields each line of each
    entry as a list of whitespace-separated tokens. Entries are opened
    directly, so a sub-directory inside ``dirname`` makes iteration fail
    with an IOError/OSError.
    """

    def __init__(self, dirname):
        # Only the path is stored; nothing is read until iteration starts.
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # ``with`` closes each file as soon as its lines are consumed,
            # instead of leaving the handle open until garbage collection.
            with open(os.path.join(self.dirname, fname)) as handle:
                for line in handle:
                    yield line.split()


sentences = MySentences('C:/Users/JAYASHREE/Documents/NLP/bbc-fulltext/bbc/business')

但是我想访问每个子文件夹中的文件。当我把路径改为上一级的 bbc 文件夹时,会出现以下错误:

sentences = MySentences('C:/Users/JAYASHREE/Documents/NLP/bbc-fulltext/bbc')
IOError                                   Traceback (most recent call last)
<ipython-input-29-26fb31de4fec> in <module>()
      1 sentences = MySentences('C:/Users/JAYASHREE/Documents/NLP/bbc-fulltext/bbc') # a memory-friendly iterator
----> 2 model = gensim.models.Word2Vec(sentences)

C:\Users\JAYASHREE\Anaconda2\lib\site-packages\gensim-2.3.0-py2.7-win-amd64.egg\gensim\models\word2vec.pyc in __init__(self, sentences, size, alpha, window, min_count, max_vocab_size, sample, seed, workers, min_alpha, sg, hs, negative, cbow_mean, hashfxn, iter, null_word, trim_rule, sorted_vocab, batch_words, compute_loss)
    501             if isinstance(sentences, GeneratorType):
    502                 raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.")
--> 503             self.build_vocab(sentences, trim_rule=trim_rule)
    504             self.train(sentences, total_examples=self.corpus_count, epochs=self.iter,
    505                        start_alpha=self.alpha, end_alpha=self.min_alpha)

C:\Users\JAYASHREE\Anaconda2\lib\site-packages\gensim-2.3.0-py2.7-win-amd64.egg\gensim\models\word2vec.pyc in build_vocab(self, sentences, keep_raw_vocab, trim_rule, progress_per, update)
    575 
    576         """
--> 577         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
    578         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
    579         self.finalize_vocab(update=update)  # build tables & arrays

C:\Users\JAYASHREE\Anaconda2\lib\site-packages\gensim-2.3.0-py2.7-win-amd64.egg\gensim\models\word2vec.pyc in scan_vocab(self, sentences, progress_per, trim_rule)
    587         vocab = defaultdict(int)
    588         checked_string_types = 0
--> 589         for sentence_no, sentence in enumerate(sentences):
    590             if not checked_string_types:
    591                 if isinstance(sentence, string_types):

<ipython-input-28-48533b12127a> in __iter__(self)
      5     def __iter__(self):
      6         for fname in os.listdir(self.dirname):
----> 7             for line in open(os.path.join(self.dirname, fname)):
      8                 yield line.split()

IOError: [Errno 13] Permission denied: 'C:/Users/JAYASHREE/Documents/NLP/bbc-fulltext/bbc\\business'

请告诉我应该如何修改代码。

1 个答案:

答案 0 :(得分:1)

在您的示例中,您似乎触发了Permission Denied IOError,因为您尝试直接在文件夹上调用open()。

此外,os.listdir() 还可能返回系统文件(如 NTUSER.DAT)的名称,而这些文件不一定能够被成功打开。

这是一个递归遍历目录树的示例,从我们可以成功打开的任何文件中打印行:

import os

def print_files_in_dir(dirname):
    """Recursively print every line of every readable file under ``dirname``.

    Best-effort: directories that cannot be listed and files that cannot be
    opened, read or printed are skipped silently.

    Args:
        dirname: Path of the directory to scan.
    """
    try:
        names = os.listdir(dirname)
    except OSError:
        # Missing or unreadable directory: nothing to print.
        return

    for fname in names:
        path = os.path.join(dirname, fname)
        if os.path.isfile(path):
            try:
                # ``with`` guarantees the handle is closed even if reading fails.
                with open(path, 'r') as handle:
                    for line in handle:
                        print(line)
            except (IOError, OSError, UnicodeError):
                # Locked, unreadable or non-text files are skipped on purpose;
                # unrelated errors (e.g. KeyboardInterrupt) still propagate.
                pass
        elif os.path.isdir(path):
            print_files_in_dir(path)  # recurse

# Example root directory; replace it with the folder you want to scan.
dirname = 'C:/Users/MackayA'
print_files_in_dir(dirname)

或者,更简单地说,你可以使用像COLDSPEED建议的os.walk():

import os

dirname = 'C:/Users/MackayA'

# os.walk visits dirname and every directory below it, so no manual
# recursion is needed.
for root, dirs, files in os.walk(dirname):
    for fname in files:
        path = os.path.join(root, fname)
        try:
            # ``with`` closes the file even if reading or printing fails.
            with open(path, 'r') as handle:
                for line in handle:
                    print(line)
        except (IOError, OSError, UnicodeError):
            # Best-effort: skip files that cannot be opened, decoded or
            # printed, but let unrelated errors (e.g. KeyboardInterrupt) through.
            pass

希望这很有用