Question

我正在尝试制作一个脚本，该脚本从一个大文件中随机提取 500 行，然后将它们写入不同的文件。我希望能够这样做 n 次，这样我最终会得到很多小文件。

问题是，我制作的脚本只将数据写入它创建的第一个文件。它也创建所有其他较小的文件，但它实际上并不将数据写入这些文件。他们只是空的。我错过了什么？

import random

def main():
    lines_in_file = file_len('data.json')
    file_size = 500
    iterations = 10
    with open('data.json') as data_file: # open the big data file
        for iteration_count in range(iterations): # for x in range(how many files I want to make)
            picked_numbers = [] # what line numbers have been picked, to avoid duplicates
            smallfile = None # the small file written to disk
            for _ in range(file_size):
                # for x in range of how many lines I want to appear in the file
                line_num = random.randint(0, lines_in_file-1) # the line that was picked at random
                while line_num in picked_numbers: # if the line exists in the file, find another one
                    line_num = random.randint(0, lines_in_file-1)
                picked_numbers.append(line_num) # add the number to the file so we can look it up later
            small_filename = 'data_{}.json'.format(iteration_count) # create the new small file to write to
            smallfile = open(small_filename, "w")
            for position, line in enumerate(data_file): # enumerate the big data file to find all the lines we have in picked_numbers
                if position in picked_numbers:
                    smallfile.write(line)
            smallfile.close()

def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

if __name__ == '__main__':
    main()

Answer 1

每次迭代结束时都缺少 data_file.seek(0)

import random

def main():
    lines_in_file = file_len('data.json')
    file_size = 500
    iterations = 10
    with open('data.json') as data_file: # open the big data file
        for iteration_count in range(iterations): # for x in range(how many files I want to make)
            picked_numbers = [] # what line numbers have been picked, to avoid duplicates
            smallfile = None # the small file written to disk
            for _ in range(file_size):
                # for x in range of how many lines I want to appear in the file
                line_num = random.randint(0, lines_in_file-1) # the line that was picked at random
                while line_num in picked_numbers: # if the line exists in the file, find another one
                    line_num = random.randint(0, lines_in_file-1)
                picked_numbers.append(line_num) # add the number to the file so we can look it up later
            small_filename = 'data_{}.json'.format(iteration_count) # create the new small file to write to
            smallfile = open(small_filename, "w")
            for position, line in enumerate(data_file): # enumerate the big data file to find all the lines we have in picked_numbers
                if position in picked_numbers:
                    smallfile.write(line)
            smallfile.close()
            data_file.seek(0)

def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

if __name__ == '__main__':
    main()

我通常不会“窃取”答案，但我认为将来来的人应该直接知道该怎么做。如果 Nobby Noobs 想添加一个答案，你可以接受他的，但我认为我们应该保持堆栈溢出的正确结构

脚本只写入它创建的第一个文件？

1 个答案: