我正在尝试制作一个脚本,该脚本从一个大文件中随机提取 500 行,然后将它们写入不同的文件。我希望能够这样做 n 次,这样我最终会得到很多小文件。
问题是,我制作的脚本只将数据写入它创建的第一个文件。它也创建所有其他较小的文件,但它实际上并不将数据写入这些文件。他们只是空的。我错过了什么?
import random
def main():
lines_in_file = file_len('data.json')
file_size = 500
iterations = 10
with open('data.json') as data_file: # open the big data file
for iteration_count in range(iterations): # for x in range(how many files I want to make)
picked_numbers = [] # what line numbers have been picked, to avoid duplicates
smallfile = None # the small file written to disk
for _ in range(file_size):
# for x in range of how many lines I want to appear in the file
line_num = random.randint(0, lines_in_file-1) # the line that was picked at random
while line_num in picked_numbers: # if the line exists in the file, find another one
line_num = random.randint(0, lines_in_file-1)
picked_numbers.append(line_num) # add the number to the file so we can look it up later
small_filename = 'data_{}.json'.format(iteration_count) # create the new small file to write to
smallfile = open(small_filename, "w")
for position, line in enumerate(data_file): # enumerate the big data file to find all the lines we have in picked_numbers
if position in picked_numbers:
smallfile.write(line)
smallfile.close()
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
if __name__ == '__main__':
main()
答案 0 :(得分:1)
每次迭代结束时都缺少 data_file.seek(0)
import random
def main():
lines_in_file = file_len('data.json')
file_size = 500
iterations = 10
with open('data.json') as data_file: # open the big data file
for iteration_count in range(iterations): # for x in range(how many files I want to make)
picked_numbers = [] # what line numbers have been picked, to avoid duplicates
smallfile = None # the small file written to disk
for _ in range(file_size):
# for x in range of how many lines I want to appear in the file
line_num = random.randint(0, lines_in_file-1) # the line that was picked at random
while line_num in picked_numbers: # if the line exists in the file, find another one
line_num = random.randint(0, lines_in_file-1)
picked_numbers.append(line_num) # add the number to the file so we can look it up later
small_filename = 'data_{}.json'.format(iteration_count) # create the new small file to write to
smallfile = open(small_filename, "w")
for position, line in enumerate(data_file): # enumerate the big data file to find all the lines we have in picked_numbers
if position in picked_numbers:
smallfile.write(line)
smallfile.close()
data_file.seek(0)
def file_len(fname):
with open(fname) as f:
for i, l in enumerate(f):
pass
return i + 1
if __name__ == '__main__':
main()
我通常不会“窃取”答案,但我认为将来来的人应该直接知道该怎么做。如果 Nobby Noobs 想添加一个答案,你可以接受他的,但我认为我们应该保持堆栈溢出的正确结构