我想将多个.txt文件聚合到一个包含.txt文件所有信息的矩阵中。每个.txt文件的结构如下:
NO0010735129_DE # this is the filename
10px:0.0 # this is the font size ("px") used with their relative (percentage) share in document
8px:54.44
16px:0.43
2px:0.0
11px:2.35
7px:41.96
0px:0.0
6px:0.77
1px:0.01
3px:0.04
所需的输出(矩阵)应采用以下格式:
filename 1; fontsize1:XX.XX; fontsize2:XX.XX; fontsize3:XX.XX; /n
filename 2; fontsize1:XX.XX; fontsize2:XX.XX; fontsize3:XX.XX; /n
filename 3; fontsize1:XX.XX; fontsize2:XX.XX; fontsize3:XX.XX; /n
...
到目前为止我的尝试:
directory = 'C:/Sample'
min_file_size = 500
def list_textfiles(directory, min_file_size):
# Creates a list of all files stored in DIRECTORY ending on '.txt'
textfiles = []
for root, dirs, files in os.walk(directory):
for name in files:
filename = os.path.join(root, name)
if os.stat(filename).st_size > min_file_size:
textfiles.append(filename)
return textfiles
out_list = []
for i in list_textfiles(folder, minimum_size):
with open(i) as f:
for j in f:
out_list = []
split_to_out = j.rsplit(":",1)
temp = next(f, None)
while temp:
out_list.append(temp.split(":")[-1])
temp = next(f, None)
out_list = [i.strip() for i in out_list]
string = str(f) # inread filename
to_write = string[string.rfind('Sample')+7:string.rfind('.txt')] + ";" + ";".join(out_list) + "\n" # identify input filename from file
with open(outFile, 'a') as outfile:
outfile.write(str(to_write))
不幸的是,输出看起来只有这样:
filename (last file);value1;value2;value3