我正在使用下面的代码从大约 5000 个文件中读取第 2 列,然后输出矩阵(NxL)
N = number of files
L = number of lines (identical for all the files=600000 lines)
格式:
filename1 filename2 filename3 ... filename5000
file1.col2 file2.col2 file3.col2 ... file5000.col2
...
这是我正在使用的代码:
#!/usr/bin/env python
### module load python/2.7.8 ####
import os
import glob
import pandas as pd
import numpy as np
class Backpack(object):
    """Simple container that collects items in insertion order.

    Inherits from ``object`` so it is a new-style class on Python 2 as well
    as on Python 3.
    """

    def __init__(self):
        """Create an empty backpack."""
        self.inside = []

    def add(self, toadd):
        """Append the single item ``toadd``."""
        self.inside.append(toadd)

    def addmany(self, listtoadd):
        """Append every element of the iterable ``listtoadd``."""
        self.inside.extend(listtoadd)

    def __str__(self):
        """Return the items' ``str()`` forms joined by ``', '``."""
        return ', '.join(str(item) for item in self.inside)
# Module-level Backpack instance; not referenced by the rest of the visible code.
pack = Backpack()
def concatenate(path, outfile="matrix.dat"):
    """Join column 2 of every ``*.txt`` file in ``path`` into one matrix file.

    The output starts with a header line holding the input file names
    (directory and extension stripped) separated by single spaces. It is
    followed by one line per input row containing that row's second-column
    value from each file, written with ``%d`` formatting.

    Args:
        path: Directory searched (non-recursively) for ``*.txt`` files.
        outfile: File the matrix is appended to; defaults to ``matrix.dat``
            in the current working directory.

    Raises:
        ValueError: If ``path`` contains no ``*.txt`` files.
    """
    # glob returns files in arbitrary order; sort so the column order is
    # reproducible between runs.
    file_list = sorted(glob.glob(os.path.join(path, '*.txt')))
    if not file_list:
        # Fail before the output file is created instead of deep inside numpy.
        raise ValueError("no *.txt files found in %r" % (path,))

    names = []
    columns = []
    for filename in file_list:
        names.append(os.path.splitext(os.path.basename(filename))[0])
        # Input has no header row; keep only the second column.
        df = pd.read_table(filename, usecols=[1], header=None)
        # .values replaces DataFrame.as_matrix(), which newer pandas removed.
        columns.append(df.values)

    # NOTE: every column is held in memory before anything is written.
    matrix = np.column_stack(columns)

    # Build the header without the trailing space the join-with-'\n' produced.
    header = ' '.join(names) + '\n'
    if not isinstance(header, bytes):
        # Python 3: the file is opened in binary mode, so encode the text.
        header = header.encode('utf-8')

    # Append mode is kept from the original: re-running adds a new block to
    # an existing file rather than overwriting it.
    with open(outfile, 'ab') as f:
        f.write(header)
        np.savetxt(f, matrix, fmt='%d')
# Script entry: build matrix.dat from the *.txt files in the current directory.
concatenate('.')
如何改进此代码?