使用numpy loadtxt加载多个文件很慢

时间:2014-10-13 19:17:00

标签: python performance numpy io

我正在使用numpy.loadtxt()读取某个目录下的一系列文件,并把每个文件的内容读入两个数组。每个文件都是一个两列的csv文件,各文件的行数不同。

我注意到代码非常慢,98%的时间都花在了读取文本文件的这一行上: [yout1,yout2]=numpy.loadtxt('training_set/mod_data/'+fnamelst[riter],delimiter=',',usecols=(0,1), dtype=int,unpack=True)

我已经使用line_profiler对代码进行了分析,并附加了输出。

有没有办法让这一步更快?

感谢。

import numpy as numpy
import scipy.io
from math import log
from scipy.sparse import coo_matrix
import time


# Randomly sample data files against file 0 and record, for every sampled
# pair, the two file indices and the inner product of their sparse columns.

DATA_DIR = 'training_set/mod_data/'


def _load_sparse_column(fname, n_rows):
    """Read a two-column CSV file and return it as an (n_rows, 1) sparse column.

    The first CSV column is used as the row index and the second CSV column
    as the integer value stored at that row; every entry goes in column 0.
    """
    rows, vals = numpy.loadtxt(DATA_DIR + fname, delimiter=',',
                               usecols=(0, 1), dtype=int, unpack=True)
    # Integer zeros: these are column *indices*, not values.
    cols = numpy.zeros(len(rows), dtype=int)
    return coo_matrix((vals, (rows, cols)), shape=(n_rows, 1), dtype=int)


mat = scipy.io.loadmat('netf_proc_data.mat', variable_names='nMc')
nMc = mat['nMc']
frosq = numpy.sum(nMc)  # sum of all entries of nMc (normaliser for P below)
un = 2649429 + 1        # number of rows of every sparse column vector
# The assignment of m was commented out, so the script failed with a
# NameError on first use; define it explicitly.
m = 17770
# Kept as a float on purpose: it forces true division when P is computed,
# even under Python 2 with an integer-typed nMc.
samples = numpy.ceil(8 * log(m))

# numpy.zeros needs an integer length, and each result needs its OWN array.
# A chained assignment (a = b = c = zeros(...)) would bind all three names
# to one shared array, so every write would overwrite the other two results.
n_slots = int(2 * samples)
Indexi = numpy.zeros(n_slots)
Indexj = numpy.zeros(n_slots)
SampMatrix = numpy.zeros(n_slots)
fnamelst = numpy.loadtxt('training_set/print.txt', usecols=(0,), dtype='str')

sample_count = 0  # number of sampled pairs stored so far
for i in range(0, 1):
    x = _load_sparse_column(fnamelst[i], un)

    # sampling: P[riter] is the probability of pairing file riter with file i
    P = samples * (nMc[i] * numpy.ones((m, 1)) + nMc) / (2 * frosq)
    for riter in range(0, m):
        indices = numpy.random.binomial(1, P[riter])
        if indices == 1:
            y = _load_sparse_column(fnamelst[riter], un)
            Indexi[sample_count] = i
            Indexj[sample_count] = riter
            # (1 x un) . (un x 1) -> 1 x 1 sparse result; take its only entry
            SampMatrix[sample_count] = x.T.dot(y)[0, 0]
            sample_count += 1
    # end sampling
    print(i)
# end

计时器单位:3.66606e-07 s

总时间:91.5677秒

行号      命中次数         时间   每次命中   %时间  行内容

15                                           def read_netf(m):
16         1           10     10.0      0.0      t0 = time.time()
17         1         3733   3733.0      0.0      mat = scipy.io.loadmat('netf_proc_data.mat',variable_names='nMc')
18         1            7      7.0      0.0      nMc=mat['nMc']
19         1          313    313.0      0.0      frosq=numpy.sum(nMc)
20         1            5      5.0      0.0      un=2649429+1
21                                               #m=17770
22         1            4      4.0      0.0      iter=0
23         1           42     42.0      0.0      samples= numpy.ceil(8*log(m))
24         1          116    116.0      0.0      Indexi=Indexj=SampMatrix=numpy.zeros(2*samples)
25         1       935968 935968.0      0.4      fnamelst=numpy.loadtxt('training_set/print.txt',usecols=(0,), dtype='str')
26                                               
27         2           20     10.0      0.0      for i in range(0,1):
28         1        46857  46857.0      0.0          [xout1,xout2]=numpy.loadtxt('training_set/mod_data/'+fnamelst[i],delimiter=',',usecols=(0,1), dtype=int,unpack=True)
29         1          652    652.0      0.0          x=coo_matrix((xout2,(xout1,numpy.zeros(len(xout1)))),shape=(un,1),dtype=int)
30                                                  
31                                                  # sampling
32         1          766    766.0      0.0          P=samples*(nMc[i]*numpy.ones((m,1))+nMc)/(2*frosq)
33     17771        59955      3.4      0.0          for riter in range(0,m):
34     17770       191466     10.8      0.1              indices=numpy.random.binomial(1,P[riter])
35     17770        68157      3.8      0.0              if indices==1:
36        48    244680827 5097517.2     98.0                  [yout1,yout2]=numpy.loadtxt('training_set/mod_data/'+fnamelst[riter],delimiter=',',usecols=(0,1), dtype=int,unpack=True)
37        48       131806   2746.0      0.1                  y=coo_matrix((yout2,(yout1,numpy.zeros(len(yout1)))),shape=(un,1),dtype=int)            
38        48          760     15.8      0.0                  Indexi[iter]=i
39        48          230      4.8      0.0                  Indexj[iter]=riter
40        48      3649053  76021.9      1.5                  SampMatrix[iter]=x.T.dot(y)[0,0]
41        48          470      9.8      0.0                  iter=iter+1
42                                                  # end
43         1          241    241.0      0.0          print i
44                                               #end
45         1            6      6.0      0.0      t1 = time.time()
46         1            3      3.0      0.0      total = t1-t0
47         1            3      3.0      0.0      return(total)

0 个答案:

没有答案