如何提高Python和numpy中提取数据代码的性能

时间:2019-02-05 18:00:01

标签: python pandas numpy

我想在DIC(数字图像相关)结果中补上缺失的点,这些点是由于材料破坏而缺失的(见照片)。

(图1)

我想在该缺失区域内放置点(不带数值)。(源数据中的原始点)

(图2)

这段代码是我论文的一部分,我想找出拉伸试验中因材料破坏而缺失的那些点。数据来自DIC(数字图像相关)方法,该方法用于测量试样表面的应变。当试样发生局部破坏时,DIC软件无法在该区域找到像素簇,这些点最终就会丢失。我有30多个试样,每个试样50帧,处理全部数据的总计算时间约为一周。下面给出的代码为版本1,不做任何改动时在我的电脑上运行约4分钟。版本2是缩短后的代码:把从 begin(MARK1) 到 end(MARK1) 之间的语句注释掉,并取消标有“(version 2)”的注释行的注释,但它仍需要约3分45秒。

输入数据: https://github.com/MarekSawicki/data/blob/master/022_0_29-03-2018_e11_45.csv

import numpy as np
import os
# Script (version 1): load DIC points from a CSV file, shift and scale them,
# then collect in array7 the half-unit grid nodes that have no measured point
# close to them (the candidate "missing" points).

# (row, column) offsets marked in the 3x3 deletion mask for every measured
# point: the rounded node itself, then its eight neighbours.
# The order is significant and is kept exactly as originally written: a
# negative index wraps around to the opposite edge of the mask, while an
# index of 3 raises IndexError, which skips the remaining offsets for that
# point.
NEIGHBOUR_OFFSETS = (
    (0, 0),
    (1, -1),
    (1, 1),
    (1, 0),
    (-1, 1),
    (-1, -1),
    (-1, 0),
    (0, 1),
    (0, -1),
)

# changing of folder
os.chdir("D:/Marek/doktorat/Badania_obrobione/test")
# load data from file
data = np.genfromtxt('022_0_29-03-2018_e11_45.csv', delimiter=',', dtype='float64')
# separation of coordinates (points) and values (both float64)
# row 0 is skipped (presumably the CSV header - confirm against the file);
# columns 1-2 are used as X/Y coordinates and column 4 as the value
points = data[1:, 1:3]
values = data[1:, 4]
# shift coordinates to zero (coordinates might be negative or offset from 0)
points[:, 0] -= points[:, 0].min()
points[:, 1] -= points[:, 1].min()
# scale factor
k_scale = 2
points[:, 0:2] *= k_scale
# reshape the values into a column vector
values = np.reshape(values, [len(data) - 1, 1])
# sort the points to keep order in X direction
# points X are assumed as points[:,0]
# points Y are assumed as points[:,1]
# Viewing every row as one structured record lets sort() order whole rows
# (by X, then by Y) in place inside array1.
array1 = np.ascontiguousarray(points)
a_view = array1.view(dtype=[('', array1.dtype)] * array1.shape[-1])
a_view.sort(axis=0)
points_sorted = array1
# Start of processing points
# a and b are respectively the X and Y limits of the cell grid
a = int(np.ceil(points[:, 0].max())) + 1
b = int(np.ceil(points[:, 1].max())) + 1
# template cluster: the 3x3 nodes of one unit cell with step 0.5
array2 = np.array([[m * .5, n * .5] for m in range(3) for n in range(3)])
# kept nodes of every cell; joined into array7 once, after the loops, instead
# of re-allocating the output array on every iteration
kept_clusters = []
# main loop of concerned code:
for i in range(a):  # X wise loop, a is the X limit
    for i2 in range(b):  # Y wise loop, b is the Y limit
        # cluster of nodes covering (i:i+1, i2:i2+1, step=0.5)
        array3 = array2 + [i, i2]
        # Search for the measured points close to this cell (the author
        # identified this scan as the biggest time consumer).
        # The points are sorted by X, so the scan can stop at the first point
        # whose X lies beyond the search window.
        selected = []
        for i3 in range(points_sorted.shape[0]):
            valx = points_sorted[i3, 0]
            valy = points_sorted[i3, 1]
            if valx > i + 1.5:
                break
            # keep the point when X is in i-0.5 .. i+1.5 and Y is in
            # i2-0.5 .. i2+1.5
            if np.abs(valx - (i + .5)) <= 1:
                if np.abs(valy - (i2 + .5)) <= 1:
                    selected.append([valx, valy])
                    # (version 2) break
        # array4 holds the points that met the criteria, one (X, Y) per row
        array4 = np.array(selected).reshape(-1, 2)
        # Post-processing of the selected points: decide which nodes of the
        # cluster are close enough to a measured point to be deleted.
        if array4.shape[0] != 0:
            # (version 2) pass
            # begin(MARK1)
            # round the coordinates to the nearest .5 value
            array5 = np.round(array4 * 2) / 2
            # values outside the cell are moved to the closest cell boundary
            array5[:, 0] = np.clip(array5[:, 0], i, i + 1)
            array5[:, 1] = np.clip(array5[:, 1], i2, i2 + 1)
            # subtract the cell origin and double the result to obtain node
            # indices 0..2 inside the 3x3 cluster
            array5[:, 0] -= i
            array5[:, 1] -= i2
            array5 *= 2
            # deletion mask: True - delete this node, False - keep it
            array_bool1 = np.zeros((3, 3), dtype=bool)
            for i5 in range(array5.shape[0]):
                row = int(array5[i5, 0])
                col = int(array5[i5, 1])
                # Marking only the rounded node was too coarse according to
                # the author, so its neighbours are marked as well.
                try:
                    for d_row, d_col in NEIGHBOUR_OFFSETS:
                        array_bool1[row + d_row, col + d_col] = True
                except IndexError:
                    # index 3 is outside the mask; the remaining offsets of
                    # this point are skipped
                    pass
            # flatten the mask row by row so that it lines up with array3
            bool_array_del = array_bool1.reshape(-1, 1)
            # keep only the nodes that are not marked for deletion
            array6 = array3[~bool_array_del[:, 0]]
            kept_clusters.append(array6)
            # end(MARK1)
        else:
            # no measured point nearby: the full cluster goes to the output
            kept_clusters.append(array3)
# array7 is the output container
array7 = np.concatenate(kept_clusters, axis=0)

此代码为我提供了版本1(图3)的令人满意的结果和版本2(图4)的可接受的结果

(图3)

(图4)

我是Python和numpy的新手。你能告诉我可以做些什么来加快代码的运行速度吗?我考虑过改用pandas。

1 个答案:

答案 0 :(得分:1)

实际上,Jordan Singer提出了一个非常方便的解决方案——通过numba使用JIT编译。运行时间从4分钟减少到4-5秒。不过,这样我就不能使用np.append了,于是我改用定长的零数组,然后再从数组中删除值为零的行。

以下是修改后的代码:

import numpy as np
import matplotlib.pyplot as plt
import os
from numba import jit
import time
start = time.time()

# creating array4 from previous code changed to function with JIT decorator
@jit(nopython=True)
def Numba_Function_array4(if1,if2,source):
    """Collect the points of ``source`` that lie close to cell (if1, if2).

    A point (x, y) is selected when ``abs(x - (if1 + 0.5)) <= 1`` and
    ``abs(y - (if2 + 0.5)) <= 1``.

    Parameters
    ----------
    if1, if2 : number
        X and Y position of the cell whose neighbourhood is searched.
    source : 2-D array
        Point coordinates, X in column 0 and Y in column 1. The rows must be
        sorted by ascending X, because the scan stops at the first row whose
        X is greater than ``if1 + 1.5``.

    Returns
    -------
    ndarray of shape (10, 2)
        The selected points, in scan order, in the leading rows; unused rows
        stay zero. At most 10 points are stored - further matches are
        ignored rather than written past the end of the array.
    """
    index = 0  # next free row of the output array
    rarray = np.zeros((10, 2))  # fixed length output array
    # The points are sorted by X, which allows browsing only a part of the
    # array instead of the whole array (early break below).
    for i3 in range(source.shape[0]):
        valx = source[i3, 0]
        valy = source[i3, 1]
        if valx > if1 + 1.5:
            break
        # keep the point when X is in if1-0.5 .. if1+1.5 and Y is in
        # if2-0.5 .. if2+1.5
        if np.abs(valx - (if1 + .5)) <= 1:
            if np.abs(valy - (if2 + .5)) <= 1:
                if index == rarray.shape[0]:
                    # output array is full; stop instead of writing out of
                    # bounds
                    break
                rarray[index, 0] = valx
                rarray[index, 1] = valy
                index += 1
    return rarray

# Script (numba version): load DIC points from a CSV file, shift and scale
# them, then collect in array7 the half-unit grid nodes that have no measured
# point close to them. The point search itself is done by
# Numba_Function_array4.

# (row, column) offsets marked in the 3x3 deletion mask for every measured
# point: the rounded node itself, then its eight neighbours.
# The order is significant and is kept exactly as originally written: a
# negative index wraps around to the opposite edge of the mask, while an
# index of 3 raises IndexError, which skips the remaining offsets for that
# point.
NEIGHBOUR_OFFSETS = (
    (0, 0),
    (1, -1),
    (1, 1),
    (1, 0),
    (-1, 1),
    (-1, -1),
    (-1, 0),
    (0, 1),
    (0, -1),
)

# changing of folder
os.chdir("D:/Marek/doktorat/Badania_obrobione/test")
cwd = os.getcwd()
# load data from file
data = np.genfromtxt('022_0_29-03-2018_e11_45.csv', delimiter=',', dtype='float64')
# separation of coordinates (points) and values (both float64)
# row 0 is skipped (presumably the CSV header - confirm against the file);
# columns 1-2 are used as X/Y coordinates and column 4 as the value
points = data[1:, 1:3]
values = data[1:, 4]
# shift coordinates to zero (coordinates might be negative or offset from 0)
points[:, 0] -= points[:, 0].min()
points[:, 1] -= points[:, 1].min()
# scale factor
ks = 2
points[:, 0:2] *= ks
# reshape the values into a column vector
values = np.reshape(values, [len(data) - 1, 1])
# sort the points to keep order in X direction
# points X are assumed as points[:,0]
# points Y are assumed as points[:,1]
# Viewing every row as one structured record lets sort() order whole rows
# (by X, then by Y) in place inside array1.
array1 = np.ascontiguousarray(points)
a_view = array1.view(dtype=[('', array1.dtype)] * array1.shape[-1])
a_view.sort(axis=0)
Input_points_sorted = array1
# Start of processing points
# a and b are respectively the X and Y limits of the cell grid
a = int(np.ceil(points[:, 0].max())) + 1
b = int(np.ceil(points[:, 1].max())) + 1
# template cluster: the 3x3 nodes of one unit cell with step 0.5
array2 = np.array([[m * .5, n * .5] for m in range(3) for n in range(3)])
# kept nodes of every cell; joined into array7 once, after the loops, instead
# of re-allocating the output array on every iteration
kept_clusters = []
# main loop of concerned code:
for i in range(a):  # X wise loop, a is the X limit
    for i2 in range(b):  # Y wise loop, b is the Y limit
        # cluster of nodes covering (i:i+1, i2:i2+1, step=0.5)
        array3 = array2 + [i, i2]
        # function which contains the most time consuming part of the code
        array4 = Numba_Function_array4(i, i2, Input_points_sorted)
        # The function returns a fixed-length array whose unused rows are
        # zero. Only the trailing all-zero rows are stripped here: deleting
        # every row with X == 0 would also discard real points, and points
        # with X == 0 do exist after the shift to zero above.
        # Limitation: a trailing real point at exactly (0, 0) cannot be told
        # apart from padding.
        n_found = array4.shape[0]
        while n_found > 0 and not array4[n_found - 1].any():
            n_found -= 1
        array4 = array4[:n_found]

        # Post-processing of the selected points: decide which nodes of the
        # cluster are close enough to a measured point to be deleted.
        if array4.shape[0] != 0:
            # round the coordinates to the nearest .5 value
            array5 = np.round(array4 * 2) / 2
            # values outside the cell are moved to the closest cell boundary
            array5[:, 0] = np.clip(array5[:, 0], i, i + 1)
            array5[:, 1] = np.clip(array5[:, 1], i2, i2 + 1)
            # subtract the cell origin and double the result to obtain node
            # indices 0..2 inside the 3x3 cluster
            array5[:, 0] -= i
            array5[:, 1] -= i2
            array5 *= 2
            # deletion mask: True - delete this node, False - keep it
            array_bool1 = np.zeros((3, 3), dtype=bool)
            for i5 in range(array5.shape[0]):
                row = int(array5[i5, 0])
                col = int(array5[i5, 1])
                try:
                    for d_row, d_col in NEIGHBOUR_OFFSETS:
                        array_bool1[row + d_row, col + d_col] = True
                except IndexError:
                    # index 3 is outside the mask; the remaining offsets of
                    # this point are skipped
                    pass
            # flatten the mask row by row so that it lines up with array3
            bool_array_del = array_bool1.reshape(-1, 1)
            # keep only the nodes that are not marked for deletion
            array6 = array3[~bool_array_del[:, 0]]
            kept_clusters.append(array6)
        else:
            # no measured point nearby: the full cluster goes to the output
            kept_clusters.append(array3)
# array7 is the output container
array7 = np.concatenate(kept_clusters, axis=0)
# report the run time once, after all cells have been processed
end = time.time()
print("Elapsed (after compilation) = %s" % (end - start))
print("Done!")

再次感谢Jordan,问题已解决!