我有一个 pandas 数据框(DataFrame)。
利用该数据框的 start、end 和 signal 列,我把每一行的 signal 值累加到 numpy 数组中从 start 到 end(不含 end)的区间上。
下面是可以正常运行的代码,分别使用了 apply 和列表推导式(list comprehension)。
有没有办法将这一过程向量化(vectorize),或者用其他方式借助 numpy 加速?
import numpy as np
import pandas as pd
import time
import random
def f(start, end, signal, chrBasedSignalArray):
    """Add ``signal`` to ``chrBasedSignalArray[start:end]`` in place.

    Args:
        start: Index of the first element to update (inclusive).
        end: Index one past the last element to update (exclusive).
        signal: Value added to every element of the selected slice.
        chrBasedSignalArray: Array updated in place; the callers in this
            file pass a numpy array.

    Returns:
        None. The result is visible only through the mutated array.
    """
    chrBasedSignalArray[start:end] += signal
def updateChrBasedSignalArray(data_row, chrBasedSignalArray):
    """Add one row's signal to its ``[start, end)`` slice of the array.

    Args:
        data_row: Mapping-like row that provides the keys ``'start'``,
            ``'end'`` and ``'signal'`` (in this file, a row handed over by
            ``DataFrame.apply(..., axis=1)``).
        chrBasedSignalArray: Array updated in place; the caller in this
            file passes a numpy array.

    Returns:
        None. The result is visible only through the mutated array.
    """
    chrBasedSignalArray[data_row['start']:data_row['end']] += data_row['signal']
# ---------------------------------------------------------------------------
# Benchmark input: one row per interval, each with a start, an end and a signal.
# ---------------------------------------------------------------------------
numberofRows = 1000000
# random.sample draws without replacement, so every start position is distinct.
startList = random.sample(range(1, 240000000), numberofRows)
# Each interval ends 100 positions after its start.
endList = [begin + 100 for begin in startList]
# One integer signal from 0 through 9 per row.
signalList = [random.randrange(0, 10) for _ in range(numberofRows)]
df = pd.DataFrame(
    {
        'chrom': ['chr1'] * numberofRows,
        'start': startList,
        'end': endList,
        'signal': signalList,
    }
)

# ---------------------------------------------------------------------------
# Variant 1: call f() once per row from a list comprehension over the columns.
# ---------------------------------------------------------------------------
print('##################################')
chrBasedSignalArray = np.zeros(240000000, dtype=np.float32)
print('Before np.sum(chrBasedSignalArray: %f' % np.sum(chrBasedSignalArray))
start_time = time.time()
# The comprehension is kept on purpose: it is the approach being timed.
[
    f(begin, stop, value, chrBasedSignalArray)
    for begin, stop, value in zip(df['start'], df['end'], df['signal'])
]
print("--- %s seconds using list comprehension---" % (time.time() - start_time))
print('After np.sum(chrBasedSignalArray): %f' % np.sum(chrBasedSignalArray))
print('##################################')

# ---------------------------------------------------------------------------
# Variant 2: row-wise DataFrame.apply on a freshly zeroed array.
# ---------------------------------------------------------------------------
print('##################################')
chrBasedSignalArray = np.zeros(240000000, dtype=np.float32)
print('Before np.sum(chrBasedSignalArray: %f' % np.sum(chrBasedSignalArray))
start_time = time.time()
df.apply(
    updateChrBasedSignalArray,
    axis=1,
    chrBasedSignalArray=chrBasedSignalArray,
)
print("--- %s seconds using apply---" % (time.time() - start_time))
print('After np.sum(chrBasedSignalArray): %f' % np.sum(chrBasedSignalArray))
print('##################################')