import pandas as pd
import dask.dataframe as dd
import time
import warnings
# Silence all warnings for this benchmark run.
# NOTE(review): this blanket filter also hides any warnings raised by
# pandas/dask during the benchmark below -- confirm that is intended.
warnings.simplefilter('ignore')

# Build the benchmark frame explicitly; assigning columns into an
# undefined `data` name would raise NameError.
data = pd.DataFrame({'x': range(1000), 'y': range(1000)})
def add(s):
    """Store ``s['x'] + s['y']`` under the key ``'sum'`` and return ``s``.

    ``s`` is any mapping-like object supporting item access with the keys
    ``'x'`` and ``'y'`` (the script passes one DataFrame row at a time via
    ``apply(..., axis=1)``). The object is modified in place and the same
    object is returned.
    """
    s['sum'] = s['x'] + s['y']
    return s
# Benchmark 1: row-wise pandas apply. `add` is called once per row.
# perf_counter() is used instead of time() because it is monotonic and
# intended for measuring short intervals.
start = time.perf_counter()
n_data = data.apply(add, axis=1)
print(f'it cost time is {time.perf_counter() - start} sec')

# Benchmark 2: the same operation expressed with dask. Dask collections are
# lazy, so this interval covers only partitioning and building the task
# graph -- it is NOT comparable with the pandas number above.
start = time.perf_counter()
d_data = dd.from_pandas(data, npartitions=10)
s_data = d_data.apply(add, axis=1)
print(f'it cost time is {time.perf_counter() - start} sec')

# Benchmark 3: compute() actually executes the graph; this is the figure
# to compare against the pandas apply timing.
start = time.perf_counter()
s_data = s_data.compute()
print(f'but transform it cost time is {time.perf_counter() - start} sec')
import warnings
# Silence all warnings for this benchmark run.
# NOTE(review): this blanket filter also hides any warnings raised by
# pandas/dask during the benchmark below -- confirm that is intended.
warnings.simplefilter('ignore')

# Build the benchmark frame explicitly; assigning columns into an
# undefined `data` name would raise NameError.
data = pd.DataFrame({'x': range(1000), 'y': range(1000)})
def add(s):
    """Store ``s['x'] + s['y']`` under the key ``'sum'`` and return ``s``.

    ``s`` is any mapping-like object supporting item access with the keys
    ``'x'`` and ``'y'`` (the script passes one DataFrame row at a time via
    ``apply(..., axis=1)``). The object is modified in place and the same
    object is returned.
    """
    s['sum'] = s['x'] + s['y']
    return s
# Benchmark 1: row-wise pandas apply. `add` is called once per row.
# perf_counter() is used instead of time() because it is monotonic and
# intended for measuring short intervals.
start = time.perf_counter()
n_data = data.apply(add, axis=1)
print(f'it cost time is {time.perf_counter() - start} sec')

# Benchmark 2: the same operation expressed with dask. Dask collections are
# lazy, so this interval covers only partitioning and building the task
# graph -- it is NOT comparable with the pandas number above.
start = time.perf_counter()
d_data = dd.from_pandas(data, npartitions=10)
s_data = d_data.apply(add, axis=1)
print(f'it cost time is {time.perf_counter() - start} sec')

# Benchmark 3: compute() actually executes the graph; this is the figure
# to compare against the pandas apply timing.
start = time.perf_counter()
s_data = s_data.compute()
print(f'but transform it cost time is {time.perf_counter() - start} sec')
结果是:
it cost time is 1.0297248363494873 sec
it cost time is 0.008629083633422852 sec
but transform it cost time is 1.3664238452911377 sec
答案 0 :(得分:0)
Pandas 的 apply 很慢。因为您是用 Python 函数逐行操作,所以循环必须在 Python 中执行,而不是在 C 中执行。
Dask DataFrame 的默认调度程序使用线程。线程通常非常适合快速的矢量化 Pandas 操作,但对于受 Python 代码限制的慢速 Pandas 操作却没有帮助。您可以考虑尝试多进程(multiprocessing)调度程序或分布式(distributed)调度程序。参见 http://docs.dask.org/en/latest/scheduling.html
但是,我建议您在尝试 Dask 之前先把 Pandas 用好。与使用 Dask 相比,改用快速的(矢量化)Pandas API 很可能更能加快计算速度,例如用 data['sum'] = data['x'] + data['y'] 代替逐行的 apply。