我想修改下面这个页面中的最后一个例子:
http://numba.pydata.org/numba-doc/0.15.1/examples.html
以实现一个多线程的 groupby-max 函数。
到目前为止,我写的代码如下。在 IPython 会话中运行时,它会打印出 'on main thread',但之后当我尝试输入新命令时,所有控制台都会显示:'控制台已经退出,值等于:-1073741819,等待答案。'
如何使线程正常工作?
我使用的 numba 版本是 0.15.1。
import numpy as np
import pandas as pd
from numba import void, double, jit,int64
import threading
from ctypes import pythonapi, c_void_p
# ---------------------------------------------------------------------------
# Demo data: N rows with two integer key columns ('x', 'y') and one float
# value column ('z'), grouped by the two keys.
# ---------------------------------------------------------------------------
# N is used as an array size below, so it must be an int; a float such as
# 1e1 is rejected as a size by recent NumPy versions.
N = 10
m = 3  # exclusive upper bound of the random values drawn for 'x'
p = 3  # exclusive upper bound of the random values drawn for 'y'
x = np.random.randint(0, m, N)
y = np.random.randint(0, p, N)
z = np.random.randn(N)
key = ['x', 'y']
df = pd.DataFrame({'x': x, 'y': y, 'z': z})
# Sort by the group keys before grouping.
# NOTE(review): DataFrame.sort only exists in old pandas releases; on newer
# pandas this call has to become df.sort_values(key) -- confirm target version.
df = df.sort(key)
grps = df.groupby(key)
print(df.head())
fld = 'z'  # name of the column that is aggregated
# NOTE(review): presumably group_info is (per-row group id, <unused>, number
# of groups); it is a pandas-internal attribute -- verify against the pandas
# version in use.
comp_ids, _, ngroups = grps.grouper.group_info
data = grps.obj[fld].values
length = len(data)
numthreads = 2
def get_group_splits(grps, num_splits):
    """Return row borders that cut the grouped rows into ``num_splits`` chunks.

    The per-row group ids are first cut into ``num_splits`` nearly equal
    pieces; each cut is then moved back to the first row carrying the group
    id found at the cut, so a chunk never starts in the middle of a group.

    Parameters
    ----------
    grps : object exposing ``grps.grouper.group_info``
        Only the first element of ``group_info`` (the per-row group ids) is
        read. The ids are assumed to be sorted ascending, because
        ``np.searchsorted`` is used on them -- TODO confirm at the call site.
    num_splits : int
        Number of chunks wanted.

    Returns
    -------
    numpy.ndarray
        ``num_splits + 1`` integer row indices starting with 0 and ending
        with the number of rows; chunk ``i`` is ``[out[i], out[i + 1])``.
        Chunks may be empty.
    """
    comp_ids = grps.grouper.group_info[0]
    length = len(comp_ids)
    pieces = np.array_split(comp_ids, num_splits)
    # array_split produces empty trailing pieces when there are fewer rows
    # than splits; those have no first element to look up, so they are
    # skipped here and given the border ``length`` below.
    first_ids = [piece[0] for piece in pieces[1:] if len(piece)]
    inner = np.searchsorted(comp_ids, first_ids)
    n_empty = (num_splits - 1) - len(first_ids)
    borders = np.concatenate(([0], inner, [length] * (n_empty + 1)))
    return borders.astype(np.intp)
def make_inner_func(fn, *args):
    """Wrap ``fn`` in a compiled function that runs it with the GIL released.

    Parameters
    ----------
    fn : callable
        Kernel called as ``fn(result, comp_ids, data)``. It must itself be
        compilable in nopython mode, otherwise compiling the wrapper fails.
    *args : numba types
        Argument types of the wrapper; the wrapper's signature is
        ``void(*args)``.

    Returns
    -------
    The compiled wrapper, callable as ``inner_func(result, comp_ids, data)``.
    """
    signature = void(*args)

    # nopython=True is essential: between savethread() and restorethread()
    # the GIL is not held, so no CPython C-API call may happen. An
    # object-mode fallback would make exactly such calls and crash the
    # interpreter; with nopython=True a kernel that cannot be compiled
    # natively is reported as a compile error instead.
    @jit(signature, nopython=True)
    def inner_func(result, comp_ids, data):
        threadstate = savethread()
        fn(result, comp_ids, data)
        restorethread(threadstate)

    return inner_func
@jit(void(double[:], int64[:], double[:]), nopython=True)
def pgb_max(result, comp_ids, data):
    """Write the per-group maximum of ``data`` into ``result``.

    Parameters
    ----------
    result : float64 array
        Output, indexed by group id. Only the slots of the groups occurring
        in ``comp_ids`` are touched.
    comp_ids : int64 array
        Group id of each row. Assumed sorted ascending and non-negative, so
        the ids present are ``comp_ids[0] .. comp_ids[-1]`` -- TODO confirm
        against the caller.
    data : float64 array
        Value of each row, same length as ``comp_ids``.

    Nothing is returned (the signature is ``void``); the function uses only
    scalar indexing and loops so it can be compiled in nopython mode and
    called while the GIL is released.
    """
    n = len(comp_ids)
    if n == 0:
        # Empty chunk: nothing to reduce, and comp_ids[0] would be invalid.
        return
    # Same value as np.finfo(np.float64).min, spelled as a literal because
    # the kernel must not call back into NumPy's Python layer.
    lowest = -1.7976931348623157e308
    # Reset the slots of every group in this chunk, first to last inclusive.
    for g in range(comp_ids[0], comp_ids[n - 1] + 1):
        result[g] = lowest
    for i in range(n):
        cid = comp_ids[i]
        ai = data[i]
        if ai > result[cid]:
            result[cid] = ai
# ctypes handles to the two CPython C-API entry points used to drop and
# re-acquire the interpreter lock around the compiled kernel.
savethread = pythonapi.PyEval_SaveThread
restorethread = pythonapi.PyEval_RestoreThread

# PyEval_SaveThread takes no arguments and returns a pointer, declared here
# as an opaque c_void_p.
savethread.argtypes, savethread.restype = [], c_void_p

# PyEval_RestoreThread takes that pointer back and returns nothing.
restorethread.argtypes, restorethread.restype = [c_void_p], None
# ---------------------------------------------------------------------------
# Driver: cut the rows into one chunk per thread, run the kernel on every
# chunk (the last one on the main thread) and print the per-group maxima.
# ---------------------------------------------------------------------------
group_splits = get_group_splits(grps, numthreads)
# Arrays that have one entry per row and therefore must be sliced per chunk.
n_length_args = (comp_ids, data)
chunks = [
    [arg[group_splits[i]:group_splits[i + 1]] for arg in n_length_args]
    for i in range(numthreads)
]
inner_func = make_inner_func(pgb_max, double[:], int64[:], double[:])
# One output slot per group, shared by all threads and pre-filled with the
# lowest finite value of its dtype so any data value replaces it.
result2 = np.ndarray(ngroups, dtype=df[fld].dtype)
result2[:] = np.finfo(result2.dtype).min
# All chunks but the last go to worker threads.
threads = [
    threading.Thread(target=inner_func, args=[result2] + chunk)
    for chunk in chunks[:-1]
]
print('starting threads')
for thread in threads:
    thread.start()
print('on main thread')
inner_func(result2, *chunks[-1])
print('joining')
# Wait for every worker before reading result2; without the join the result
# could be printed while workers are still writing to it.
for thread in threads:
    thread.join()
print('joined')
print(result2)