Question

我想在这里修改最后的例子：

http://numba.pydata.org/numba-doc/0.15.1/examples.html

实现多线程groupby-max函数。

到目前为止，我所做的工作如下。在一个IPython会话中，它开始打印'主线程'，但之后，当我尝试输入新命令时，所有控制台都打印出来'控制台已经退出，值等于：-1073741819，等待答案。'

如何使线程正常工作？

我有numba。版本 0.15.1。

import numpy as np
import pandas as pd
from numba import void, double, jit,int64

import threading
from ctypes import pythonapi, c_void_p

N = 1e1
m = 3
p = 3

x = np.random.randint( 0, m, N )
y = np.random.randint( 0, p, N )
z = np.random.randn(N)

key = [ 'x', 'y' ]

df = pd.DataFrame( {'x':x, 'y':y, 'z':z} )
df = df.sort( key )

grps = df.groupby( key )
print( df.head() )    

fld      = 'z'
comp_ids, _, ngroups = grps.grouper.group_info
data       = grps.obj[ fld ].values
length     = len( data )
numthreads = 2

def get_group_splits( grps, num_splits ):
    comp_ids, _, ngroups = grps.grouper.group_info
    length               = len( comp_ids )

    arrs              = np.array_split( comp_ids, num_splits )
    borders_comp_ids  = [a[0] for a in arrs[1:]]
    borders_indices   = np.searchsorted( comp_ids, borders_comp_ids )
    borders_indices   = np.append( np.insert( borders_indices, 0, 0), length )

    return borders_indices

def make_inner_func( fn, *args ):
    signature = void( *args )
    @jit(signature)
    def inner_func( result, comp_ids, data ):
        threadstate = savethread()
        fn( result, comp_ids, data )
        restorethread(threadstate)
    return inner_func

@jit(void(double[:], int64[:], double[:]))
def pgb_max( result, comp_ids, data ):

    i0 = comp_ids[0]
    i1 = comp_ids[1]
    result[i0:i1] = np.finfo( np.float ).min

    for i in range( len( comp_ids ) ):        
        cid = comp_ids[i]
        ai  = data[i]
        if ai>result[cid]:
            result[ cid ] = ai
    return result 

savethread = pythonapi.PyEval_SaveThread
savethread.argtypes = []
savethread.restype = c_void_p

restorethread = pythonapi.PyEval_RestoreThread
restorethread.argtypes = [c_void_p]
restorethread.restype = None

group_splits = get_group_splits( grps, numthreads )

n_length_args = (comp_ids,) + ( data, )

chunks = [[arg[ group_splits[i]:group_splits[i+1]] for arg in n_length_args]
          for i in range(numthreads)]

inner_func = make_inner_func( pgb_max, double[:], int64[:], double[:] )

result2    = np.ndarray( ngroups, dtype=df[fld].dtype)
result2[:] = np.finfo( result2.dtype ).min

threads = [ threading.Thread( target=inner_func, args=[result2] + chunks[i] ) for i in range(numthreads-1) ]

print( 'starting threads')
for thread in threads:
    thread.start()
    pass
print( 'on main thread')
inner_func( result2, *chunks[-1] )

print( 'joining')
for thread in threads:
    pass
    #thread.join()##

print( 'joined')

print( result2 )

为什么这个多线程脚本会冻结？

0 个答案: