我想从一个较大的上三角矩阵中计算元素的总和。常规的Julia代码如下。
// Registers one listener per dynamic virtual channel (DVC_chanA and DVC_chanB)
// with the supplied channel manager and reports the overall outcome to the
// user in a message box.
//
// Both channels are always attempted, even if the first one fails.
//
// Parameters:
//   pChannelMgr - channel manager used to create the listeners.
//
// Returns:
//   S_OK when both listeners were created; otherwise the HRESULT of the last
//   step that did not return S_OK (chanB takes precedence over chanA).
HRESULT DVCPlugin::Initialize(__in IWTSVirtualChannelManager *pChannelMgr)
{
    HRESULT hr = S_OK;

    // chanA
    CComObject<ListenerCallback_chanA> *pListenerCallback_chanA = NULL;
    CComPtr<ListenerCallback_chanA> ptrListenerCallback_chanA;
    CComPtr<IWTSListener> ptrListener_chanA;
    // Create the listener callback object for chanA.
    HRESULT hr_chanA = CComObject<ListenerCallback_chanA>::CreateInstance(&pListenerCallback_chanA);
    if (SUCCEEDED(hr_chanA)) {
        ptrListenerCallback_chanA = pListenerCallback_chanA;
        // Attach the callback to the endpoint.
        hr_chanA = pChannelMgr->CreateListener(
            DVC_chanA,
            0,
            static_cast<ListenerCallback_chanA*>(ptrListenerCallback_chanA),
            &ptrListener_chanA);
    }
    // A failed CreateInstance skips CreateListener and is reported here as well.
    if (hr_chanA != S_OK) {
        hr = hr_chanA;
    }

    // chanB
    CComObject<ListenerCallback_chanB> *pListenerCallback_chanB = NULL;
    CComPtr<ListenerCallback_chanB> ptrListenerCallback_chanB;
    CComPtr<IWTSListener> ptrListener_chanB;
    // Create the listener callback object for chanB.
    HRESULT hr_chanB = CComObject<ListenerCallback_chanB>::CreateInstance(&pListenerCallback_chanB);
    if (SUCCEEDED(hr_chanB)) {
        ptrListenerCallback_chanB = pListenerCallback_chanB;
        // Attach the callback to the endpoint.
        hr_chanB = pChannelMgr->CreateListener(
            DVC_chanB,
            0,
            static_cast<ListenerCallback_chanB*>(ptrListenerCallback_chanB),
            &ptrListener_chanB);
    }
    if (hr_chanB != S_OK) {
        hr = hr_chanB;
    }

    // All listeners created check
    if (hr == S_OK) {
        MessageBox(NULL, L"DVC plugin is enabled!", L"Info...", MB_OK | MB_ICONWARNING);
    }
    else {
        MessageBox(NULL, L"DVC plugin is NOT ENABLED!", L"Info...", MB_OK | MB_ICONWARNING);
    }
    return hr;
}
由于矩阵非常大,我想知道是否还能进一步提高速度。在这里该如何使用并行计算?
答案 0 :(得分:4)
在这种情况下,我会使用多线程,而不是多进程的并行处理。这是示例代码:
using Base.Threads
"""
    upsum_threads(M)

Return the sum of the entries of matrix `M` that lie strictly above the main
diagonal, i.e. all `M[j, i]` with `j < i`.

The columns are split into `nthreads()` contiguous chunks and each chunk is
summed in its own `@threads` iteration; the per-chunk partial sums are then
added together. Works for any thread count (including 1) and for empty,
1x1 and non-square matrices.
"""
function upsum_threads(M)
    nrows, ncols = size(M, 1), size(M, 2)
    chunks = nthreads()
    sums = zeros(eltype(M), chunks)
    # Column i contributes about i - 1 terms, so the work up to column k grows
    # like k^2; square-root spaced boundaries give each chunk a similar share.
    chunkend = round.(Int, ncols * sqrt.((1:chunks) ./ chunks))
    # Boundaries are nondecreasing. Equal neighbours simply produce an empty
    # column range below, so small matrices need no special handling.
    chunkstart = [2; chunkend[1:end-1] .+ 1]
    @threads for job in 1:chunks
        s = zero(eltype(M))
        for i in chunkstart[job]:chunkend[job]
            # Clamp to the row count so @inbounds stays valid when M has
            # fewer rows than columns.
            @simd for j in 1:min(i - 1, nrows)
                @inbounds s += M[j, i]
            end
        end
        sums[job] = s
    end
    return sum(sums)
end
# Example: build a 10000x10000 matrix of normally distributed random values
# and sum the entries above its main diagonal.
R = randn(10000,10000)
upsum_threads(R)
它应该可以大大提高速度(即使您删除 `@threads`,也应该更快)。
您可以通过设置 `JULIA_NUM_THREADS` 环境变量来选择 Julia 使用的线程数。