我正在使用 OpenMP 编写 scrypt 的并行版本,程序已接近完成。
可并行化的部分在以下两种情况下都能正常工作:(1) 不使用 OpenMP;(2) 使用 OpenMP,并将类变量 XY 和 V 放在循环内部:
// Working variant: the scratch buffers XY and V are declared inside the loop
// body, so each iteration constructs and destroys its own pair.
// 2: for i = 0 to p - 1 do
#pragma omp parallel for
for (unsigned int i = 0; i < parallel; ++i)
{
// Per-iteration scratch buffers, sized from blockSize and cost.
SecByteBlock XY(static_cast<size_t>(blockSize * 256U));
SecByteBlock V(static_cast<size_t>(blockSize * cost * 128U));
// 3: B_i <-- MF(B_i, N)
// Iteration i works on B starting at offset blockSize*i*128.
const ptrdiff_t offset = static_cast<ptrdiff_t>(blockSize*i*128);
Smix(B+offset, static_cast<size_t>(blockSize), cost, V, XY);
}
SecByteBlock 封装了 new 和 delete,其行为与 vector 非常相似。问题在于,SecByteBlock 带有清零器(zeroizer):每次对象析构时,所有内存都会被置为 0;随后在循环顶部又要为线程重新分配一块内存。这样做效率很低。
我尝试将 XY 和 V 提升到循环之外,这样它们只会被析构一次。几种尝试如下所示,但它们都会导致段错误(segmentation fault):
$ time OMP_NUM_THREADS=4 ./test.exe
Threads: 4
Segmentation fault (core dumped)
real 0m9.561s
user 0m0.011s
sys 0m2.750s
如何将类变量提升到并行 for 循环之外,以避免不必要的反复创建/析构?
提升到循环之外:
// Attempt 1 (reported to segfault): XY and V are constructed once, before the
// parallel loop, and the same two objects are passed to Smix from every
// iteration.
// NOTE(review): with no data-sharing clause, variables declared outside an
// OpenMP parallel construct are shared, so concurrent iterations would use the
// same scratch buffers -- presumably the cause of the crash; confirm.
SecByteBlock XY(static_cast<size_t>(blockSize * 256U));
SecByteBlock V(static_cast<size_t>(blockSize * cost * 128U));
// 2: for i = 0 to p - 1 do
#pragma omp parallel for
for (unsigned int i = 0; i < parallel; ++i)
{
// 3: B_i <-- MF(B_i, N)
// Iteration i works on B starting at offset blockSize*i*128.
const ptrdiff_t offset = static_cast<ptrdiff_t>(blockSize*i*128);
Smix(B+offset, static_cast<size_t>(blockSize), cost, V, XY);
}
使用 private 子句(线程私有):
// Attempt 2 (reported to segfault): XY and V are constructed before the loop
// and listed in private() clauses on the combined parallel-for construct.
// NOTE(review): per the OpenMP specification a private copy of a class-type
// variable is default-initialized, not copied from the original, so the sizes
// requested here presumably do not carry over to the per-thread copies
// (firstprivate would copy-construct instead) -- confirm against
// SecByteBlock's default constructor.
SecByteBlock XY(static_cast<size_t>(blockSize * 256U));
SecByteBlock V(static_cast<size_t>(blockSize * cost * 128U));
// 2: for i = 0 to p - 1 do
#pragma omp parallel for private(XY) private(V)
for (unsigned int i = 0; i < parallel; ++i)
{
// 3: B_i <-- MF(B_i, N)
// Iteration i works on B starting at offset blockSize*i*128.
const ptrdiff_t offset = static_cast<ptrdiff_t>(blockSize*i*128);
Smix(B+offset, static_cast<size_t>(blockSize), cost, V, XY);
}
并行区域(parallel)加 private 子句:
// Attempt 3 (reported to segfault): XY and V are constructed inside a parallel
// region, but the loop is then annotated with a second, combined
// "parallel for" that also lists them in private() clauses.
// NOTE(review): "#pragma omp parallel for" inside "#pragma omp parallel" opens
// a nested parallel region instead of dividing the loop among the threads that
// already exist, and private() again yields default-initialized copies rather
// than the buffers sized above -- confirm.
#pragma omp parallel
{
SecByteBlock XY(static_cast<size_t>(blockSize * 256U));
SecByteBlock V(static_cast<size_t>(blockSize * cost * 128U));
// 2: for i = 0 to p - 1 do
#pragma omp parallel for private(XY) private(V)
for (unsigned int i = 0; i < parallel; ++i)
{
// 3: B_i <-- MF(B_i, N)
// Iteration i works on B starting at offset blockSize*i*128.
const ptrdiff_t offset = static_cast<ptrdiff_t>(blockSize*i*128);
Smix(B+offset, static_cast<size_t>(blockSize), cost, V, XY);
}
}
下面是测试程序。当类变量位于循环内部时,Valgrind、Coverity 和 Sanitizer 均未报告任何问题;把它们提升到循环之外时才会出现问题。
$ cat test.cxx
#include "cryptlib.h"
#include "secblock.h"
#include "scrypt.h"
#include "osrng.h"
#include "files.h"
#include "hex.h"
#include <iostream>
#include <omp.h>
int main()
{
int threads = 1;
#pragma omp parallel
{
threads = omp_get_num_threads();
}
std::cout << "Threads: " << threads << std::endl;
using namespace CryptoPP;
SecByteBlock derived(64);
const byte pwd[] = "password";
const byte salt[] = "NaCl";
Scrypt scrypt;
scrypt.DeriveKey(derived, derived.size(), pwd, 8, salt, 4, 1<<20, 8, 16);
std::cout << "Derived: ";
StringSource(derived, 16, true, new HexEncoder(new FileSink(std::cout)));
std::cout << "..." << std::endl;
return 0;
}
答案 0 :(得分:2)
从您的描述中看不出这些变量能否在多次迭代之间重复使用而无需重新初始化。不过我们先假设可以。
这样的话,可以先在并行区域内为每个线程各声明一份变量,再让 OpenMP 对其中的 for 循环进行并行化:
// Suggested fix: open one parallel region, construct the scratch buffers once
// per thread inside it, then share the loop iterations among those threads.
#pragma omp parallel
{
// Each OpenMP thread has its own variable
// (declared inside the parallel region, so they are constructed once per
// thread and destroyed when the region ends, not once per iteration).
SecByteBlock XY(static_cast<size_t>(blockSize * 256U));
SecByteBlock V(static_cast<size_t>(blockSize * cost * 128U));
// 2: for i = 0 to p - 1 do
// "omp for" (without "parallel") divides the iterations among the threads of
// the enclosing region instead of starting a new one.
#pragma omp for
for (unsigned int i = 0; i < parallel; ++i)
{
// 3: B_i <-- MF(B_i, N)
// Iteration i works on B starting at offset blockSize*i*128; XY and V are
// reused across the iterations executed by the same thread.
const ptrdiff_t offset = static_cast<ptrdiff_t>(blockSize*i*128);
Smix(B+offset, static_cast<size_t>(blockSize), cost, V, XY);
}
}