Question

我正在尝试编写一段代码，该代码经过8 ^ 12次迭代的循环，并且在每次迭代中，当满足某些条件时，我都将push_back返回到一个向量（每个线程将自己的向量传递给push_back，我在循环后合并）。但是似乎我的执行花费更多的时间，更多的线程处于活动状态。这是传递给每个线程的函数（对象的方法）：

void HamiltonianKH::mapping_kernel(ull_int start, ull_int stop, std::vector<ull_int>* map_threaded, int _id) {
int n = 1;
out << "A new thread joined tha party! from " << start << " to " << stop << endl;
for (ull_int j = start; j < stop; j++) {
    int bSz = 0, fSz = 0, N_e = 0;
    std::tie(bSz, fSz, N_e) = calculateSpinElements(this->L, j);
    if ((bSz + fSz == this->Sz) && N_e == this->num_of_electrons) 
         map_threaded->push_back(j);
    if (show_system_size_parameters == true && (j - start) % ull_int((stop - start) * n / 4) == 0 && j > 0) { 
        out << n << "-th quarter of " << _id << endl; 
        n++; 
    }
}

} ，这是caulculate_spinelements函数：

std::tuple<int, int, int> calculateSpinElements(int L, ull_int& j) {
int bSz = 0; //bosonic total spin - spin of upper orbital locked to n=1 filling
int fSz = 0; //fermionic total spin
int N_e = 0; // numer of electrons in given state
std::vector<int> temp = int_to_binary(j, L);

for (int k = 0; k < L; k++) {
    if (temp[k] < 4) bSz += 1;
    else bSz -= 1;
    if (temp[k] % 4 == 1) {
        fSz += 1;
        N_e += 1;
    }
    else if (temp[k] % 4 == 2) {
        fSz -= 1;
        N_e += 1;
    }
    else if (temp[k] % 4 == 3)
        N_e += 2;
}

return std::make_tuple(bSz, fSz, N_e);

}

和她是线程的分离：

void HamiltonianKH::generate_mapping() {
ull_int start = 0, stop = std::pow(8, L);
//mapping_kernel(start, stop, mapping, L, Sz, num_of_electrons);
//Threaded
std::vector<std::vector<ull_int>*> map_threaded(num_of_threads);
std::vector<std::thread> threads;
threads.reserve(num_of_threads);
for (int t = 0; t < num_of_threads; t++) {
    start = t * (ull_int)std::pow(8, L) / num_of_threads;
    stop = ((t + 1) == num_of_threads ? (ull_int)std::pow(8, L) : (ull_int)std::pow(8, L) * (t + 1) / num_of_threads);
    map_threaded[t] = new std::vector<ull_int>();
    threads.emplace_back(&HamiltonianKH::mapping_kernel, this, start, stop, map_threaded[t], t);
}
for (auto& t : threads) t.join();
for (auto& t : threads) t.~thread();

ull_int size = 0;
for (auto& t : map_threaded) {
    size += t->size();
}

out << "size = " << size << endl;
for (auto & t : map_threaded)
    mapping->insert(mapping->end(), t->begin(), t->end());
//sort(mapping->begin(), mapping->end());
if (show_system_size_parameters == true) {
    out << "Mapping generated with  " << mapping->size() << "  elements" << endl;
    out << "Last element = " << mapping->at(mapping->size() - 1) << endl;
}
//out << mapping[0] << " " << mapping[mapping.size() - 1] << endl;
assert(mapping->size() > 0 && "Not possible number of electrons - no. of states < 1");

}

变量：映射，L，num_of_electrons和Sz是对象中的公共字段。整个代码有2000行以上，但是generate_mapping（）调用之后的执行与该问题无关。

你们有一个主意吗，为什么这段代码在更多线程上执行得更长？

非常感谢您。

与单线程相比，多线程C ++的执行时间更长

0 个答案: