我对C ++和线程技术还很陌生,我在这个问题上呆了几天。.它应该形成fft(快速傅立叶变换)的基本代码-只是基本代码,因此仍然缺少一些东西例如旋转字词,并且输入是双数(还不是复数)。
我想用C ++对函数 f_thread 进行并行编程...这是一个有效的'可编译'代码
#include<iostream>
#include<thread>
#include <vector>
#include <mutex>
void get_odd_elements(std::vector<double> inpt, std::vector<double> &out) {
for (int i = 0; i < inpt.size()-1; i = i + 2) {out[i/2] = inpt[i];}
}
void get_even_elements(std::vector<double> inpt, std::vector<double> &out) {
for (int i = 1; i < inpt.size(); i = i + 2) {out[i/2] = inpt[i];}
}
void attach(std::vector<double> a, std::vector<double> b, std::vector<double> &out) {
for (int i = 0; i < a.size(); i++) {out[i] = a[i];}
for (int i = a.size(); i < a.size()+b.size(); i++) {out[i] = b[i];}
}
void add_vectors(std::vector<double> &x, std::vector<double> &y, std::vector<double> &z) {for (int i = 0; i < x.size(); i++) {z[i] = x[i] + y[i];}}
void sub_vectors(std::vector<double> &x, std::vector<double> &y, std::vector<double> &z) {for (int i = 0; i < x.size(); i++) {z[i] = y[i] - x[i];}}
//the f_thread function
void f_thread(std::vector<double> in, std::vector<double> &out) {
if (in.size() == 1) {out = in;}
else {
std::vector<double> f0(in.size()/2);
std::vector<double> f1(in.size()/2);
get_odd_elements(in,std::ref(f0)); //get_odd_elements is a function that gets all odd-indexed elements of f
get_even_elements(in,std::ref(f1)); //get_even_elements is a function that gets all even-indexed elements of in
std::vector<double> a(f0.size());
std::vector<double> b(f1.size());
std::mutex mtx1; std::mutex mtx2;
std::thread t0(f_thread,std::ref(f0),std::ref(a)); //create thread for f_thread on a
std::thread t1(f_thread,std::ref(f1),std::ref(b)); //create thread for f_thread on b
t0.join(); t1.join(); // join 2 threads
std::vector<double> a_out(f0.size());
std::vector<double> b_out(f1.size());
add_vectors(std::ref(a),std::ref(b),std::ref(a_out)); //call add_vectors function : a + b
sub_vectors(std::ref(a),std::ref(b),std::ref(b_out)); //call sub_vectors function : b - a
std::vector<double> f_out(in.size());
attach(a_out,b_out,std::ref(f_out)); //attach is a function that appends b to the end of a
out = f_out;
}
}
int main() {
int n_elements = 16;
std::vector<double> sample_input(n_elements);
for (int i = 0; i < n_elements; i++) {sample_input[i] = i;}
std::vector<double> output(n_elements);
std::thread start(f_thread,std::ref(sample_input),std::ref(output));
start.join();
for (int i = 0; i < n_elements; i++) {std::cout << "output element "; std::cout << i; std::cout << ": "; std::cout << output[i]; std::cout<< "\n";}
}
因此 f_thread 被初始化为线程,然后创建2个子线程,这些子线程递归调用 f_thread 。我尝试了使用互斥锁的几种技巧,但是似乎没有用,因为两个子线程之间的同步不理想(这是竞争条件的热点)。这是我尝试的一个代码,但是没有用。我还尝试使用全局递归互斥体,但仍无改善。
#include<iostream>
#include<thread>
#include <vector>
#include <mutex>
void get_odd_elements(std::vector<double> inpt, std::vector<double> &out) {
for (int i = 0; i < inpt.size()-1; i = i + 2) {out[i/2] = inpt[i];}
}
void get_even_elements(std::vector<double> inpt, std::vector<double> &out) {
for (int i = 1; i < inpt.size(); i = i + 2) {out[i/2] = inpt[i];}
}
void attach(std::vector<double> a, std::vector<double> b, std::vector<double> &out) {
for (int i = 0; i < a.size(); i++) {out[i] = a[i];}
for (int i = a.size(); i < a.size()+b.size(); i++) {out[i] = b[i];}
}
void add_vectors(std::vector<double> &x, std::vector<double> &y, std::vector<double> &z) {for (int i = 0; i < x.size(); i++) {z[i] = x[i] + y[i];}}
void sub_vectors(std::vector<double> &x, std::vector<double> &y, std::vector<double> &z) {for (int i = 0; i < x.size(); i++) {z[i] = y[i] - x[i];}}
//the f_thread function
void f_thread(std::vector<double> in, std::vector<double> &out) {
if (in.size() == 1) {out = in;}
else {
std::vector<double> f0(in.size()/2);
std::vector<double> f1(in.size()/2);
get_odd_elements(in,std::ref(f0)); //get_odd_elements is a function that gets all odd-indexed elements of f
get_even_elements(in,std::ref(f1)); //get_even_elements is a function that gets all even-indexed elements of in
std::vector<double> a(f0.size());
std::vector<double> b(f1.size());
std::mutex mtx1; std::mutex mtx2;
mtx1.lock(); std::thread t0(f_thread,std::ref(f0),std::ref(a)); mtx1.unlock(); //create thread for f_thread on a
mtx2.lock(); std::thread t1(f_thread,std::ref(f1),std::ref(b)); mtx2.unlock(); //create thread for f_thread on b
t0.join(); t1.join(); // join 2 threads
std::vector<double> a_out(f0.size());
std::vector<double> b_out(f1.size());
add_vectors(std::ref(a),std::ref(b),std::ref(a_out)); //call add_vectors function : a + b
sub_vectors(std::ref(a),std::ref(b),std::ref(b_out)); //call sub_vectors function : b - a
std::vector<double> f_out(in.size());
attach(a_out,b_out,std::ref(f_out)); //attach is a function that appends b to the end of a
out = f_out;
}
}
int main() {
int n_elements = 16;
std::vector<double> sample_input(n_elements);
for (int i = 0; i < n_elements; i++) {sample_input[i] = i;}
std::vector<double> output(n_elements);
std::thread start(f_thread,std::ref(sample_input),std::ref(output));
start.join();
for (int i = 0; i < n_elements; i++) {std::cout << "output element "; std::cout << i; std::cout << ": "; std::cout << output[i]; std::cout<< "\n";}
}
我必须验证此代码是否可以在Linux(ubuntu 18.04)操作系统中使用带有标准C ++库的g ++ f_thread.cpp -pthread编译
代码现在可以运行(不再有“异常终止的核心转储错误”),但是线程版本的输出在每次运行时都会更改(表明同步工作不正常)。
作为参考,这是不使用子线程且运行良好的顺序版本代码(即,每次运行时输出均无变化)
// WORKING sequential version
#include<iostream>
#include<thread>
#include <vector>
#include <mutex>
void get_odd_elements(std::vector<double> inpt, std::vector<double> &out) {
for (int i = 0; i < inpt.size()-1; i = i + 2) {out[i/2] = inpt[i];}
}
void get_even_elements(std::vector<double> inpt, std::vector<double> &out) {
for (int i = 1; i < inpt.size(); i = i + 2) {out[i/2] = inpt[i];}
}
void attach(std::vector<double> a, std::vector<double> b, std::vector<double> &out) {
for (int i = 0; i < a.size(); i++) {out[i] = a[i];}
for (int i = a.size(); i < a.size()+b.size(); i++) {out[i] = b[i];}
}
void add_vectors(std::vector<double> &x, std::vector<double> &y, std::vector<double> &z) {for (int i = 0; i < x.size(); i++) {z[i] = x[i] + y[i];}}
void sub_vectors(std::vector<double> &x, std::vector<double> &y, std::vector<double> &z) {for (int i = 0; i < x.size(); i++) {z[i] = y[i] - x[i];}}
//the f_thread function
void f_thread(std::vector<double> in, std::vector<double> &out) {
if (in.size() == 1) {out = in;}
else {
std::vector<double> f0(in.size()/2);
std::vector<double> f1(in.size()/2);
get_odd_elements(in,std::ref(f0)); //get_odd_elements is a function that gets all odd-indexed elements of f
get_even_elements(in,std::ref(f1)); //get_even_elements is a function that gets all even-indexed elements of in
std::vector<double> a(f0.size());
std::vector<double> b(f1.size());
f_thread(std::ref(f0),std::ref(a)); // no thread, just call recursion
f_thread(std::ref(f1),std::ref(b)); // no thread, just call recursion
std::vector<double> a_out(f0.size());
std::vector<double> b_out(f1.size());
add_vectors(std::ref(a),std::ref(b),std::ref(a_out)); //call add_vectors function : a + b
sub_vectors(std::ref(a),std::ref(b),std::ref(b_out)); //call sub_vectors function : b - a
std::vector<double> f_out(in.size());
attach(a_out,b_out,std::ref(f_out)); //attach is a function that appends b to the end of a
out = f_out;
}
}
int main() {
int n_elements = 16;
std::vector<double> sample_input(n_elements);
for (int i = 0; i < n_elements; i++) {sample_input[i] = i;}
std::vector<double> output(n_elements);
std::thread start(f_thread,std::ref(sample_input),std::ref(output));
start.join();
for (int i = 0; i < n_elements; i++) {std::cout << "output element "; std::cout << i; std::cout << ": "; std::cout << output[i]; std::cout<< "\n";}
}
应该在每次运行代码时将结果固定到此输出。
output element 0: 120
output element 1: 0
output element 2: 0
output element 3: 7.31217e-322
output element 4: 0
output element 5: 6.46188e-319
output element 6: 56
output element 7: 0
output element 8: 0
output element 9: 4.19956e-322
output element 10: 120
output element 11: 0
output element 12: 0
output element 13: 7.31217e-322
output element 14: 0
output element 15: 6.46188e-319
答案 0 :(得分:2)
这不是线程错误,而是对函数Sub TESTpdf()
Sheets("Dashboard").PageSetup.Orientation = xlLandscape
Sheets("Dashboard").ExportAsFixedFormat _
Type:=xlTypePDF, _
FileName:="test", _
Quality:=xlQualityStandard, _
IncludeDocProperties:=True, _
IgnorePrintAreas:=False, _
OpenAfterPublish:=False
End Sub
中数组元素的越界访问:
attach
在第二个循环中,索引从void attach(std::vector<double> a, std::vector<double> b, std::vector<double> &out) {
for (int i = 0; i < a.size(); i++) {out[i] = a[i];}
for (int i = a.size(); i < a.size()+b.size(); i++) {out[i] = b[i];}
}
开始,而不是从0开始-但您可以使用它来访问a.size()
的元素,就像从0开始一样。
您可以使用b
中的std::copy
来代替编写循环:
<algorithm>
在那之后,对于递归线程,您只需要这样做:
void attach(std::vector<double> a, std::vector<double> b, std::vector<double> &out) {
std::copy(a.begin(), a.end(), out.begin());
std::copy(b.begin(), b.end(), out.begin()+a.size());
}
没有种族,因为每个线程都使用单独的输入和输出数组(您在“父”线程的堆栈上创建)。结果是确定性的,对于顺序版本和线程版本都是相同的:
std::thread t0(f_thread,std::ref(f0),std::ref(a)); //create thread for f_thread on a
std::thread t1(f_thread,std::ref(f1),std::ref(b)); //create thread for f_thread on b
t0.join(); t1.join(); // join 2 threads
顺便说一句,您可能已经猜到即使您的序列号也是不正确的,因为输入数据都是整数,而您只复制,加减这些数字即可;因此没有理由让output element 0: 120
output element 1: 64
output element 2: 32
output element 3: 0
output element 4: 16
output element 5: 0
output element 6: 0
output element 7: 0
output element 8: 8
output element 9: 0
output element 10: 0
output element 11: 0
output element 12: 0
output element 13: 0
output element 14: 0
output element 15: 0
之类的浮点数出现在输出中。
也请注意戴维斯·赫林(Davis Herring)的评论:您在向量之间大量复制数据。至少,我会通过const引用而不是通过值将向量传递给函数(除非已知消除了这些副本)。
最后,您应该比输入数组的大小为1的时候更早地停止创建新线程。对于实际的问题大小,您可能无法创建数千个线程。即使成功了,创建和运行那么多线程的开销也会使您的代码运行非常缓慢。理想情况下,您创建的线程不应超过运行代码的计算机上的硬件核心。
答案 1 :(得分:1)
您应该通过询问有多少cpus,然后拆分工作并使用队列将其重新结合在一起来解决此问题。
我不知道FFT算法,但是通过粗略地查看代码,看起来您基本上是使用越来越细的齿梳来分割数据的。除了从最好的层次开始并逐步提高,这不是拆分事物的好方法。
您不希望其他CPU处理所有其他值,因为即使在单芯片多核CPU上,也存在多个L1缓存。每个L1缓存最多与另一个内核共享。因此,您希望单个CPU处理的所有值都彼此接近,以最大程度地增加您要查找的值在缓存中的机会。
因此,您应该从最大的连续块开始分割。由于FFT算法基于2的幂进行工作,因此您应计算拥有的内核数。使用thread::hardware_concurrency()
进行计数。然后取整到下一个最高的2的幂,然后将问题分解为该数量的子FFT。然后将其结果合并到主线程中。
我有一个程序可以编写您想要的功能。 splits up a list into a number of chunks to run sort on。然后它有一个需要完成的合并队列。每个块都由一个单独的线程处理,并且每个合并也都派生到它自己的线程中。
由于我不喜欢现代CPU的一项功能,即超线程,我将内核数一分为二。我本可以忽略这一点,但它会运行的很好,尽管由于主要争执已经超过了整数ALU,所以它可能会慢一点。 (超线程在单个内核中共享资源。)
在另一个答案中,听起来您的FFT代码有一些错误。我建议将其仅与一个线程一起使用,然后弄清楚如何将其拆分。