Question

过去3天，我一直在为这个问题苦苦挣扎。我在做一些图像处理方面的工作，并且已经进展到可以把工作分配给多个线程的阶段，因为我可以把图像的各个“补丁”（patch）分别传递给不同的线程。不幸的是，无论使用1个还是多个线程，处理整幅图像所花的总时间都是一样的。

于是我开始排查：为补丁制作副本，让每个线程都有自己的本地数据，并且不再把结果写入数组，但情况依旧。所以我写了一个尽可能精简的程序：每个线程创建后，会生成一个10x10矩阵，并把它的行列式输出到控制台。因此线程之间没有任何共享数据，唯一传入的只有线程索引。

但是还是一样。我在Linux和Windows上都进行了测试。这些显示了计算一个行列式所需的时间，因此，如果没有另外说明，则在使用两个线程时，每个线程花费相同的时间：

Windows：

1个线程= 4479ms

2个线程= 7500ms

3个线程= 11300ms

4个线程= 15800ms

Linux：

1个线程= 490ms

2个线程= 478ms

3个线程= 第一个：503ms；其他两个：1230ms

4个线程= 1340ms

首先很明显，Linux计算同样内容的速度快了约10倍，这点暂且不论。但在Windows上，不仅单线程性能更差，而且每增加一个线程，耗时都会进一步变长。Linux似乎只有在工作负载落到逻辑核心（超线程）上时才会变慢。这就是为什么1个和2个线程时表现正常——因为我的CPU是双核带超线程（2C + HT）；使用3个线程时，同时被超线程占用的那个核心变慢了，而另一个核心仍然正常。但Windows无论如何表现都很差。

有趣的是，在Windows上，如果我在一个核心上计算4个行列式或在每个核心上计算1个行列式，则花费的时间是相同的。

下面是我用来获得这些结果的代码。我用g++和msvc都能顺利编译。真正重要的只是最后几个函数；类中定义的一些构造函数并没有被用到。

#include <float.h>

#include <chrono>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <thread>
#include <vector>

/**
 * Owning, heap-allocated array of doubles with an explicitly stored length.
 *
 * The object keeps a raw pointer obtained from new[] and releases it in the
 * destructor. Copy operations duplicate the buffer; move operations take over
 * the source's pointer.
 */
class FVector
{
public:
    // Construction, copy/move and destruction.
    FVector();
    FVector(int length);
    FVector(const FVector &other);
    FVector(FVector &&other);
    FVector &operator=(const FVector &other);
    FVector &operator=(FVector &&other);
    ~FVector();

    // setLength() replaces the buffer with a newly allocated one.
    void setLength(int newLength);
    int getLength() const;

    // Raw access to the underlying buffer (nullptr when nothing is allocated).
    double *getData();
    const double *getConstData() const;

private:
    double *data;   // owned buffer, or nullptr
    int length;     // element count recorded for the buffer

    void allocateDataArray(int count);
    void deallocateDataArray();
};

// Creates an empty vector: no buffer allocated and a length of zero.
FVector::FVector()
    : data(nullptr), length(0) {
}

// Creates a vector of `length` elements, every element set to 0.0.
FVector::FVector(int length)
    : data(nullptr), length(length) {
    allocateDataArray(length);

    // The freshly allocated storage is uninitialised; zero-fill it.
    int idx = 0;
    while (idx < length) {
        data[idx] = 0.;
        ++idx;
    }
}

// Copy constructor: allocates a buffer of the source's length and copies
// every element across.
FVector::FVector(const FVector &vec)
    : length(vec.length) {
    allocateDataArray(length);

    int idx = 0;
    while (idx < length) {
        data[idx] = vec.data[idx];
        ++idx;
    }
}

// Move constructor: takes over the source's buffer without copying.
//
// The source is left empty (null buffer, length 0). Previously its length
// was left unchanged, so a moved-from vector reported a non-zero length
// while holding no data, and copying from it dereferenced a null pointer.
FVector::FVector(FVector &&vec) {
    data = vec.data;
    length = vec.length;

    vec.data = nullptr;
    vec.length = 0;
}

// Copy assignment: replaces this vector's contents with a copy of `vec`.
// Returns *this.
FVector &FVector::operator=(const FVector &vec) {
    // Self-assignment must be a no-op: releasing our buffer first would also
    // destroy the data we are about to copy from.
    if (this == &vec) {
        return *this;
    }

    deallocateDataArray();
    length = 0;  // stay consistent with the null buffer if the allocation throws

    allocateDataArray(vec.length);

    for (int i = 0; i < vec.length; i++) {
        data[i] = vec.data[i];
    }
    length = vec.length;

    return *this;
}

// Move assignment: releases the current buffer and takes over the buffer of
// `vec`, which is left empty (null buffer, length 0). Returns *this.
FVector &FVector::operator=(FVector &&vec) {
    // Self-move must be a no-op: freeing our buffer and then "taking" it
    // from ourselves would leave a null buffer with a stale length.
    if (this == &vec) {
        return *this;
    }

    deallocateDataArray();

    data = vec.data;
    length = vec.length;

    vec.data = nullptr;
    vec.length = 0;

    return *this;
}

// Destructor: releases the owned buffer, if there is one.
FVector::~FVector() {
    this->deallocateDataArray();
}

// Points `data` at a newly allocated array of `length` doubles.
// The previous value of `data` is overwritten without being freed, and the
// new elements are left uninitialised.
void FVector::allocateDataArray(int length) {
    double *fresh = new double[length];
    data = fresh;
}

// Frees the owned buffer and resets the pointer to nullptr.
void FVector::deallocateDataArray() {
    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    delete[] data;
    data = nullptr;
}

int FVector::getLength() const {
    return length;
}

double *FVector::getData() {
    return data;
}

void FVector::setLength(int length) {
    deallocateDataArray();
    allocateDataArray(length);
    this->length = length;
}

const double* FVector::getConstData() const {
    return data;
}

/**
 * Dense matrix of doubles stored as an array of FVector columns.
 *
 * `data` points to `columns` FVector objects, each sized to `rows` elements
 * by the (columns, rows) constructor, so the element at column c, row r is
 * data[c].getData()[r].
 */
class FMatrix
{
public:
    // Construction, copy/move and destruction.
    FMatrix();
    FMatrix(int columns, int rows);
    FMatrix(const FMatrix &other);
    FMatrix(FMatrix &&other);
    FMatrix &operator=(const FMatrix &other);
    FMatrix &operator=(FMatrix &&other);
    ~FMatrix();

    // Raw access to the column array (nullptr when nothing is allocated).
    FVector *getData();
    const FVector *getConstData() const;

    // Writes 1.0 where column index equals row index and 0.0 elsewhere.
    void makeIdentity();

    // Determinant by recursive cofactor expansion; the result is an int.
    int determinant() const;

private:
    FVector *data;  // owned array of column vectors, or nullptr
    int columns;
    int rows;

    void deallocateDataArray();
    void allocateDataArray(int count);
};


// Creates an empty matrix: no storage and zero columns/rows.
FMatrix::FMatrix()
    : data(nullptr), columns(0), rows(0) {
}

// Creates a matrix with `columns` column vectors of `rows` elements each.
// Element values are left uninitialised (setLength does not fill them).
FMatrix::FMatrix(int columns, int rows)
    : data(nullptr), columns(columns), rows(rows) {
    allocateDataArray(columns);

    int col = 0;
    while (col < columns) {
        data[col].setLength(rows);
        ++col;
    }
}

// Copy constructor: allocates one column vector per source column and copies
// each column's contents.
FMatrix::FMatrix(const FMatrix &mat) {
    data = nullptr;
    allocateDataArray(mat.columns);

    for (int i = 0; i < mat.columns; i++) {
        // FVector's copy assignment already allocates storage of the right
        // size, so sizing the column first would only allocate a buffer that
        // is thrown away immediately.
        data[i] = mat.data[i];
    }

    columns = mat.columns;
    rows = mat.rows;
}

// Move constructor: takes over the source's column array without copying.
//
// The source is left empty (null data, 0 columns, 0 rows). Previously its
// dimensions were left unchanged, so using a moved-from matrix would index
// through a null pointer.
FMatrix::FMatrix(FMatrix &&mat) {
    data = mat.data;
    columns = mat.columns;
    rows = mat.rows;

    mat.data = nullptr;
    mat.columns = 0;
    mat.rows = 0;
}

// Copy assignment: replaces this matrix with a copy of `mat`. Returns *this.
FMatrix &FMatrix::operator=(const FMatrix &mat) {
    // Self-assignment must be a no-op: releasing our columns first would
    // also destroy the data we are about to copy from.
    if (this == &mat) {
        return *this;
    }

    deallocateDataArray();
    allocateDataArray(mat.columns);

    for (int i = 0; i < mat.columns; i++) {
        // FVector's copy assignment sizes the column itself; no separate
        // setLength call (and extra allocation) is needed.
        data[i] = mat.data[i];
    }

    columns = mat.columns;
    rows = mat.rows;

    return *this;
}

// Move assignment: releases the current columns and takes over those of
// `mat`, which is left empty (null data, 0 columns, 0 rows). Returns *this.
FMatrix &FMatrix::operator=(FMatrix &&mat) {
    // Self-move must be a no-op: freeing our columns and then "taking" them
    // from ourselves would leave null data with stale dimensions.
    if (this == &mat) {
        return *this;
    }

    deallocateDataArray();

    data = mat.data;
    columns = mat.columns;
    rows = mat.rows;

    mat.data = nullptr;
    mat.columns = 0;
    mat.rows = 0;

    return *this;
}

// Destructor: releases the column array, if there is one.
FMatrix::~FMatrix() {
    this->deallocateDataArray();
}

// Destroys the column array and resets the pointer to nullptr.
void FMatrix::deallocateDataArray() {
    // delete[] on a null pointer is a no-op, so no explicit check is needed.
    delete[] data;
    data = nullptr;
}

void FMatrix::allocateDataArray(int count) {
    data = new FVector[count];
}

// Returns a mutable pointer to the column array (nullptr when empty).
FVector *FMatrix::getData() {
    return this->data;
}

void FMatrix::makeIdentity() {
    for (int i = 0; i < columns; i++) {
        for (int j = 0; j < rows; j++) {
            if (i == j) {
                data[i].getData()[j] = 1.;
            }
            else {
                data[i].getData()[j] = 0.;
            }
        }
    }
}

// Computes the determinant by recursive cofactor expansion, using element 0
// of each column as the cofactor.
//
// The result is accumulated and returned as an int, so fractional values are
// truncated. The matrix is expected to be square; a non-square matrix is not
// rejected (see the disabled check below).
int FMatrix::determinant() const {
    if (columns != rows) {
        //throw EXCEPTIONS::SINGULAR_MATRIX;
    }

    // Base cases are answered directly from the stored values. They return
    // before the scratch sub-matrix is created, so the (very numerous) leaf
    // calls of the recursion perform no heap allocation at all.
    if (columns == 1 && rows == 1) {
        return data[0].getConstData()[0];
    }

    if (columns == 2) {
        return (data[0].getConstData()[0] * data[1].getConstData()[1])
             - (data[1].getConstData()[0] * data[0].getConstData()[1]);
    }

    // One scratch minor, refilled for every term of the expansion.
    FMatrix subMatrix(columns - 1, rows - 1);
    int det = 0;
    int sign = 1;  // alternates +1, -1, ... in place of pow(-1, x)

    for (int x = 0; x < columns; x++) {
        // Build the minor: every column except x, dropping element 0.
        int subi = 0;

        for (int i = 0; i < columns; i++) {
            if (i == x) {
                continue;
            }

            const double *source = data[i].getConstData();
            double *target = subMatrix.data[subi].getData();

            for (int j = 1; j < columns; j++) {
                target[j - 1] = source[j];
            }

            subi++;
        }

        det += (sign * data[x].getConstData()[0] * subMatrix.determinant());
        sign = -sign;
    }

    return det;
}

// Returns a read-only pointer to the column array (nullptr when empty).
const FVector *FMatrix::getConstData() const {
    return this->data;
}

/**
 * Driver for the threading experiment: process() starts the worker threads
 * and waits for all of them to finish.
 */
class FCore
{
public:
    FCore();
    ~FCore();

    // Runs the experiment once.
    void process();

private:
    // Number of worker threads to start (always at least 1).
    int getMaxThreads() const;

    // Joins the first `count` threads of the array `workers`.
    void joinThreads(std::thread *workers, int count);
};


// Thread body of the experiment: builds a 10x10 identity matrix, computes its
// determinant and prints the result together with the elapsed time in
// milliseconds. `i` is only used to label the output.
void parallelTest(int i) {
    const auto start = std::chrono::high_resolution_clock::now();
    FMatrix m(10, 10);
    m.makeIdentity();
    const int det = m.determinant();
    // Stop the clock before printing: console output is not part of the work
    // being measured and would otherwise be included in the reported time.
    const auto finish = std::chrono::high_resolution_clock::now();

    const auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(finish - start);
    std::cout << "Det: " << i << "= " << det << std::endl;
    std::cout << "Time: " << microseconds.count() / 1000. << std::endl;
}

// Nothing to initialise: FCore has no data members.
FCore::FCore() = default;


// Nothing to release: FCore owns no resources.
FCore::~FCore() = default;

// Starts one parallelTest worker per available thread, waits for all of them
// to finish, then blocks on a key press so the console output stays visible.
void FCore::process() {
    /*********************************************/
    /*Set this to limit number of created threads*/
    const int threadCount = getMaxThreads();
    /*********************************************/

    // End the line so the workers' output does not run on after the count.
    std::cout << "Thread count: " << threadCount << std::endl;

    // The vector owns the thread objects, so nothing leaks on early exit.
    std::vector<std::thread> threads;
    threads.reserve(threadCount);

    try {
        for (int i = 0; i < threadCount; i++) {
            threads.emplace_back(parallelTest, i);
        }
    }
    catch (...) {
        // A joinable std::thread must not be destroyed: join the workers
        // that did start before passing the error on.
        joinThreads(threads.data(), static_cast<int>(threads.size()));
        throw;
    }

    joinThreads(threads.data(), static_cast<int>(threads.size()));
    std::getchar();
}

// Returns the value reported by std::thread::hardware_concurrency(), or 1
// when that value is 0.
int FCore::getMaxThreads() const {
    const int detected = std::thread::hardware_concurrency();
    return (detected == 0) ? 1 : detected;
}

// Joins threads[0] .. threads[max - 1] in order.
void FCore::joinThreads(std::thread *threads, int max) {
    int idx = 0;
    while (idx < max) {
        threads[idx].join();
        ++idx;
    }
}

int main() {
    FCore core;

    core.process();
    return 0;
}

当然，我也用更简单的计算做过测试，比如只是做数字相加，结果也是一样的。所以我想问问大家是否遇到过类似的情况。我知道在Windows上无法获得像Linux上那样好的耗时，但至少扩展性应该更好一些。

测试环境：Win7 / Linux（Intel，2核 + 2超线程）以及Win10（Ryzen，8核 + 8超线程）。上面给出的时间来自2核 + 2超线程的那台机器。

添加线程会增加执行相同任务所需的时间

0 个答案: