C++11 多线程比单线程慢

时间:2014-12-07 15:36:06

标签: c++ multithreading c++11 graphics

我是多任务处理的纯初学者,读了一些基础知识后,尝试把它用到我的对象可视化项目中。问题是,我实现的多线程方案比单线程方案还慢,我不知道原因;此外,应用程序还会以一个我不明白原因的意外代码退出。下面给出我试图获得更好性能的两个案例。我想知道自己哪里理解有误,以及总体思路上错在哪里。我会贴出部分源代码,并在最后汇总所有问题。

这是我的线程工厂实现(非常基本,但它只是开始):

threadfactory.h

#pragma once

#include <vector>
#include "ThreadInterface.h"
#include "../MemoryManagement/MemoryMgr.h"
#include "../Logging/LoggingDefines.h"

/**
*   Distributes wave updates and vertex transformations over a configurable
*   number of threads. The thread count is stored in muiNumberofThreads and is
*   set by the constructors or by Init().
*/
class CThreadFactory : public CThreadIntearface
{
    public:
        /** Creates a factory that uses a single thread. */
        CThreadFactory();
        /** Creates a factory and forwards \a max_threads to Init(). */
        CThreadFactory(BYTE max_threads);
        ~CThreadFactory();

        /** Sets the number of threads used by the update methods. */
        void Init(BYTE max_threads);
        /** Release hook called from the destructor. */
        void Clear();

        // Wave updates: each call processes the interior rows of \a waves.
        virtual void UpdateWavesInternalPoints(CWaves& waves);
        virtual void UpdateWavesNormals(CWaves& waves);

        // Vertex update: writes one transformed position into \a output for
        // every vertex of \a input.
        virtual void TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix);

        /** Name passed to DEFINE_HEAP for this class. */
        static const char* GetHeapName() { return "Thread factory"; }
#if defined(DEBUG) || defined(_DEBUG)
        /**
        *   Return class name. This function is compiled only in debug mode.
        *   \return class name
        */
        NAME_FUNC();
#endif

    private:
        /** Joins every joinable thread in \a threads. */
        void Join(vector<std::thread>& threads);
        /** Empties \a threads. */
        void ReleaseThreads(vector<std::thread>& threads);

        /** Total number of threads to use, including the calling thread. */
        UINT muiNumberofThreads;

        DECLARE_HEAP;
};

threadfactory.cpp

#include "ThreadFactory.h"

/**
*   Default constructor: configures the factory to run everything on a
*   single thread.
*/
CThreadFactory::CThreadFactory()
    : muiNumberofThreads(1)
{
    // The message is built inside TRACE so nothing is evaluated when the
    // macro expands to nothing.
    TRACE(LOG_DEBUG, string("Start of initialization of object \"") + GetName() + "\"");
    TRACE(LOG_DEBUG, string("End of initialization of object \"") + GetName() + "\"");
}

/**
*   Constructs the factory with an explicit thread count.
*   \param max_threads number of threads the update methods may use
*/
CThreadFactory::CThreadFactory(BYTE max_threads)
    : muiNumberofThreads(max_threads) // same assignment Init() performs
{
    TRACE(LOG_DEBUG, string("Start of initialization of object \"") + GetName() + "\"");
    TRACE(LOG_DEBUG, string("End of initialization of object \"") + GetName() + "\"");
}

/**
*   Destructor: logs the release and delegates the cleanup to Clear().
*/
CThreadFactory::~CThreadFactory()
{
    TRACE(LOG_DEBUG, string("Start of releasing of object \"") + GetName() + "\"");

    Clear();

    TRACE(LOG_DEBUG, string("End of releasing of object \"") + GetName() + "\"");
}

/**
*   Stores the number of threads the update methods should use.
*   \param max_threads total thread count; values of 0 or 1 make the update
*                      methods take their single-threaded path
*/
void CThreadFactory::Init(BYTE max_threads)
{
    this->muiNumberofThreads = static_cast<UINT>(max_threads);
}

/**
*   Release hook invoked by the destructor. The factory keeps no thread
*   objects as members, so there is currently nothing to free.
*/
void CThreadFactory::Clear()
{
}

/**
*   Waits for every thread in the container that can still be joined.
*   \param threads worker threads to wait for
*/
void CThreadFactory::Join(vector<std::thread>& threads)
{
    for (auto worker = threads.begin(); worker != threads.end(); ++worker)
    {
        // Default-constructed or already joined threads are skipped.
        if (!worker->joinable())
            continue;

        worker->join();
    }
}

/**
*   Destroys all thread objects held by the container.
*   \param threads container to empty
*/
void CThreadFactory::ReleaseThreads(vector<std::thread>& threads)
{
    // No per-thread work is done here; the elements are simply destroyed.
    threads.erase(threads.begin(), threads.end());
}

/**
*   Updates the interior points of \a waves, splitting the interior rows into
*   one contiguous slice per thread.
*
*   Row bounds are half-open, matching the loop in
*   CWaves::UpdateWaveInteriorPoints: [1, RowCount() - 1) is every interior row.
*   muiNumberofThreads - 1 worker threads each take RowCount() / muiNumberofThreads
*   rows; the calling thread takes everything that is left, so rows lost to the
*   integer division are still processed.
*
*   \param waves wave object to update; it must stay alive for the whole call
*                (all workers are joined before returning)
*/
void CThreadFactory::UpdateWavesInternalPoints(CWaves& waves)
{
    // One past the last interior row.
    const DWORD dwEndRow = waves.RowCount() - 1;

    if (muiNumberofThreads <= 1)
    {
        waves.UpdateWaveInteriorPoints(1, dwEndRow);
        return;
    }

    const UINT uiWorkerCount = muiNumberofThreads - 1;
    const DWORD dwRowsPerThread = waves.RowCount() / muiNumberofThreads;

    // The threads are created and destroyed on every call, so the caller pays
    // the thread start-up cost each time.
    vector<std::thread> threads;
    threads.reserve(uiWorkerCount);

    DWORD dwMinRow = 1;
    for (UINT i = 0; i < uiWorkerCount; ++i)
    {
        threads.emplace_back(&CWaves::UpdateWaveInteriorPoints, &waves, dwMinRow, dwMinRow + dwRowsPerThread);
        dwMinRow += dwRowsPerThread;
    }

    // The calling thread handles the final slice up to the last interior row.
    // Ending at dwMinRow + dwRowsPerThread instead would skip the trailing rows
    // whenever RowCount() is not (almost) a multiple of the thread count, and
    // would process nothing at all when RowCount() < muiNumberofThreads.
    waves.UpdateWaveInteriorPoints(dwMinRow, dwEndRow);

    Join(threads);
    ReleaseThreads(threads);
}

/**
*   Recomputes normals and tangents of \a waves, splitting the interior rows
*   into one contiguous slice per thread.
*
*   Row bounds are half-open, matching the loop in CWaves::UpdateWaveNormals:
*   [1, RowCount() - 1) is every interior row. muiNumberofThreads - 1 worker
*   threads each take RowCount() / muiNumberofThreads rows; the calling thread
*   takes everything that is left, so rows lost to the integer division are
*   still processed.
*
*   \param waves wave object to update; it must stay alive for the whole call
*                (all workers are joined before returning)
*/
void CThreadFactory::UpdateWavesNormals(CWaves& waves)
{
    // One past the last interior row.
    const DWORD dwEndRow = waves.RowCount() - 1;

    if (muiNumberofThreads <= 1)
    {
        waves.UpdateWaveNormals(1, dwEndRow);
        return;
    }

    const UINT uiWorkerCount = muiNumberofThreads - 1;
    const DWORD dwRowsPerThread = waves.RowCount() / muiNumberofThreads;

    // The threads are created and destroyed on every call, so the caller pays
    // the thread start-up cost each time.
    vector<std::thread> threads;
    threads.reserve(uiWorkerCount);

    DWORD dwMinRow = 1;
    for (UINT i = 0; i < uiWorkerCount; ++i)
    {
        threads.emplace_back(&CWaves::UpdateWaveNormals, &waves, dwMinRow, dwMinRow + dwRowsPerThread);
        dwMinRow += dwRowsPerThread;
    }

    // The calling thread handles the final slice up to the last interior row.
    // Ending at dwMinRow + dwRowsPerThread instead would skip the trailing rows
    // whenever RowCount() is not (almost) a multiple of the thread count, and
    // would process nothing at all when RowCount() < muiNumberofThreads.
    waves.UpdateWaveNormals(dwMinRow, dwEndRow);

    Join(threads);
    ReleaseThreads(threads);
}

void CThreadFactory::TransformVertices(const CObject& object, const vector<TVertex>& input, vector<XMFLOAT3>& output, const CXNAMatrix& matrix)
{
    if (output.size() != input.size())
        output.resize(input.size());

    if ((muiNumberofThreads <= 1) || (input.size() < 1000))
    {
        object.TransformVerticesSet(input.begin(), output.begin(), input.size() - 1, matrix);
    }
    else
    {
        vector<std::thread> threads(muiNumberofThreads - 1);
        UINT uiThreadVertexCount = input.size() / muiNumberofThreads;
        UINT uiStartVertexIndex = 0;

        for (UINT i = 0; i < muiNumberofThreads - 1; i++)
        {
            if (uiStartVertexIndex >= input.size())
                uiStartVertexIndex = input.size() - 1;

            threads[i] = move(std::thread{ &CObject::TransformVerticesSet, &object, input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix });

            uiStartVertexIndex += uiThreadVertexCount;
        }

        object.TransformVerticesSet(input.begin() + uiStartVertexIndex, output.begin() + uiStartVertexIndex, uiThreadVertexCount - 1, matrix);

        Join(threads);
        ReleaseThreads(threads);
    }
}

// Debug-only counterpart of the NAME_FUNC() declaration in the class: per the
// author's description, NAME_BODY defines the method that returns the class name.
#if (defined(DEBUG) | defined(_DEBUG))

NAME_BODY(CThreadFactory, "Threads");

#endif

// Counterpart of DECLARE_HEAP in the class: per the author's description,
// DEFINE_HEAP provides the definitions of the overloaded new and delete
// operators; GetHeapName() supplies the heap name "Thread factory".
DEFINE_HEAP(CThreadFactory, GetHeapName());

1. Wave 更新:

我使用一个名为 Wave 的对象。该对象包含约 40 000 个顶点。我在每一帧用以下函数更新它:

/**
*   Advances the wave simulation for the rows [min_row, max_row).
*
*   The new height of every interior point is written into the previous
*   solution buffer; the current solution (mpObjectMesh->mVertices) is only read.
*
*   \param min_row first row to update; raised to 1 if smaller
*   \param max_row one past the last row to update; lowered to RowCount() - 1 if larger
*/
void CWaves::UpdateWaveInteriorPoints(DWORD min_row, DWORD max_row)
{
    // Restrict the range to interior rows.
    min_row = (min_row < 1) ? 1 : min_row;

    const auto lastRow = RowCount() - 1;
    if (max_row > lastRow)
        max_row = lastRow;

    const auto columns = ColumnCount();
    const auto k1 = GetK1();
    const auto k2 = GetK2();
    const auto k3 = GetK3();

    auto& previous = GetPrevSolutionVertices();
    const auto& current = mpObjectMesh->mVertices;

    for (DWORD row = min_row; row < max_row; ++row)
    {
        for (DWORD col = 1; col < columns - 1; ++col)
        {
            // col indexes x and row indexes z: h(x_col, z_row, t_k). The +z axis
            // goes "down" so that it agrees with the row indices.
            const auto center = row*columns + col;

            // The old previous value is not needed again once the new height is
            // computed, so it is overwritten in place; the assignment happens last.
            previous[center].Position.y =
                k1*previous[center].Position.y +
                k2*current[center].Position.y +
                k3*(current[(row + 1)*columns + col].Position.y +
                current[(row - 1)*columns + col].Position.y +
                current[center + 1].Position.y +
                current[center - 1].Position.y);
        }
    }
}

/**
*   Recomputes the normal and tangent of every interior vertex in the rows
*   [min_row, max_row) from the heights of its four neighbours.
*
*   \param min_row first row to update; raised to 1 if smaller
*   \param max_row one past the last row to update; lowered to RowCount() - 1 if larger
*/
void CWaves::UpdateWaveNormals(DWORD min_row, DWORD max_row)
{
    // Restrict the range to interior rows.
    min_row = (min_row < 1) ? 1 : min_row;

    const auto lastRow = RowCount() - 1;
    if (max_row > lastRow)
        max_row = lastRow;

    const auto columns = ColumnCount();
    auto& vertices = mpObjectMesh->mVertices;

    for (UINT row = min_row; row < max_row; ++row)
    {
        for (UINT col = 1; col < columns - 1; ++col)
        {
            const auto center = row*columns + col;

            // Heights of the left, right, top and bottom neighbours.
            const float left = vertices[center - 1].Position.y;
            const float right = vertices[center + 1].Position.y;
            const float top = vertices[(row - 1)*columns + col].Position.y;
            const float bottom = vertices[(row + 1)*columns + col].Position.y;

            auto& vertex = vertices[center];

            // Finite-difference normal, normalized in place.
            vertex.Normal.x = -right + left;
            vertex.Normal.y = 2.0f*GetSpatialStep();
            vertex.Normal.z = bottom - top;
            XMStoreFloat3(&vertex.Normal, XMVector3Normalize(XMLoadFloat3(&vertex.Normal)));

            // Tangent along x, normalized in place.
            vertex.TangentU = XMFLOAT3(2.0f*GetSpatialStep(), right - left, 0.0f);
            XMStoreFloat3(&vertex.TangentU, XMVector3Normalize(XMLoadFloat3(&vertex.TangentU)));
        }
    }
}

void CWaves::UpdateWave(float dt)
{
    static float t_base = 0.0f;
    if ((g_Timer->TotalTime() - t_base) >= 0.25f)
    {
        t_base += 0.25f;

        DWORD i, j;

        do
        {
            i = 5 + rand() % (RowCount() - 5);
            j = 5 + rand() % (ColumnCount() - 5);
        } while (!((i > 1) && (i < (RowCount() - 2)) &&
            (j > 1) && (j < (ColumnCount() - 2))));

        float r = MathHelper::RandF(1.0f, 2.0f);

        Disturb(i, j, r);
    }

    static float t = 0;

    // Accumulate time.
    t += dt;

    // Only update the simulation at the specified time step.
    if (t >= TimeStep())
    {
        // Only update interior points; we use zero boundary conditions.
        if (g_ThreadFactory)
        {
            g_ThreadFactory->UpdateWavesInternalPoints(*this);
        }
        else
        {
            UpdateWaveInteriorPoints(1, RowCount() - 1);
        }

        // We just overwrote the previous buffer with the new data, so
        // this data needs to become the current solution and the old
        // current solution becomes the new previous solution.
        std::swap(GetPrevSolutionVertices(), mpObjectMesh->mVertices);

        t = 0.0f; // reset time

        if (mShapeDescription.mShapeProperties.bLightedObject)
        {
            //
            // Compute normals using finite difference scheme.
            //
            if (g_ThreadFactory)
            {
                g_ThreadFactory->UpdateWavesNormals(*this);
            }
            else
            {
                UpdateWaveNormals(1, RowCount() - 1);
            }
        }
    }
}

在这个案例中,我认为问题出在我把同一个 CWaves 对象传给了所有线程,我猜这会导致持续的锁定。所以在另一个案例中我换了做法:用给定的变换矩阵来变换顶点,并且不传递整个对象,而是传递容器迭代器。

2. 顶点变换

从上面显示的线程工厂调用的顶点变换方法:

/**
*   Transforms a run of vertex positions by \a matrix.
*
*   \param input              iterator to the first source vertex
*   \param output             iterator to the first destination position
*   \param number_of_vertices offset of the LAST vertex to process; the bound is
*                             inclusive, so number_of_vertices + 1 vertices are transformed
*   \param matrix             transformation applied to every position
*/
void CObject::TransformVerticesSet(vector<TVertex>::const_iterator input, vector<XMFLOAT3>::iterator output, UINT number_of_vertices, const CXNAMatrix& matrix) const
{
    vector<TVertex>::const_iterator source = input;
    vector<XMFLOAT3>::iterator target = output;

    for (UINT done = 0; done <= number_of_vertices; ++done, ++source, ++target)
    {
        CMatrixTransformations::TransformPoint(source->Position, matrix, *target);
    }
}

在这种情况下,我尝试使用迭代器而不是传入整个顶点向量,但结果与前一个方案相同:它仍然比单线程方案慢。

编辑 <2014年12月8日>

在之前的代码中,我使用以下宏:

TRACE - 用于记录系统,在发布模式下为空

NAME_FUNC,NAME_BODY - 用于声明的宏和返回类名的类方法的定义

DECLARE_HEAP,DEFINE_HEAP - 为重载的new和delete运算符创建声明和定义

这些都不会影响多线程操作的性能。

关闭我的应用程序后,VS 2013 的输出如下(请注意,这次运行中我并没有对前面的两个案例启用多线程):

The thread 0x229c has exited with code 27 (0x1b).
The thread 0x22dc has exited with code 27 (0x1b).
The thread 0x11ac has exited with code 27 (0x1b).
The thread 0x328c has exited with code 27 (0x1b).
The thread 0x205c has exited with code 27 (0x1b).
The thread 0xf4c has exited with code 27 (0x1b).
The thread 0x894 has exited with code 27 (0x1b).
The thread 0x3094 has exited with code 27 (0x1b).
The thread 0x2eb4 has exited with code 27 (0x1b).
The thread 0x2ef8 has exited with code 27 (0x1b).
The thread 0x22f4 has exited with code 27 (0x1b).
The thread 0x2810 has exited with code 27 (0x1b).
The thread 0x29e0 has exited with code 27 (0x1b).
The thread 0x2e54 has exited with code 27 (0x1b).
D3D11 WARNING: Process is terminating. Using simple reporting. Please call ReportLiveObjects() at runtime for standard reporting. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING: Live Producer at 0x012F05A0, Refcount: 8. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING:  Live Object at 0x012F1D38, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN]
D3D11 WARNING:  Live Object at 0x013BA3F8, Refcount: 0. [ STATE_CREATION WARNING #0: UNKNOWN]

The program '[13272] EngineDX.exe' has exited with code 27 (0x1b).

似乎第三方API(可能是DX)正在创建线程,但在进程管理器中我看到只使用一个线程。那可能是个问题...

以下是我的问题:

  1. 我的线程工厂的实现是错误的还是更新4万个顶点不必分成更多的线程?
  2. 如果我锁定了,我想知道原因。顶点变换的解决方案是使用迭代器和顶点向量容器被分割,所以我不应该有锁定。
  3. 我为每次函数调用都创建线程,是有原因的。起初我把线程的 vector 容器作为线程工厂类的成员,但这在调试模式下会导致内存泄漏(发布模式没有这个问题)——仅仅是声明了这个成员,什么都没做。我一直没找到原因。要正确释放线程,是否还需要做别的事情?
  4. 现在我的应用程序以代码27结束,因为所有线程都返回了此错误代码。这是什么意思?
  5. 奇怪的是,当我使用 8 个线程(在 8 线程的 CPU 上,7 个工作线程 + 主线程)时,在调试模式下我看到 8 个线程都在工作;但在发布模式下,与只使用一个线程(主线程)相比没有任何变化。这是错误的行为,还是出于某些原因属于预期情况?
  对于长篇文章感到抱歉,但我想尽量写得精确以避免误解。谢谢你们的回答。

    编辑17.12.2014:

    我重新实现了线程使用的函数(并使其独立于Wave类),没有共享对象引用或变量,但它仍然无法工作。我不明白为什么......有趣的是,当我设置8个线程使用时,在调试可执行文件中我看到我的Core i7以100%运行,但没有帧速率的好处。使用发布可执行文件,我看到只有4个线程运行,CPU占25%。

    新的多线程函数:

    /**
    *   Stand-alone version of the interior point update that works only on the
    *   values passed in, without touching a CWaves object.
    *
    *   Rows [min_row, max_row) are updated. Rows are counted relative to the
    *   position the two iterators point at.
    *
    *   \param previous_vertex_field iterator into the previous solution; receives the new heights
    *   \param actual_vertex_field   iterator into the current solution; only read
    *   \param min_row               first row to update; raised to 1 if smaller
    *   \param max_row               one past the last row to update (not clamped here)
    *   \param k1, k2, k3            coefficients of the update formula
    *   \param column_count          number of vertices per row
    */
    void UpdateWaveInteriorPoints(TVertexFieldIterator previous_vertex_field, TVertexFieldIterator actual_vertex_field, DWORD min_row, DWORD max_row, float k1, float k2, float k3, UINT column_count)
    {
        min_row = (min_row < 1) ? 1 : min_row;

        // The upper clamp against RowCount() - 1 is intentionally absent: this
        // function has no access to the row count.

        for (DWORD row = min_row; row < max_row; ++row)
        {
            for (DWORD col = 1; col < column_count - 1; ++col)
            {
                // col indexes x and row indexes z: h(x_col, z_row, t_k). The +z
                // axis goes "down" so that it agrees with the row indices.
                const auto center = row*column_count + col;

                // The old previous value is not needed again once the new height
                // is computed, so it is overwritten in place; the assignment
                // happens last.
                previous_vertex_field[center].Position.y =
                    k1*previous_vertex_field[center].Position.y +
                    k2*actual_vertex_field[center].Position.y +
                    k3*(actual_vertex_field[(row + 1)*column_count + col].Position.y +
                    actual_vertex_field[(row - 1)*column_count + col].Position.y +
                    actual_vertex_field[center + 1].Position.y +
                    actual_vertex_field[center - 1].Position.y);
            }
        }
    }
    

    创建线程的函数:

    // Fragment of the thread-spawning function (its header is not shown).
    // Both iterators start at the beginning of their vertex field and are
    // advanced by one slice (dwVertexCount vertices) per spawned thread.
    TVertexFieldIterator tActualVertexIterator = waves.mpObjectMesh->mVertices.begin();
            TVertexFieldIterator tPreviousVertexIterator = waves.GetPrevSolutionVertices().begin();
            std::vector<std::thread> threads;
            //std::vector<std::future<void>> threads;
            // Rows per thread; the remainder of the integer division is not
            // assigned to any slice here.
            UINT dwWavePartDifference = waves.RowCount() / muiNumberofThreads;
    
            // These row bounds are never modified below, so every call receives
            // the same relative range [1, dwWavePartDifference); the slice is
            // selected only by the shifted iterators.
            DWORD dwMinRow = 1, dwMaxRow = dwWavePartDifference;
            DWORD dwVertexCount = dwWavePartDifference*waves.ColumnCount();
    
            // NOTE(review): each worker starts at relative row 1 of its slice, so
            // relative row 0 of every slice after the first (absolute row
            // k*dwWavePartDifference) appears not to be written by any worker,
            // while the stencil of the preceding slice's last row reads it -
            // confirm the coverage of slice-boundary rows.
            for (UINT i = 0; i < muiNumberofThreads - 1; i++)
            {
                //threads.emplace_back(std::async( std::launch::async, &CWaves::UpdateWaveInteriorPoints, &waves, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount() ));
                threads.emplace_back(std::thread(&UpdateWaveInteriorPoints, tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount()));
    
                tActualVertexIterator += dwVertexCount;
                tPreviousVertexIterator += dwVertexCount;
            }
    
            // Step both iterators back by one row so that relative row 1 of the
            // final slice is the slice's own first row.
            tPreviousVertexIterator -= waves.ColumnCount(); //row - 1
            tActualVertexIterator -= waves.ColumnCount(); //row - 1
            // The calling thread processes the final slice itself.
            // NOTE(review): this call goes through `waves`, while the workers use
            // the free function - confirm CWaves declares a matching overload.
            // NOTE(review): with the unchanged dwMaxRow the last rows up to
            // RowCount() - 2 look uncovered - verify.
            waves.UpdateWaveInteriorPoints(tPreviousVertexIterator, tActualVertexIterator, dwMinRow, dwMaxRow, waves.GetK1(), waves.GetK2(), waves.GetK3(), waves.ColumnCount());
    
            // Wait for all workers before the vertex data is used again.
            for (UINT i = 0; i < muiNumberofThreads -1; i++)
            {
                //threads[i].wait();
                threads[i].join();
            }
    

    马立克

1 个答案:

答案 0 :(得分:2)

@mareknr 当我打开你的问题时,侧边栏中有 10 个相关的问答,全都与多线程实现比单线程实现慢的原因有关。我认为其中的一个或多个能解决你的问题。以下是其中一些的链接:

Multi-threaded GEMM slower than single threaded one?

C++ Boost Multithread is slower than single thread because of CPU type?

Why is this OpenMP program slower than single-thread?

2 threads slower than 1?