Question

我有一个采用面向数据设计的Boids仿真，在没有任何多线程代码的情况下，它以30 fps的速度运行。我决定使用专用线程来更新对象以提高性能，但是性能反而从30 fps下降到了10 fps。数据是全局的，因此可以被任何线程访问。

一开始，我认为这是伪共享（false sharing）的问题，因此我尝试仅使用一个线程来运行更新代码，但是与非线程版本相比，我的fps仍然较低。值得一提的是，主线程仅为了读取（渲染）而访问数据，从不写入数据。如果代码太多，请见谅，我已尽量只提取其中最重要的部分，以便您了解其工作原理。

// Structure-of-arrays storage for per-actor data: each member points to an
// array that holds one field for many actors.
// CActor::Update indexes Position with the actor's ActorIdx; the other arrays
// are presumably indexed the same way — TODO confirm.
struct SActorsData
{
    SColor*     Color;      // per-actor color
    SVector3*   Position;   // per-actor position; written by CActor::Update
    SVector3*   Scale;      // per-actor scale
    Mesh**      MeshPtr;    // per-actor mesh pointer
};

// Structure-of-arrays storage for boid-specific data.
// The "." lines below stand for further fields elided from this excerpt.
struct SBoidsData
{
    SVector3*       Velocity;   // read by CActor::Update via ActorIdx; allocated with new[] in main
    float*          Radius;     // per-boid radius
    .
    .
    .
};


// One flag per worker thread, indexed by the worker's ThreadIdx. main sets a
// flag to true to request an update pass; WorkerUpdate sets it back to false
// once its pass is finished.
// NOTE(review): std::vector<bool> is a packed-bit proxy container, and these
// flags are read and written from several threads with no synchronization
// visible in this excerpt. Neighbouring flags likely share one machine word,
// so even writes to different indices can conflict — consider one
// std::atomic<bool> per worker instead.
std::vector<bool> WorkersSemaphore;
// Worker threads created in main; each one runs WorkerUpdate.
std::vector<std::thread> Workers;
// Actors to simulate; every worker calls Update() on a contiguous index range.
std::vector<CActor*> Actors;
// Global structure-of-arrays data accessed by CActor::Update.
SActorsData ActorsData;
SBoidsData BoidsData;


// Handle for a single actor. The visible code keeps the actor's data in the
// global ActorsData / BoidsData arrays and stores only an index here.
struct CActor
{
   // Index of this actor's slot in the ActorsData / BoidsData arrays
   uint32_t ActorIdx;


   // Per-frame update of this actor. The "." lines stand for steps elided from
   // this excerpt; the visible step integrates the position with the velocity.
   // NOTE(review): no return type is shown here; presumably void — confirm.
   Update()
   {
        .
        .
        .
       // Position += Velocity * DeltaTime, using this actor's own slot in both arrays
       ActorsData.Position[ActorIdx] = ActorsData.Position[ActorIdx] +  BoidsData.Velocity[ActorIdx] * DeltaTime;      
   }
};


// Worker thread entry point.
//
// Polls this worker's entry in WorkersSemaphore for as long as bIsAppRunning
// is true. Whenever the flag is found set, calls Update() on every actor in
// the half-open index range [ActorsStartIdx, ActorsEndIdx) and then clears the
// flag, which is how main detects that the pass is finished.
//
//   ThreadIdx      - index of this worker's flag in WorkersSemaphore
//   ActorsStartIdx - first actor index handled by this worker
//   ActorsEndIdx   - one past the last actor index handled by this worker
//
// NOTE(review): the flag is polled in a tight loop without sleeping or
// yielding, and WorkersSemaphore is a plain std::vector<bool> shared with main
// with no synchronization visible in this excerpt.
void WorkerUpdate(uint32_t ThreadIdx, uint32_t ActorsStartIdx, uint32_t ActorsEndIdx)
{
    while (bIsAppRunning)
    {
        // No update requested yet: go back to polling.
        if (!WorkersSemaphore[ThreadIdx])
        {
            continue;
        }

        uint32_t ActorIndex = ActorsStartIdx;
        while (ActorIndex < ActorsEndIdx)
        {
            Actors[ActorIndex]->Update();
            ++ActorIndex;
        }

        // Report completion only after the whole range has been updated.
        WorkersSemaphore[ThreadIdx] = false;
    }
}

void main()
{
        // Code sample on how I allocate memory for each field
    uint16_t BoidsAdditionalBuffer = 100;
    BoidsData.Velocity  = new SVector3[InitialBoidsCount + BoidsAdditionalBuffer];

    .
    .
    .
    // Create the worker threads
    int32_t ChunkSize = Actors.size() / NumWorkers;
    int32_t Reminder = Actors.size() - ChunkSize * NumWorkers;

    for (uint32_t i = 0; i < NumWorkers; ++i)
    {
        uint32_t ActorsStartIdx = i * ChunkSize;
        uint32_t ActorsEndIdx = i * ChunkSize + ChunkSize;

        WorkersSemaphore.push_back(false);
        Workers.push_back(std::move(std::thread(WorkerUpdate, i, ActorsStartIdx, ActorsEndIdx)));
    }

    while (bIsAppRunning)
    {
        // Update the semaphores to true so worker threads can update
        for (uint32_t i = 0; i < WorkersSemaphore.size(); ++i)
        {
            WorkersSemaphore[i] = true;
        }

        // Wait until worker threads have completed the update
        while (true)
        {
            bool bUpdateDone = true;
            for (uint32_t i = 0; i < WorkersSemaphore.size(); ++i)
            {
                bUpdateDone &= !WorkersSemaphore[i];
            }

            if (bUpdateDone)
            {
                break;
            }
        }

        Render();
    }
}

正如我提到的那样，即使我仅使用一个工作线程运行此代码，性能也会大幅下降。我真的不知道原因是什么。

使用专用线程更新粒子会降低性能

0 个答案: