Question

我正在使用SSE指令和Windows multi-thread来实现矩阵乘法。当矩阵的维数很大时，例如1024乘1024，获得结果需要很长时间。

当以4096 * 4096的规模运行时，exe本身占用的内存很少，大约192M，但是我机器的内存使用率却从20％增加到97％，我认为可能是Windows multi-thread的使用方式有问题。

以下是我的代码。

main.cpp

#include "sse_matrix.h"
#include <ctime>

// Entry point: fills two dim x dim row-major matrices so that every element in
// column j holds the value j, multiplies them with SSE_Matrix_Multiply using
// 4 threads, and prints the elapsed time in milliseconds.
int main(int argc, char* argv[])
{
    // The vectors are automatic objects: their storage is released on every
    // exit path without any manual delete. `result` starts zeroed.
    vector<float> left(size, 0.0f);
    vector<float> right(size, 0.0f);
    vector<float> result(size, 0.0f);

    // initialize value
    for (int i = 0; i < dim; ++i)
    {
        for (int j = 0; j < dim; ++j)
        {
            const float value = static_cast<float>(j);
            left[i*dim + j] = value;
            right[i*dim + j] = value;
        }
    }
    cout << "1. INFO: value initialized, starting matrix multiplication" << endl;

    // calculate the result
    const clock_t start = clock();
    SSE_Matrix_Multiply(&left, &right, &result, 4);
    const clock_t elapsed = clock() - start;
    cout << "2. INFO: SSE matrix multiplication result has got" << endl;

    // clock() counts ticks; CLOCKS_PER_SEC ticks make one second, so scale
    // explicitly instead of assuming one tick is one millisecond.
    cout << "3. INFO: time(ms): " << 1000.0 * double(elapsed) / CLOCKS_PER_SEC << endl;

    system("pause");

    return 0;

}

sse_matrix.h

#ifndef __SSE_MATRIX_H__
#define __SSE_MATRIX_H__

#include <vector>
#include <iostream>
#include <Windows.h>
using std::cin;
using std::cout;
using std::endl;
using std::vector;

const int dim = 4096;
const int size = dim * dim;

// Describes one block multiplication C_block += A_block * B_block.
// A, B and C are row-major matrices with n elements per row; each block is
// m x m and is located by the (row, column) offset of its top-left element.
struct Matrix_Info 
{
    vector<float> * A;  // left operand matrix
    int ax;             // row offset of the block inside A
    int ay;             // column offset of the block inside A
    vector<float> * B;  // right operand matrix
    int bx;             // row offset of the block inside B
    int by;             // column offset of the block inside B
    vector<float> * C;  // matrix the block product is accumulated into
    int cx;             // row offset of the block inside C
    int cy;             // column offset of the block inside C
    int m;              // edge length of a block
    int n;              // number of elements per row of A, B and C
};

// Transposes, in place, the 4x4 row-major float matrix stored in matrix[0..15].
// Unaligned loads and stores are used, so `matrix` needs no special alignment.
void Transpose_Matrix_SSE(float * matrix)
{
    float * const r0 = matrix;
    float * const r1 = matrix + 4;
    float * const r2 = matrix + 8;
    float * const r3 = matrix + 12;

    __m128 v0 = _mm_loadu_ps(r0);
    __m128 v1 = _mm_loadu_ps(r1);
    __m128 v2 = _mm_loadu_ps(r2);
    __m128 v3 = _mm_loadu_ps(r3);

    // Rewrites the four registers so that each holds one column of the input.
    _MM_TRANSPOSE4_PS(v0, v1, v2, v3);

    _mm_storeu_ps(r0, v0);
    _mm_storeu_ps(r1, v1);
    _mm_storeu_ps(r2, v2);
    _mm_storeu_ps(r3, v3);
}

// Multiplies two 4x4 row-major float matrices where `right` is supplied
// already transposed: element (i, j) of the result is the dot product of
// row i of `left` with row j of `right`.
// Returns a freshly allocated float[16] (row-major) owned by the caller;
// it was created with new[].
float * Shuffle_Matrix_Multiply(float * left, float * right)
{
    float * product = new float[16];
    float lanes[4] = {0};
    for (int row = 0; row < 4; ++row)
    {
        // The left-hand row is the same for every column of this output row.
        const __m128 lhs = _mm_loadu_ps(left + row * 4);
        float * out = product + row * 4;
        for (int col = 0; col < 4; ++col)
        {
            const __m128 rhs = _mm_loadu_ps(right + col * 4);
            // Lane-wise products, then a scalar horizontal sum in lane order.
            _mm_storeu_ps(lanes, _mm_mul_ps(lhs, rhs));
            out[col] = lanes[0] + lanes[1] + lanes[2] + lanes[3];
        }
    }
    return product;
}

// Computes the product of one block of A with one block of B.
//
// The block of A starts at (ax, ay) and the block of B at (bx, by); both
// matrices are row-major with n elements per row. The blocks are copied into
// local scratch buffers, the B block is transposed, and the two are multiplied
// with Shuffle_Matrix_Multiply.
//
// my_info->m must be 4: the scratch buffers hold 16 floats and the SSE helpers
// operate on 4x4 blocks.
//
// Returns the float[16] produced by Shuffle_Matrix_Multiply; the caller owns it.
float * SSE_4_Matrix(struct Matrix_Info * my_info)
{
    const int m = my_info->m;
    const int n = my_info->n;
    const int ax = my_info->ax;
    const int ay = my_info->ay;
    const int bx = my_info->bx;
    const int by = my_info->by;
    const vector<float> & A = *my_info->A;
    const vector<float> & B = *my_info->B;

    //1. split Matrix A and Matrix B
    // Automatic storage: nothing to release, and no new[]/free() mismatch.
    float a_block[16];
    float b_block[16];
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            a_block[i*m + j] = A[(i + ax) * n + j + ay];
            b_block[i*m + j] = B[(i + bx) * n + j + by];
        }
    }
    //2. transpose Matrix B
    Transpose_Matrix_SSE(b_block);
    //3. calculate result and return a float pointer
    return Shuffle_Matrix_Multiply(a_block, b_block);
}

// Accumulates one block product into C: C_block += A_block * B_block.
//
// my_info points to a Matrix_Info describing the three blocks. The signature
// matches a Win32 thread start routine, so this can be passed to CreateThread
// or called directly.
//
// No synchronization is performed on C: concurrent calls must target
// different blocks of C.
//
// Always returns 0.
DWORD WINAPI Matrix_Multiply(LPVOID my_info)
{
    struct Matrix_Info * info = static_cast<struct Matrix_Info *>(my_info);
    const int m = info->m;
    const int n = info->n;
    const int cx = info->cx;
    const int cy = info->cy;
    vector<float> & C = *info->C;

    // The block product does not depend on (i, j): compute it once and add
    // every element, instead of recomputing it for each element.
    float * block = SSE_4_Matrix(info);
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            C[(i + cx) * n + j + cy] += block[i*m + j];
        }
    }
    // The buffer was created with new[] in Shuffle_Matrix_Multiply.
    delete[] block;
    return 0;
}

void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result, int thread_num)
{
    struct Matrix_Info * my_info = new struct  Matrix_Info[thread_num];
    HANDLE * handle = new HANDLE[thread_num];
    for (int i = 0; i < thread_num; i ++)
    {
        my_info[i].A = left;
        my_info[i].B = right;
        my_info[i].C = result;
        my_info[i].n = dim;
        my_info[i].m = 4;
    }
    int id = 0;
    // Matrix A row:i, column:j
    for (int i = 0; i < dim; i += 4)
    {
        for (int j = 0; j < dim; j += 4)
        {
            // Matrix B row:j column:k
            for (int k = 0; k < dim; k += 4)
            {
                my_info[id].ax = i;
                my_info[id].ay = j;
                my_info[id].bx = j;
                my_info[id].by = k;
                my_info[id].cx = i;
                my_info[id].cy = k;
                if (id < thread_num)
                {
                     handle[id] = CreateThread(NULL, 0, Matrix_Multiply, (LPVOID)(&my_info[id]), 0, 0 );
                     id ++;
                }
                if (id == thread_num)
                {
                    for (int _i = 0; _i < id; _i ++)
                        WaitForMultipleObjects(thread_num, &handle[_i], TRUE, INFINITE);
                    id = 0;
                }
            }
        }
    }
    free(my_info);
    free(handle);
}

#endif

因此，当dim为4096时，当exe运行时，它需要大约192M的内存，但在得到结果之前内存使用率从20％增加到97％。

我的操作系统是Windows 10，IDE是Visual Studio 2012，我的内存是8G。

Answer 1

您在调用CreateThread创建线程时使用了dwStackSize = 0，这会产生default stack size of 1MB（即默认1MB的栈大小）。任务管理器并不能正确显示线程所需的内存，您可以使用以下小测试应用程序进行测试：

#include <windows.h>
#include <stdio.h>
#include <conio.h>

DWORD WINAPI thread(LPVOID pData)
{
    HANDLE hThread;
    hThread = GetCurrentThread();
    while (SuspendThread(hThread) == -1);
    return (0);
}

int main()
{
    int cnt;
    HANDLE hThread;
    DWORD tid;
    cnt = 0;
    do
    {
        hThread = CreateThread(NULL, 4096, thread, NULL, 0, &tid);
        if (hThread != NULL)
        {
            cnt++;
        }
        SleepEx(10, FALSE);
    } while (hThread != NULL);
    printf("%d threads; error: %d\n", cnt, GetLastError());
    printf("\n\nend.");
    getch();
    return (0);
}

我可以在我的机器上创建1597个线程，因此我的应用程序的内存使用量应该在1.5GB左右，但任务管理器只能显示45MB左右。

我的猜测是你所观察到的行为是由任务管理器的显示引起的...这可能值得进一步研究......

关于您的实现，还有几点：

在创建新线程之前，您正在等待所有线程结束。您可能需要考虑为WaitForMultipleObjects设置bWaitAll = FALSE并评估其返回值，这样就可以立即创建一个新线程，而不是等待所有线程完成处理......
如前所述，您不应分配少量内存，例如float[16]。想一想如何避免这种情况......

Windows多线程使内存使用量增加

1 个答案: