我正在使用 SSE 指令和 Windows 多线程(multi-thread)来实现矩阵乘法。当矩阵的维数很大时,例如 1024 × 1024,获得结果需要很长时间。
当矩阵为 4096 × 4096 时,exe 本身占用的内存很少,大约 192M,但我机器的整体内存使用量却从 20% 增加到 97%,我认为可能是 Windows 多线程的用法有问题。
以下是我的代码。
main.cpp
#include "sse_matrix.h"
#include <ctime>
// Benchmark driver: fills two dim x dim matrices, multiplies them with
// SSE_Matrix_Multiply using 4 worker threads and prints the elapsed time.
int main(int argc, char* argv[])
{
    // The vector objects live on the stack and own their element storage, so
    // nothing has to be deleted by hand and nothing leaks if an allocation
    // throws.
    vector<float> left(size, 0.0f);
    vector<float> right(size, 0.0f);
    vector<float> result(size, 0.0f);

    // initialize value: every row of both operands is 0, 1, ..., dim - 1
    for (int i = 0; i < dim; i ++)
    {
        for (int j = 0; j < dim; j ++)
        {
            const float value = static_cast<float>(j);
            left[i*dim + j] = value;
            right[i*dim + j] = value;
        }
    }
    cout << "1. INFO: value initialized, starting matrix multiplication" << endl;

    // calculate the result; the clock is stopped before any output so that
    // console I/O is not part of the measurement
    const clock_t start = clock();
    SSE_Matrix_Multiply(&left, &right, &result, 4);
    const clock_t stop = clock();
    cout << "2. INFO: SSE matrix multiplication result has got" << endl;

    // clock() counts ticks; convert to milliseconds explicitly instead of
    // relying on CLOCKS_PER_SEC happening to be 1000.
    cout << "3. INFO: time(ms): " << 1000.0f * float(stop - start) / CLOCKS_PER_SEC << endl;

    system("pause");
    return 0;
}
sse_matrix.h
#ifndef __SSE_MATRIX_H__
#define __SSE_MATRIX_H__
#include <vector>
#include <iostream>
#include <Windows.h>
using std::cin;
using std::cout;
using std::endl;
using std::vector;
// Side length of the square matrices (number of rows and of columns).
const int dim = 4096;
// Number of elements in one dim x dim matrix.
const int size = dim * dim;
// Describes one tile multiplication C_tile += A_tile * B_tile.
// All three matrices are stored row-major in flat vectors with row length n;
// each tile is m x m and is located by the (row, column) offset of its
// top-left element.
struct Matrix_Info
{
    vector<float> * A;  // left operand
    int ax;             // row offset of the tile inside A
    int ay;             // column offset of the tile inside A
    vector<float> * B;  // right operand
    int bx;             // row offset of the tile inside B
    int by;             // column offset of the tile inside B
    vector<float> * C;  // matrix the tile product is accumulated into
    int cx;             // row offset of the tile inside C
    int cy;             // column offset of the tile inside C
    int m;              // tile side length
    int n;              // row length of A, B and C
};
// Transposes, in place, the 4x4 matrix stored row-major in the 16 consecutive
// floats starting at `matrix`. Unaligned loads and stores are used, so the
// pointer needs no particular alignment.
void Transpose_Matrix_SSE(float * matrix)
{
    __m128 rows[4];
    for (int r = 0; r < 4; ++r)
        rows[r] = _mm_loadu_ps(matrix + r * 4);

    _MM_TRANSPOSE4_PS(rows[0], rows[1], rows[2], rows[3]);

    for (int r = 0; r < 4; ++r)
        _mm_storeu_ps(matrix + r * 4, rows[r]);
}
// Computes all 16 dot products between the rows of two row-major 4x4 matrices:
// out[i * 4 + j] = dot(row i of left, row j of right). When `right` holds a
// transposed matrix this is an ordinary 4x4 matrix product.
// Returns a buffer allocated with new float[16]; the caller owns it.
float * Shuffle_Matrix_Multiply(float * left, float * right)
{
    float * out = new float[16];
    float lanes[4] = {0};
    for (int row = 0; row < 4; ++row)
    {
        // The left row is the same for every column, so load it once.
        const __m128 lhs = _mm_loadu_ps(left + row * 4);
        for (int col = 0; col < 4; ++col)
        {
            const __m128 rhs = _mm_loadu_ps(right + col * 4);
            _mm_storeu_ps(lanes, _mm_mul_ps(lhs, rhs));
            // Horizontal sum of the four lane products, left to right.
            out[row * 4 + col] = lanes[0] + lanes[1] + lanes[2] + lanes[3];
        }
    }
    return out;
}
// Multiplies one tile of A by one tile of B.
//
// Copies the m x m tile of A at (ax, ay) and the m x m tile of B at (bx, by)
// into local scratch buffers, transposes the B tile and returns the product
// computed by Shuffle_Matrix_Multiply.
//
// The scratch buffers, the transpose and the multiply are all fixed at 4x4,
// so my_info->m must be 4.
//
// Returns the buffer produced by Shuffle_Matrix_Multiply (16 floats, row-major);
// ownership passes to the caller.
float * SSE_4_Matrix(struct Matrix_Info * my_info)
{
    const int m = my_info->m;
    const int n = my_info->n;
    const int ax = my_info->ax;
    const int ay = my_info->ay;
    const int bx = my_info->bx;
    const int by = my_info->by;
    const vector<float> & A = *my_info->A;
    const vector<float> & B = *my_info->B;

    // Stack scratch space: the previous version obtained these with new[] and
    // released them with free(), which is undefined behaviour and also cost
    // two heap allocations per tile.
    float tile_a[16] = {0};
    float tile_b[16] = {0};

    //1. split Matrix A and Matrix B
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            tile_a[i*m + j] = A[(i + ax) * n + j + ay];
            tile_b[i*m + j] = B[(i + bx) * n + j + by];
        }
    }

    //2. transpose Matrix B so that its columns become contiguous rows
    Transpose_Matrix_SSE(tile_b);

    //3. calculate result and return a float pointer
    return Shuffle_Matrix_Multiply(tile_a, tile_b);
}
// Thread-compatible entry point: accumulates one tile product into C.
//
// my_info points to a Matrix_Info. The product of the described A and B tiles
// is computed and added element-wise to the m x m tile of C at (cx, cy).
// Always returns 0.
DWORD WINAPI Matrix_Multiply(LPVOID my_info)
{
    struct Matrix_Info * info = static_cast<struct Matrix_Info *>(my_info);
    const int m = info->m;
    const int n = info->n;
    const int cx = info->cx;
    const int cy = info->cy;
    vector<float> & C = *info->C;

    // One call yields the whole tile. The previous version recomputed the
    // full tile for every single element (m * m times) and kept only one
    // value from each call.
    float * tile = SSE_4_Matrix(info);
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            C[(i + cx) * n + j + cy] += tile[i*m + j];
        }
    }
    // The tile comes from new float[16], so it must be released with delete[]
    // rather than free().
    delete[] tile;
    return 0;
}
void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result, int thread_num)
{
struct Matrix_Info * my_info = new struct Matrix_Info[thread_num];
HANDLE * handle = new HANDLE[thread_num];
for (int i = 0; i < thread_num; i ++)
{
my_info[i].A = left;
my_info[i].B = right;
my_info[i].C = result;
my_info[i].n = dim;
my_info[i].m = 4;
}
int id = 0;
// Matrix A row:i, column:j
for (int i = 0; i < dim; i += 4)
{
for (int j = 0; j < dim; j += 4)
{
// Matrix B row:j column:k
for (int k = 0; k < dim; k += 4)
{
my_info[id].ax = i;
my_info[id].ay = j;
my_info[id].bx = j;
my_info[id].by = k;
my_info[id].cx = i;
my_info[id].cy = k;
if (id < thread_num)
{
handle[id] = CreateThread(NULL, 0, Matrix_Multiply, (LPVOID)(&my_info[id]), 0, 0 );
id ++;
}
if (id == thread_num)
{
for (int _i = 0; _i < id; _i ++)
WaitForMultipleObjects(thread_num, &handle[_i], TRUE, INFINITE);
id = 0;
}
}
}
}
free(my_info);
free(handle);
}
#endif
因此,当 dim 为 4096 时,exe 运行时大约需要 192M 的内存,但在得到结果之前,系统内存使用率从 20% 增加到 97%。
我的操作系统是 Windows 10,IDE 是 Visual Studio 2012,内存是 8G。
答案 0 :(得分:0)
您在调用 CreateThread 创建线程时传入了 dwStackSize = 0(下面是一个用于验证的小测试程序):
#include <windows.h>
#include <stdio.h>
#include <conio.h>
// Thread body for the thread-count experiment: the thread suspends itself so
// that it stays alive without consuming CPU time. SuspendThread reports
// failure as (DWORD)-1, in which case the call is retried.
DWORD WINAPI thread(LPVOID pData)
{
    const HANDLE self = GetCurrentThread();
    for (;;)
    {
        if (SuspendThread(self) != (DWORD)-1)
            break;
    }
    return 0;
}
int main()
{
int cnt;
HANDLE hThread;
DWORD tid;
cnt = 0;
do
{
hThread = CreateThread(NULL, 4096, thread, NULL, 0, &tid);
if (hThread != NULL)
{
cnt++;
}
SleepEx(10, FALSE);
}
while (hThread != NULL);
printf("%d threads; error: %d\n", cnt, GetLastError());
printf("\n\nend.");
getch();
return (0);
}
以 dwStackSize = 0 创建的线程会使用默认的 1MB 栈大小(default stack size of 1MB)。任务管理器并不能正确显示线程所需的内存,您可以使用上面的小测试应用程序进行验证。
bWaitAll = FALSE
我可以在我的机器上创建1597个线程,因此我的应用程序的内存使用量应该在1.5GB左右,但任务管理器只能显示45MB左右。
我的猜测是你所观察到的行为是由任务管理器的显示引起的...这可能值得进一步研究......
有关您实施的另一件事:
调用 WaitForMultipleObjects 时使用 bWaitAll = FALSE 并评估其返回值,这样一旦有线程结束就可以立即创建一个新线程,而不是等待所有线程完成处理......另外,您现在为每个 float[16] 小块都创建了一个新线程。想一想如何避免这种情况......