
时间:2016-01-04 05:30:42

标签: c++ windows multithreading winapi memory

我正在使用SSE指令和Windows multi-thread来实现矩阵乘法。当矩阵的维数很大时,例如1024乘1024,获得结果需要很长时间。

当它以4096 * 4096运行时,exe本身占用的内存很少,比如192M,但是我机器的整体内存使用量却从20%增加到97%,我认为可能是我使用Windows多线程(Windows multi-thread)的方式有问题。



#include "sse_matrix.h"
#include <ctime>

int main(int argc, char* argv[])
    vector<float> * left = new vector<float>(size, 0);
    vector<float> * right = new vector<float>(size, 0);
    vector<float> * result = new vector<float>(size, 0);
    // initialize value
    for (int i = 0; i < dim; i ++)
        for (int j = 0; j < dim; j ++)
            (*left)[i*dim + j] = j;
            (*right)[i*dim + j] = j;

    cout << "1. INFO: value initialized, starting matrix multiplication" << endl;
    // calculate the result
    clock_t my_time = clock();
    SSE_Matrix_Multiply(left, right, result, 4);
    cout << "2. INFO: SSE matrix multiplication result has got" << endl;
    cout << "3. INFO: time(ms): " << float(clock() - my_time) << endl;

    delete left;
    delete right;
    delete result;


    return 0;



#ifndef __SSE_MATRIX_H__
#define __SSE_MATRIX_H__

#include <vector>
#include <iostream>
#include <Windows.h>
using std::cin;
using std::cout;
using std::endl;
using std::vector;

const int dim = 4096;
const int size = dim * dim;

/**
 * Work description handed to one worker thread: multiply one m x m tile of A
 * by one m x m tile of B and accumulate the product into one m x m tile of C.
 * All three matrices are n x n, stored row-major in flat vectors; the struct
 * does not own them.
 */
struct Matrix_Info
{
    std::vector<float> * A;  // left operand
    int ax, ay;              // row / column of the tile's top-left corner in A
    std::vector<float> * B;  // right operand
    int bx, by;              // row / column of the tile's top-left corner in B
    std::vector<float> * C;  // output matrix (accumulated into)
    int cx, cy;              // row / column of the tile's top-left corner in C
    int m;                   // tile edge length
    int n;                   // edge length (row stride) of the full matrices
};

/**
 * Transposes a 4x4 row-major float matrix in place using SSE.
 *
 * Declared inline because it is defined in a header.
 *
 * @param matrix Pointer to 16 contiguous floats (4 rows of 4). Unaligned
 *               loads and stores are used, so no alignment is required.
 */
inline void Transpose_Matrix_SSE(float * matrix)
{
    __m128 row1 = _mm_loadu_ps(&matrix[0*4]);
    __m128 row2 = _mm_loadu_ps(&matrix[1*4]);
    __m128 row3 = _mm_loadu_ps(&matrix[2*4]);
    __m128 row4 = _mm_loadu_ps(&matrix[3*4]);
    // Macro transposes the four registers in place.
    _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
    _mm_storeu_ps(&matrix[0*4], row1);
    _mm_storeu_ps(&matrix[1*4], row2);
    _mm_storeu_ps(&matrix[2*4], row3);
    _mm_storeu_ps(&matrix[3*4], row4);
}


/**
 * Computes the 4x4 product left * transpose(right): element (i, j) of the
 * output is the dot product of row i of `left` with row j of `right`.
 * Callers that want a true matrix product must pass `right` already transposed.
 *
 * Declared inline because it is defined in a header.
 *
 * @param left  16 contiguous floats, row-major 4x4.
 * @param right 16 contiguous floats, row-major 4x4.
 * @return Newly allocated array of 16 floats (row-major 4x4). The caller owns
 *         it and must release it with delete[].
 */
inline float * Shuffle_Matrix_Multiply(float * left, float * right)
{
    float * product = new float[16];
    float lanes[4] = {0};
    for (int i = 0; i < 4; i ++)
    {
        const __m128 left_row = _mm_loadu_ps(left + i * 4);
        for (int j = 0; j < 4; j ++)
        {
            const __m128 right_row = _mm_loadu_ps(right + j * 4);
            // Element-wise multiply, then sum the four lanes horizontally.
            _mm_storeu_ps(lanes, _mm_mul_ps(left_row, right_row));
            product[i * 4 + j] = lanes[0] + lanes[1] + lanes[2] + lanes[3];
        }
    }
    return product;
}

float * SSE_4_Matrix(struct Matrix_Info * my_info)
    int m = my_info->m;
    int n = my_info->n;
    int ax = my_info->ax;
    int ay = my_info->ay;
    int bx = my_info->bx;
    int by = my_info->by;
    //1. split Matrix A and Matrix B
    float * _a = new float[16];
    float * _b = new float[16];
    for (int i = 0; i < m; i ++)
        for (int j = 0; j < m; j ++)
            _a[i*m + j] = (*my_info->A)[(i + ax) * n + j + ay];
            _b[i*m + j] = (*my_info->B)[(i + bx) * n + j + by];
    //2. transpose Matrix B
    //3. calculate result and return a float pointer
    float * result =  Shuffle_Matrix_Multiply(_a, _b);
    return result;

/**
 * Thread entry point: computes one tile product and adds it into the tile of
 * C at (cx, cy).
 *
 * Declared inline because it is defined in a header.
 *
 * @param my_info Pointer to a Matrix_Info that must stay valid until the
 *                thread finishes.
 * @return Always 0.
 */
inline DWORD WINAPI Matrix_Multiply(LPVOID my_info)
{
    Matrix_Info * info = static_cast<Matrix_Info *>(my_info);
    const int m = info->m;
    const int n = info->n;
    const int cx = info->cx;
    const int cy = info->cy;
    std::vector<float> & C = *info->C;

    // Compute the tile once. The previous code recomputed it for every
    // element (m*m times) and leaked each returned buffer.
    float * tile = SSE_4_Matrix(info);
    for (int i = 0; i < m; i ++)
    {
        for (int j = 0; j < m; j ++)
        {
            C[(i + cx) * n + j + cy] += tile[i*m + j];
        }
    }
    delete[] tile;
    return 0;
}

void SSE_Matrix_Multiply(vector<float> * left, vector<float> * right, vector<float> * result, int thread_num)
    struct Matrix_Info * my_info = new struct  Matrix_Info[thread_num];
    HANDLE * handle = new HANDLE[thread_num];
    for (int i = 0; i < thread_num; i ++)
        my_info[i].A = left;
        my_info[i].B = right;
        my_info[i].C = result;
        my_info[i].n = dim;
        my_info[i].m = 4;
    int id = 0;
    // Matrix A row:i, column:j
    for (int i = 0; i < dim; i += 4)
        for (int j = 0; j < dim; j += 4)
            // Matrix B row:j column:k
            for (int k = 0; k < dim; k += 4)
                my_info[id].ax = i;
                my_info[id].ay = j;
                my_info[id].bx = j;
                my_info[id].by = k;
                my_info[id].cx = i;
                my_info[id].cy = k;
                if (id < thread_num)
                     handle[id] = CreateThread(NULL, 0, Matrix_Multiply, (LPVOID)(&my_info[id]), 0, 0 );
                     id ++;
                if (id == thread_num)
                    for (int _i = 0; _i < id; _i ++)
                        WaitForMultipleObjects(thread_num, &handle[_i], TRUE, INFINITE);
                    id = 0;



我的操作系统是Windows 10,IDE是Visual Studio 2012,我的内存是8G。

1 个答案:

答案 0 :(得分:0)

您在调用 CreateThread 时使用 dwStackSize = 0 创建了大量线程,这会使每个线程都采用默认的 1MB 栈大小(default stack size of 1MB)。任务管理器显示的正是这些线程所需的内存,您可以使用以下小测试应用程序进行验证:

    #include <windows.h>
    #include <stdio.h>
    #include <conio.h>

    DWORD WINAPI thread(LPVOID pData)
    {
        HANDLE hThread;
        hThread = GetCurrentThread();
        while (SuspendThread(hThread) == -1);
        return (0);
    }

    int main()
    {
        int cnt;
        HANDLE hThread;
        DWORD tid;

        cnt = 0;
        do
        {
            hThread = CreateThread(NULL, 4096, thread, NULL, 0, &tid);
            if (hThread != NULL)
            {
                cnt++;
            }
            SleepEx(10, FALSE);
        } while (hThread != NULL);

        printf("%d threads; error: %d\n", cnt, GetLastError());
        printf("\n\nend.");
        getch();
        return (0);
    }

bWaitAll = FALSE




  • 在创建新线程之前,您会等待所有线程结束。您可以考虑设置 bWaitAll = FALSE 并检查 WaitForMultipleObjects 的返回值,这样在某个线程结束后就能立即创建一个新线程,而不必等待所有线程都处理完毕……
  • 如前所述,您不应频繁分配像 float[16] 这样的小块内存。请想一想如何避免这种情况……