矢量矩阵&使用SSE对任何大小的输入矩阵和向量进行矩阵 - 矩阵乘法

时间:2014-05-08 21:58:47

标签: c sse multicore

我正在尝试使用SSE Intrinsic进行向量矩阵乘法以及矩阵 - 矩阵乘法,但我得到一个错误说" Segmentation Fault",如果我尝试做除4的倍数以外的任何事情。无法弄清楚为什么,它不会为其他任何事情工作。请建议更改,以便它可以用于输入的分析。?

以下是我的实施:

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>  

/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
 ****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
    int i;
    for (i = 0; i < size*size; i++){
        vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
        printf("%f \n ", vector[i]);
    }
    for (i = 0; i < size*size; i++){
        matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
        printf("%f \n ", matrix[i]);
    }
}

/****************************************************
the following function calculate the below equation
   vector_out = vector_in x matrix_in
 ***************************************************/
void matrix_mult_sq(int size, float *vector_in,
               float *matrix_in, float *vector_out){
    int i, j, k;
    for (i = 0; i < size; i++)
    {
        for (j = 0; j < size; j++)
        {
            vector_out[size*i + j] = 0.0;
            for (k = 0; k < size; k++)
                vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
        }
    }
}

void matrix_mult_sse(int size, float *vector_in,
    float *matrix_in, float *vector_out){
    __m128 a_line, b_line, r_line;
    int i, j, k, l;
    for (k = 0; k < size; k++)
    {

        for (i = 0; i < size; i += 4){
            j = 0;
            b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
            a_line = _mm_set1_ps(vector_in[j + k*size]);      // a_line = vec4(vector_in[0])
            r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
            for (j = 1; j < size; j++) {
                b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
                a_line = _mm_set1_ps(vector_in[j + k*size]);  // b_line = vec4(b[i][j])
                // r_line += a_line * b_line
                r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
            }
            _mm_store_ps(&vector_out[i + k*size], r_line);     // r[i] = r_line
        }
    }
    for (l=0; l < size*size; l++)
    {
        printf("%f \n", vector_out[l]);
    }
}

int main(int argc, char *argv[]){
  if(argc < 2){
    printf("Usage: %s matrix/vector_size\n", argv[0]);
    return 0;
  }

  int size = atoi(argv[1]);
  if(size%4 != 0){
    printf("This version implements for ""size = 4*n"" only\n");
    return 0;
  }

  float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
  if(vector==NULL){
    printf("can't allocate the required memory for vector\n");
    return 0;
  }

  float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
  if(matrix==NULL){
    printf("can't allocate the required memory for matrix\n");
    free(vector);
    return 0;
  }

  float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_sq==NULL){
    printf("can't allocate the required memory for result_sq\n");
    free(vector);
    free(matrix);
    return 0;
  }

  float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
  if(result_pl==NULL){
    printf("can't allocate the required memory for result_pl\n");
    free(vector);
    free(matrix);
    free(result_sq);
    return 0;
  }

  matrix_vector_gen(size, matrix, vector);

  double time_sq;
  double time_sse;

  time_sq = omp_get_wtime();
  matrix_mult_sq(size, vector, matrix, result_sq);
  time_sq = omp_get_wtime() - time_sq;

  time_sse = omp_get_wtime();
  matrix_mult_sse(size, vector, matrix, result_pl);
  time_sse = omp_get_wtime() - time_sse;

  printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
  printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);

  //check
  /*int i;
  for(i=0; i<size; i++)
    if((int)result_sq[i] != (int)result_pl[i]){
      printf("wrong at position %d\n", i);
      free(vector);
      free(matrix);
      free(result_sq);
      free(result_pl);
      return 0;
    }*/

  free(vector);
  free(matrix);
  free(result_sq);
  free(result_pl);
  return 1;
}

1 个答案:

答案 0 :(得分:3)

您似乎只使用mm_load_ps和mm_store_ps加载和存储,它在一条指令中加载和存储4个浮点数。

由于你的容器(矩阵和向量)的大小不一定是4个浮点数(16个字节)的倍数,这是不正确的。

memalign确保指针对齐(此处为16字节),但不保留末尾的填充,以便分配的块大小为16字节的倍数。

例如,当存储5维向量时,向量在内存中只分配了20个字节,但是你写了32个字节(两个mm_store_ps操作)

此外,这似乎不正确:

  

_mm_store_ps(&amp; vector_out [i + k * size],r_line);

如果我是对的,你想在这里存储一个浮点数。不是四个装好的花车。