我正在尝试使用SSE Intrinsic进行向量矩阵乘法以及矩阵 - 矩阵乘法,但我得到一个错误说" Segmentation Fault",如果我尝试做除4的倍数以外的任何事情。无法弄清楚为什么,它不会为其他任何事情工作。请建议更改,以便它可以用于输入的分析。?
以下是我的实施:
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <xmmintrin.h>
#include <time.h>
#include <omp.h>
/*****************************************************
the following function generates a "size"-element vector
and a "size x size" matrix
****************************************************/
void matrix_vector_gen(int size, float *matrix, float *vector){
int i;
for (i = 0; i < size*size; i++){
vector[i] = i*1.2f + 1;//((float)rand())/65535.0f;
printf("%f \n ", vector[i]);
}
for (i = 0; i < size*size; i++){
matrix[i] = i*1.3f + 1;//((float)rand())/5307.0f;
printf("%f \n ", matrix[i]);
}
}
/****************************************************
the following function calculate the below equation
vector_out = vector_in x matrix_in
***************************************************/
void matrix_mult_sq(int size, float *vector_in,
float *matrix_in, float *vector_out){
int i, j, k;
for (i = 0; i < size; i++)
{
for (j = 0; j < size; j++)
{
vector_out[size*i + j] = 0.0;
for (k = 0; k < size; k++)
vector_out[size*i + j] += vector_in[size*i + k] * matrix_in[size*k + j];
}
}
}
void matrix_mult_sse(int size, float *vector_in,
float *matrix_in, float *vector_out){
__m128 a_line, b_line, r_line;
int i, j, k, l;
for (k = 0; k < size; k++)
{
for (i = 0; i < size; i += 4){
j = 0;
b_line = _mm_load_ps(&matrix_in[i]); // b_line = vec4(matrix[i][0])
a_line = _mm_set1_ps(vector_in[j + k*size]); // a_line = vec4(vector_in[0])
r_line = _mm_mul_ps(a_line, b_line); // r_line = a_line * b_line
for (j = 1; j < size; j++) {
b_line = _mm_load_ps(&matrix_in[j*size + i]); // a_line = vec4(column(a, j))
a_line = _mm_set1_ps(vector_in[j + k*size]); // b_line = vec4(b[i][j])
// r_line += a_line * b_line
r_line = _mm_add_ps(_mm_mul_ps(a_line, b_line), r_line);
}
_mm_store_ps(&vector_out[i + k*size], r_line); // r[i] = r_line
}
}
for (l=0; l < size*size; l++)
{
printf("%f \n", vector_out[l]);
}
}
int main(int argc, char *argv[]){
if(argc < 2){
printf("Usage: %s matrix/vector_size\n", argv[0]);
return 0;
}
int size = atoi(argv[1]);
if(size%4 != 0){
printf("This version implements for ""size = 4*n"" only\n");
return 0;
}
float *vector = (float *)memalign(sizeof(float)*4, sizeof(float)*size);//(float *)malloc(sizeof(float)*size);
if(vector==NULL){
printf("can't allocate the required memory for vector\n");
return 0;
}
float *matrix = (float *)memalign(sizeof(float)*4, sizeof(float)*size*size);
if(matrix==NULL){
printf("can't allocate the required memory for matrix\n");
free(vector);
return 0;
}
float *result_sq = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
if(result_sq==NULL){
printf("can't allocate the required memory for result_sq\n");
free(vector);
free(matrix);
return 0;
}
float *result_pl = (float *)memalign(sizeof(float)*4, sizeof(float)*size);
if(result_pl==NULL){
printf("can't allocate the required memory for result_pl\n");
free(vector);
free(matrix);
free(result_sq);
return 0;
}
matrix_vector_gen(size, matrix, vector);
double time_sq;
double time_sse;
time_sq = omp_get_wtime();
matrix_mult_sq(size, vector, matrix, result_sq);
time_sq = omp_get_wtime() - time_sq;
time_sse = omp_get_wtime();
matrix_mult_sse(size, vector, matrix, result_pl);
time_sse = omp_get_wtime() - time_sse;
printf("SEQUENTIAL EXECUTION: %f (sec)\n",time_sq);
printf("PARALLEL EXECUTION: %f (sec)\n", time_sse);
//check
/*int i;
for(i=0; i<size; i++)
if((int)result_sq[i] != (int)result_pl[i]){
printf("wrong at position %d\n", i);
free(vector);
free(matrix);
free(result_sq);
free(result_pl);
return 0;
}*/
free(vector);
free(matrix);
free(result_sq);
free(result_pl);
return 1;
}
答案 0 :(得分:3)
您似乎只使用mm_load_ps和mm_store_ps加载和存储,它在一条指令中加载和存储4个浮点数。
由于你的容器(矩阵和向量)的大小不一定是4个浮点数(16个字节)的倍数,这是不正确的。
memalign确保指针对齐(此处为16字节),但不保留末尾的填充,以便分配的块大小为16字节的倍数。
例如,当存储5维向量时,向量在内存中只分配了20个字节,但是你写了32个字节(两个mm_store_ps操作)
此外,这似乎不正确:
_mm_store_ps(&amp; vector_out [i + k * size],r_line);
如果我是对的,你想在这里存储一个浮点数。不是四个装好的花车。