我无论如何都无法将浮点值加载到NEON 128位寄存器中!
我尝试了所有可能的方法来加载浮点数,但寄存器的值始终为零(这是通过调试发现的)。
/* neon_example.c - Neon intrinsics example program */
#include <stdint.h>
#include <stdio.h>
#include <arm_neon.h>
/* Set each of the first `size` elements of `array` to 2.0f, printing every
 * value with "%f" (no separator) right after it is stored. A size of zero or
 * less writes and prints nothing. */
void fill_array(float32_t *array, int size)
{
    float32_t *cursor = array;
    int remaining = size;

    while (remaining > 0) {
        *cursor = 2.0f;
        printf("%f", *cursor);
        cursor++;
        remaining--;
    }
}
/*
 * Return the sum of the first `size` elements of `array`.
 *
 * Four running totals (one per lane of a 128-bit NEON register) are
 * accumulated over every complete group of four elements. The four lanes are
 * then added together, and any leftover elements (when `size` is not a
 * multiple of 4) are added one at a time. A size of zero or less yields 0.
 */
float sum_array(float32_t *array, int size)
{
    float32_t a, b, c, d, add;
    float32x4_t acc = vdupq_n_f32(0.0f);

    /* Load only while a whole vector remains: testing `size != 0` here would
     * step past zero and read beyond the array for sizes not divisible by 4. */
    for (; size >= 4; size -= 4) {
        float32x4_t vec = vld1q_f32(array);
        array += 4;
        acc = vaddq_f32(acc, vec);
    }

    /* Each lane must be stored to its own scalar; writing all four lanes to
     * the same variable leaves the others uninitialized. */
    vst1q_lane_f32(&a, acc, 0);
    vst1q_lane_f32(&b, acc, 1);
    vst1q_lane_f32(&c, acc, 2);
    vst1q_lane_f32(&d, acc, 3);
    add = a + b + c + d;

    /* Scalar tail for the remaining 0-3 elements. */
    for (; size > 0; size--) {
        add += *array;
        array++;
    }

    return add;
}
/* Program entry point: fill a 100-element float buffer via fill_array, then
 * print the total computed by sum_array. */
int main()
{
    float32_t samples[100];
    float total;

    fill_array(samples, 100);
    total = sum_array(samples, 100);
    printf("Sum was %f\n", total);

    return 0;
}
答案 0 :(得分:2)
我刚刚运行了你的代码,浮点值可以正确加载到寄存器中。我是在 Xcode 4.6 中使用 LLVM 4.2 构建这段代码的。
采用 Guy Sirton 建议的修改可以修复该错误,并得到可读性更好的函数:
/*
 * Return the sum of the first `size` elements of `array`.
 *
 * Complete groups of four elements are accumulated lane-wise in a 128-bit
 * NEON register; the four lane totals are then stored to a small array and
 * added together. Any leftover elements (when `size` is not a multiple of 4)
 * are added one at a time. A size of zero or less yields 0.
 */
float32_t sum_array(float32_t *array, int size)
{
    float32_t arr[4], add;
    float32x4_t acc = vdupq_n_f32(0.0f);

    /* Load only while a whole vector remains: testing `size != 0` here would
     * step past zero and read beyond the array for sizes not divisible by 4. */
    for (; size >= 4; size -= 4) {
        float32x4_t vec = vld1q_f32(array);
        array += 4;
        acc = vaddq_f32(acc, vec);
    }

    /* Spill all four lane totals at once and reduce them to a scalar. */
    vst1q_f32(arr, acc);
    add = arr[0] + arr[1] + arr[2] + arr[3];

    /* Scalar tail for the remaining 0-3 elements. */
    for (; size > 0; size--) {
        add += *array;
        array++;
    }

    return add;
}