我使用一个二维数组 a 和一个一维数组 s 来计算行和列的总和，目的是把二维数组 a 的各列之和存入数组 s，其中 N 是数组的大小。我的问题是：当 N = 512 和 N = 1024 时，会出现显著的变化，如 Excel 表格中所示。
/* sumcol_bycol: Basic implementation of column sum operation.
* Computes sums of the columns of array a into array s.
* s[c] = SUM(r=0..N-1) a[r][c]
*/
static void FN_ALIGN sumcol_bycol(int a[N][N], int s[N]) {
    /* Visit the columns one at a time; each column is accumulated from
     * row 0 up to row N-1 before its total is written to s. */
    for (int col = 0; col < N; col++) {
        int total = 0;
        int row = 0;
        while (row < N) {
            total += a[row][col];
            row++;
        }
        /* Element col of s receives the finished column total. */
        s[col] = total;
    }
}
/* sumcol_bycol_u4: Column sums computed column by column, with the row
 * loop unrolled by a factor of 4.
 * s[c] = SUM(r=0..N-1) a[r][c]
 */
static void FN_ALIGN sumcol_bycol_u4(int a[N][N], int s[N]) {
    for (int col = 0; col < N; col++) {
        int total = 0;
        int row = 0;
        /* Unrolled body: four consecutive rows per iteration. */
        while (row + 3 < N) {
            total += a[row][col];
            total += a[row + 1][col];
            total += a[row + 2][col];
            total += a[row + 3][col];
            row += 4;
        }
        /* Leftover rows (at most three) when 4 does not divide N. */
        for (; row < N; row++) {
            total += a[row][col];
        }
        s[col] = total;
    }
}
/* sumcol_bycol_u8: Column sums computed column by column, with the row
 * loop unrolled by a factor of 8.
 * s[c] = SUM(r=0..N-1) a[r][c]
 */
static void FN_ALIGN sumcol_bycol_u8(int a[N][N], int s[N]) {
    for (int col = 0; col < N; col++) {
        int total = 0;
        int row = 0;
        /* Unrolled body: eight consecutive rows per iteration. */
        while (row + 7 < N) {
            total += a[row][col];
            total += a[row + 1][col];
            total += a[row + 2][col];
            total += a[row + 3][col];
            total += a[row + 4][col];
            total += a[row + 5][col];
            total += a[row + 6][col];
            total += a[row + 7][col];
            row += 8;
        }
        /* Leftover rows (at most seven) when 8 does not divide N. */
        for (; row < N; row++) {
            total += a[row][col];
        }
        s[col] = total;
    }
}
/* sumcol_bycol_g2: Column sums with two adjacent columns accumulated
 * together in a single pass over the rows.
 * s[c] = SUM(r=0..N-1) a[r][c]
 */
static void FN_ALIGN sumcol_bycol_g2(int a[N][N], int s[N]) {
    int col = 0;
    /* Handle the columns in pairs (col, col+1). */
    while (col + 1 < N) {
        int left = 0;
        int right = 0;
        for (int row = 0; row < N; row++) {
            left += a[row][col];
            right += a[row][col + 1];
        }
        s[col] = left;
        s[col + 1] = right;
        col += 2;
    }
    /* With an odd N, one unpaired column is left at index N-1. */
    if (col < N) {
        int last = 0;
        for (int row = 0; row < N; row++) {
            last += a[row][col];
        }
        s[col] = last;
    }
}
/* sumcol_bycol_g3: Column sums with three adjacent columns accumulated
 * together in a single pass over the rows.
 * s[c] = SUM(r=0..N-1) a[r][c]
 */
static void FN_ALIGN sumcol_bycol_g3(int a[N][N], int s[N]) {
    int col = 0;
    /* Handle the columns in groups of three. */
    while (col + 2 < N) {
        int acc0 = 0;
        int acc1 = 0;
        int acc2 = 0;
        for (int row = 0; row < N; row++) {
            acc0 += a[row][col];
            acc1 += a[row][col + 1];
            acc2 += a[row][col + 2];
        }
        s[col] = acc0;
        s[col + 1] = acc1;
        s[col + 2] = acc2;
        col += 3;
    }
    /* Columns left over when 3 does not divide N are summed singly. */
    while (col < N) {
        int acc = 0;
        for (int row = 0; row < N; row++) {
            acc += a[row][col];
        }
        s[col] = acc;
        col++;
    }
}
/* sumcol_bycol_g4: Column sums with four adjacent columns accumulated
 * together in a single pass over the rows.
 * s[c] = SUM(r=0..N-1) a[r][c]
 */
static void FN_ALIGN sumcol_bycol_g4(int a[N][N], int s[N]) {
    int col = 0;
    /* Handle the columns in groups of four. */
    while (col + 3 < N) {
        int acc0 = 0;
        int acc1 = 0;
        int acc2 = 0;
        int acc3 = 0;
        for (int row = 0; row < N; row++) {
            acc0 += a[row][col];
            acc1 += a[row][col + 1];
            acc2 += a[row][col + 2];
            acc3 += a[row][col + 3];
        }
        s[col] = acc0;
        s[col + 1] = acc1;
        s[col + 2] = acc2;
        s[col + 3] = acc3;
        col += 4;
    }
    /* Columns left over when 4 does not divide N are summed singly. */
    while (col < N) {
        int acc = 0;
        for (int row = 0; row < N; row++) {
            acc += a[row][col];
        }
        s[col] = acc;
        col++;
    }
}
/* sumcol_bycol_g5: Column sums with five adjacent columns accumulated
 * together in a single pass over the rows.
 * s[c] = SUM(r=0..N-1) a[r][c]
 */
static void FN_ALIGN sumcol_bycol_g5(int a[N][N], int s[N]) {
    int col = 0;
    /* Handle the columns in groups of five. */
    while (col + 4 < N) {
        int acc0 = 0;
        int acc1 = 0;
        int acc2 = 0;
        int acc3 = 0;
        int acc4 = 0;
        for (int row = 0; row < N; row++) {
            acc0 += a[row][col];
            acc1 += a[row][col + 1];
            acc2 += a[row][col + 2];
            acc3 += a[row][col + 3];
            acc4 += a[row][col + 4];
        }
        s[col] = acc0;
        s[col + 1] = acc1;
        s[col + 2] = acc2;
        s[col + 3] = acc3;
        s[col + 4] = acc4;
        col += 5;
    }
    /* Columns left over when 5 does not divide N are summed singly. */
    while (col < N) {
        int acc = 0;
        for (int row = 0; row < N; row++) {
            acc += a[row][col];
        }
        s[col] = acc;
        col++;
    }
}
/* sumcol_byrow: Column sums computed using row-wise array access.
 * Computes sums of the columns of array a into array s.
 * s[c] = SUM(r=0..N-1) a[r][c]
 *
 * s is first cleared, then the matrix is traversed row by row and every
 * element is added to the running total of its column, so s holds
 * partial sums until the last row has been processed.
 */
static void FN_ALIGN sumcol_byrow(int a[N][N], int s[N]) {
    int r, c;
    // Initialise all sums to zero.
    for (c = 0; c < N; c++) {
        s[c] = 0;
    }
    // Iterate over all array elements, adding
    // each one onto the appropriate sum.
    for (r = 0; r < N; r++) {
        for (c = 0; c < N; c++) {
            s[c] += a[r][c];
        }
    }
}
/* sumcol_byrow_u4: Row-wise array access with the column loop unrolled
 * by a factor of 4.
 * s[c] = SUM(r=0..N-1) a[r][c]
 *
 * s is first cleared; each row then adds its elements onto the column
 * totals four columns at a time. The remainder columns are selected at
 * compile time, so N must be usable in a preprocessor #if expression.
 */
static void FN_ALIGN sumcol_byrow_u4(int a[N][N], int s[N]) {
    int r, c;
    for (c = 0; c < N; c++) {
        s[c] = 0;
    }
    for (r = 0; r < N; r++) {
        for (c = 0; c < N-3; c+=4) {
            s[c] += a[r][c];
            s[c+1] += a[r][c+1];
            s[c+2] += a[r][c+2];
            s[c+3] += a[r][c+3];
        }
        /* Here c == N - N%4: the first column not covered by the unrolled
         * loop. The additional cases handle the N%4 trailing columns. */
# if N%4 >= 1
        s[c] += a[r][c];
# endif
# if N%4 >= 2
        s[c+1] += a[r][c+1];
# endif
# if N%4 >= 3
        s[c+2] += a[r][c+2];
# endif
    }
}
/* sumcol_byrow_b2x2: Row-wise computation using 2x2 blocks.
 * s[c] = SUM(r=0..N-1) a[r][c]
 *
 * s is first cleared. The main loops walk the matrix in 2x2 blocks, two
 * rows at a time, and each block updates two column sums. When N is odd
 * the last column and the last row fall outside the 2x2 tiling and are
 * handled by the compile-time guarded sections, so N must be usable in a
 * preprocessor #if expression.
 */
static void FN_ALIGN sumcol_byrow_b2x2(int a[N][N], int s[N]) {
    int r, c;
    for (c = 0; c < N; c++) {
        s[c] = 0;
    }
    for (r = 0; r < N-1; r+=2) {
        for (c = 0; c < N-1; c+=2) {
            s[c] += a[r][c] + a[r+1][c];
            s[c+1] += a[r][c+1] + a[r+1][c+1];
        }
# if N%2 != 0
        /* Odd N: c == N-1 here, the last column of this pair of rows. */
        s[c] += a[r][c] + a[r+1][c];
# endif
    }
# if N%2 != 0
    /* Process the remaining row (r == N-1 after the loop above) */
    for (c = 0; c < N-1; c+=2) {
        s[c] += a[r][c];
        s[c+1] += a[r][c+1];
    }
    /* Process the corner element a[N-1][N-1] */
    s[c] += a[r][c];
# endif
}