我需要一个快速的内存转置算法来实现我在C / C ++中的高斯卷积函数。我现在做的是
convolute_1D
transpose
convolute_1D
transpose
事实证明,使用这种方法,除非滤波器的尺寸很大(大于我的预期),否则转置比卷积耗时更长(例如,对于 1920x1080 的矩阵,当滤波器大小为 35 时,转置与卷积耗时相同)。我目前使用的转置算法采用循环分块/平铺以及 SSE 和 OpenMP。我也尝试过 AVX 版本,但并没有更快。关于如何加速,有什么建议吗?
/*
 * Transpose one 4x4 tile of floats with SSE.
 *
 * A:   top-left element of the source tile; rows are lda floats apart.
 * B:   top-left element of the destination tile; rows are ldb floats apart.
 * Both tile rows must be 16-byte aligned: _mm_load_ps/_mm_store_ps fault
 * on unaligned addresses.
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 an
 * `inline` definition with no external definition leaves an undefined
 * reference whenever the compiler emits a real call (e.g. at -O0).
 */
static inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
    __m128 row1 = _mm_load_ps(&A[0*lda]);
    __m128 row2 = _mm_load_ps(&A[1*lda]);
    __m128 row3 = _mm_load_ps(&A[2*lda]);
    __m128 row4 = _mm_load_ps(&A[3*lda]);
    /* In-register 4x4 transpose (shuffle/movelh macro from xmmintrin.h). */
    _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
    _mm_store_ps(&B[0*ldb], row1);
    _mm_store_ps(&B[1*ldb], row2);
    _mm_store_ps(&B[2*ldb], row3);
    _mm_store_ps(&B[3*ldb], row4);
}
/*
 * Transpose an n x m float matrix A (leading dimension lda) into B
 * (leading dimension ldb) using loop blocking plus the 4x4 SSE kernel;
 * blocks are distributed across threads with OpenMP.
 *
 * block_size = 16 works best per the author (tile fits in L1).
 *
 * NOTE(review): the tile interiors are walked in steps of 4 without a
 * scalar tail, so n and m are assumed to be multiples of 4 — a trailing
 * partial tile would read/write out of bounds. The aligned loads/stores
 * in the kernel additionally require 16-byte-aligned buffers and
 * lda/ldb that keep every row start aligned. TODO confirm with callers.
 *
 * `static inline` avoids the C99/C11 undefined-reference pitfall of a
 * plain `inline` definition that has no external definition.
 */
static inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb, const int block_size) {
    #pragma omp parallel for
    for (int i = 0; i < n; i += block_size) {
        for (int j = 0; j < m; j += block_size) {
            /* Clip the block to the matrix edge. */
            int max_i2 = i + block_size < n ? i + block_size : n;
            int max_j2 = j + block_size < m ? j + block_size : m;
            for (int i2 = i; i2 < max_i2; i2 += 4) {
                for (int j2 = j; j2 < max_j2; j2 += 4) {
                    transpose4x4_SSE(&A[i2*lda + j2], &B[j2*ldb + i2], lda, ldb);
                }
            }
        }
    }
}
使用AVX转置8x8浮点矩阵。它并不比四个4x4转置快。
/*
 * In-register transpose of an 8x8 float matrix held in eight __m256
 * registers; the rows are replaced in place through the reference
 * parameters (C++).
 *
 * Three stages: unpacklo/hi interleaves adjacent row pairs, shuffle_ps
 * merges them into 4-wide groups, and permute2f128 swaps the 128-bit
 * lanes so each output register holds one full column-turned-row.
 *
 * Fix: the temporaries were named __t0/__tt0 etc.; identifiers that
 * begin with a double underscore are reserved for the implementation
 * (C11 7.1.3, C++ [lex.name]), so they have been renamed.
 */
inline void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
    __m256 t0, t1, t2, t3, t4, t5, t6, t7;  // stage 1: 2-element interleave
    __m256 s0, s1, s2, s3, s4, s5, s6, s7;  // stage 2: 4-element groups
    t0 = _mm256_unpacklo_ps(row0, row1);
    t1 = _mm256_unpackhi_ps(row0, row1);
    t2 = _mm256_unpacklo_ps(row2, row3);
    t3 = _mm256_unpackhi_ps(row2, row3);
    t4 = _mm256_unpacklo_ps(row4, row5);
    t5 = _mm256_unpackhi_ps(row4, row5);
    t6 = _mm256_unpacklo_ps(row6, row7);
    t7 = _mm256_unpackhi_ps(row6, row7);
    s0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0));
    s1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2));
    s2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0));
    s3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2));
    s4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1,0,1,0));
    s5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3,2,3,2));
    s6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1,0,1,0));
    s7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3,2,3,2));
    // Stage 3: 128-bit lane merge; 0x20 takes the low lanes, 0x31 the high.
    row0 = _mm256_permute2f128_ps(s0, s4, 0x20);
    row1 = _mm256_permute2f128_ps(s1, s5, 0x20);
    row2 = _mm256_permute2f128_ps(s2, s6, 0x20);
    row3 = _mm256_permute2f128_ps(s3, s7, 0x20);
    row4 = _mm256_permute2f128_ps(s0, s4, 0x31);
    row5 = _mm256_permute2f128_ps(s1, s5, 0x31);
    row6 = _mm256_permute2f128_ps(s2, s6, 0x31);
    row7 = _mm256_permute2f128_ps(s3, s7, 0x31);
}
/*
 * Transpose one 8x8 float tile with AVX: load eight rows of A
 * (row stride lda), transpose them in registers via transpose8_ps,
 * and store the result as eight rows of B (row stride ldb).
 * A and B must be 32-byte aligned for the aligned load/store intrinsics.
 */
inline void transpose8x8_avx(float *A, float *B, const int lda, const int ldb) {
    __m256 r0 = _mm256_load_ps(A + 0 * lda);
    __m256 r1 = _mm256_load_ps(A + 1 * lda);
    __m256 r2 = _mm256_load_ps(A + 2 * lda);
    __m256 r3 = _mm256_load_ps(A + 3 * lda);
    __m256 r4 = _mm256_load_ps(A + 4 * lda);
    __m256 r5 = _mm256_load_ps(A + 5 * lda);
    __m256 r6 = _mm256_load_ps(A + 6 * lda);
    __m256 r7 = _mm256_load_ps(A + 7 * lda);

    transpose8_ps(r0, r1, r2, r3, r4, r5, r6, r7);

    _mm256_store_ps(B + 0 * ldb, r0);
    _mm256_store_ps(B + 1 * ldb, r1);
    _mm256_store_ps(B + 2 * ldb, r2);
    _mm256_store_ps(B + 3 * ldb, r3);
    _mm256_store_ps(B + 4 * ldb, r4);
    _mm256_store_ps(B + 5 * ldb, r5);
    _mm256_store_ps(B + 6 * ldb, r6);
    _mm256_store_ps(B + 7 * ldb, r7);
}
答案 0(得分:3)
我猜你最好的选择是尝试将卷积和转置结合起来 - 也就是说,当你去的时候写出转换的结果。几乎可以肯定,转置的内存带宽有限,因此减少用于转置的指令数量并没有真正帮助(因此使用AVX缺乏改进)。减少数据传递次数可以最大限度地提高性能。
答案 1(得分:0)
/*
 * Scalar fallback: transpose one 4x4 float tile.
 *
 * A:   top-left of the source tile; rows are lda floats apart.
 * B:   top-left of the destination tile; rows are ldb floats apart.
 *
 * The whole source tile is copied into locals before the first store,
 * so the routine stays correct even when the A and B tiles overlap.
 * No alignment requirement (plain scalar loads/stores).
 *
 * Declared `static inline` rather than plain `inline`: in C99/C11 an
 * `inline` definition without an external definition can leave an
 * undefined reference when a call is not inlined (e.g. at -O0).
 */
static inline void transpose4x4_naive(float *A, float *B, const int lda, const int ldb) {
    /* Snapshot all four source rows first (overlap safety). */
    const float r0[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r1[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r2[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r3[] = { A[0], A[1], A[2], A[3] };
    /* Row i of B receives column i of the source tile. */
    B[0] = r0[0];
    B[1] = r1[0];
    B[2] = r2[0];
    B[3] = r3[0];
    B += ldb;
    B[0] = r0[1];
    B[1] = r1[1];
    B[2] = r2[1];
    B[3] = r3[1];
    B += ldb;
    B[0] = r0[2];
    B[1] = r1[2];
    B[2] = r2[2];
    B[3] = r3[2];
    B += ldb;
    B[0] = r0[3];
    B[1] = r1[3];
    B[2] = r2[3];
    B[3] = r3[3];
}
奇怪的是,一台旧笔记本电脑的 CPU 竟比双路 E5-2630 v2 台式机快两倍,但这是另一个话题 :)
否则,您可能也会对以下两篇资料感兴趣:http://research.colfaxinternational.com/file.axd?file=2013%2F8%2FColfax_Transposition-7110P.pdf 以及 http://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/(后者现在需要登录……)
答案 2(得分:0)
考虑这个4x4转置。
/* 4x4 float matrix with three views of the same 64-byte storage:
 * scalar elements, four SSE registers (one row each), or two AVX
 * registers (two rows each). The __m256 member gives the struct
 * 32-byte alignment, as required by the aligned AVX intrinsics. */
struct MATRIX {
union {
float f[4][4]; /* f[row][col] scalar access */
__m128 m[4]; /* one 128-bit register per row */
__m256 n[2]; /* rows 0-1 in n[0], rows 2-3 in n[1] */
};
};
MATRIX myTranspose(MATRIX in) {
// Transpose a 4x4 float matrix held in two __m256 registers
// (two matrix rows per register).
// Per the author: compiles to ~15 instructions (when not inlined)
// and was faster than XMTranspose in their measurements.
//
// Input layout:   1  2  3  4 |  5  6  7  8    (in.n[0])
//                 9 10 11 12 | 13 14 15 16    (in.n[1])
// Wanted result:  1  5  9 13 |  2  6 10 14    (result.n[0])
//                 3  7 11 15 |  4  8 12 16    (result.n[1])
__m256 t0, t1, t2, t3, t4, t5, n0, n1;
MATRIX result;
n0 = in.n[0]; // n0 = 1, 2, 3, 4, 5, 6, 7, 8
n1 = in.n[1]; // n1 = 9, 10, 11, 12, 13, 14, 15, 16
t0 = _mm256_unpacklo_ps(n0, n1); // t0 = 1, 9, 2, 10, 5, 13, 6, 14
t1 = _mm256_unpackhi_ps(n0, n1); // t1 = 3, 11, 4, 12, 7, 15, 8, 16
t2 = _mm256_permute2f128_ps(t0, t1, 0x20); // t2 = 1, 9, 2, 10, 3, 11, 4, 12
t3 = _mm256_permute2f128_ps(t0, t1, 0x31); // t3 = 5, 13, 6, 14, 7, 15, 8, 16
t4 = _mm256_unpacklo_ps(t2, t3); // t4 = 1, 5, 9, 13, 3, 7, 11, 15
t5 = _mm256_unpackhi_ps(t2, t3); // t5 = 2, 6, 10, 14, 4, 8, 12, 16
result.n[0] = _mm256_permute2f128_ps(t4, t5, 0x20); // = 1, 5, 9, 13, 2, 6, 10, 14
result.n[1] = _mm256_permute2f128_ps(t4, t5, 0x31); // = 3, 7, 11, 15, 4, 8, 12, 16
return result;
}