I tried to parallelize a piece of code with OpenMP, but as the number of threads increases, the code runs slower.
call OMP_set_num_threads(1) -> 16.7 seconds
call OMP_set_num_threads(4) -> 17.7 seconds
call OMP_set_num_threads(8) -> 19 seconds
System specs:
Intel Core i7-3610QM, 2.3 GHz (up to 3.2 GHz turbo), 4 cores / 8 threads, 8 GB DDR3 RAM
call OMP_set_num_threads(8)
!$omp parallel
!$omp do private(k,i,j,r,epsilonxx,epsilonyy,epsilonxy,epsilonzz,epsilonxz,&
!$omp&           epsilonyz) reduction(+:dr)
do k = 1, niac
  i = pair_i(k)
  j = pair_j(k)
  dx(1) = x(1,j) - x(1,i)
  dr = dx(1)*dx(1)
  do d = 2, dim
    dx(d) = x(d,j) - x(d,i)
    dr = dr + dx(d)*dx(d)
  enddo
  r = sqrt(dr)
  do d = 1, dim
    dvx(d) = vx(d,j) - vx(d,i)
  enddo
  if (dim.eq.3) then
    if (abs(itype(i)).gt.1000 .and. abs(itype(j)).gt.1000) then
      epsilonxx = dvx(1)*dwdx(1,k)
      epsilonyy = dvx(2)*dwdx(2,k)
      epsilonxy = (1/2.)*(dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k))
      epsilonzz = dvx(dim)*dwdx(dim,k)
      epsilonxz = (1/2.)*(dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k))
      epsilonyz = (1/2.)*(dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k))
      epsxx(i) = epsxx(i) + mass(j)*epsilonxx/rho(j)
      epsxx(j) = epsxx(j) + mass(i)*epsilonxx/rho(i)
      epsyy(i) = epsyy(i) + mass(j)*epsilonyy/rho(j)
      epsyy(j) = epsyy(j) + mass(i)*epsilonyy/rho(i)
      epsxy(i) = epsxy(i) + mass(j)*epsilonxy/rho(j)
      epsxy(j) = epsxy(j) + mass(i)*epsilonxy/rho(i)
      epszz(i) = epszz(i) + mass(j)*epsilonzz/rho(j)
      epszz(j) = epszz(j) + mass(i)*epsilonzz/rho(i)
      epsxz(i) = epsxz(i) + mass(j)*epsilonxz/rho(j)
      epsxz(j) = epsxz(j) + mass(i)*epsilonxz/rho(i)
      epsyz(i) = epsyz(i) + mass(j)*epsilonyz/rho(j)
      epsyz(j) = epsyz(j) + mass(i)*epsilonyz/rho(i)
    elseif (abs(itype(i)).lt.1000 .and. abs(itype(j)).gt.1000) then
      ! closing parentheses of the three (2/3.)*(...) expressions below are
      ! assumed: their continuations appear truncated in the listing
      epsilonxx_interface(i) = (2/3.)*(2.e0*dvx(1)*dwdx(1,k))
      epsilonxx_interface(j) = dvx(1)*dwdx(1,k)
      epsilonyy_interface(i) = (2/3.)*(2.e0*dvx(2)*dwdx(2,k))
      epsilonyy_interface(j) = dvx(2)*dwdx(2,k)
      epsilonxy_interface(i) = dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k)
      epsilonxy_interface(j) = (1/2.)*(dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k))
      epsilonzz_interface(i) = (2/3.)*(2.e0*dvx(dim)*dwdx(dim,k))
      epsilonzz_interface(j) = dvx(dim)*dwdx(dim,k)
      epsilonxz_interface(i) = dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k)
      epsilonxz_interface(j) = (1/2.)*(dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k))
      epsilonyz_interface(i) = dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k)
      epsilonyz_interface(j) = (1/2.)*(dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k))
      epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
      epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
      epsyy(i) = epsyy(i) + mass(j)*epsilonyy_interface(i)/rho(j)
      epsyy(j) = epsyy(j) + mass(i)*epsilonyy_interface(j)/rho(i)
      epsxy(i) = epsxy(i) + mass(j)*epsilonxy_interface(i)/rho(j)
      epsxy(j) = epsxy(j) + mass(i)*epsilonxy_interface(j)/rho(i)
      epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
      epszz(j) = epszz(j) + mass(i)*epsilonzz_interface(j)/rho(i)
      epsxz(i) = epsxz(i) + mass(j)*epsilonxz_interface(i)/rho(j)
      epsxz(j) = epsxz(j) + mass(i)*epsilonxz_interface(j)/rho(i)
      epsyz(i) = epsyz(i) + mass(j)*epsilonyz_interface(i)/rho(j)
      epsyz(j) = epsyz(j) + mass(i)*epsilonyz_interface(j)/rho(i)
    elseif (abs(itype(i)).gt.1000 .and. abs(itype(j)).lt.1000) then
      ! closing parentheses assumed as above
      epsilonxx_interface(j) = (2/3.)*(2.e0*dvx(1)*dwdx(1,k))
      epsilonxx_interface(i) = dvx(1)*dwdx(1,k)
      epsilonyy_interface(j) = (2/3.)*(2.e0*dvx(2)*dwdx(2,k))
      epsilonyy_interface(i) = dvx(2)*dwdx(2,k)
      epsilonxy_interface(j) = dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k)
      epsilonxy_interface(i) = (1/2.)*(dvx(1)*dwdx(2,k) + dvx(2)*dwdx(1,k))
      epsilonzz_interface(j) = (2/3.)*(2.e0*dvx(dim)*dwdx(dim,k))
      epsilonzz_interface(i) = dvx(dim)*dwdx(dim,k)
      epsilonxz_interface(j) = dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k)
      epsilonxz_interface(i) = (1/2.)*(dvx(1)*dwdx(dim,k) + dvx(dim)*dwdx(1,k))
      epsilonyz_interface(j) = dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k)
      epsilonyz_interface(i) = (1/2.)*(dvx(2)*dwdx(dim,k) + dvx(dim)*dwdx(2,k))
      epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
      epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
      epsyy(i) = epsyy(i) + mass(j)*epsilonyy_interface(i)/rho(j)
      epsyy(j) = epsyy(j) + mass(i)*epsilonyy_interface(j)/rho(i)
      epsxy(i) = epsxy(i) + mass(j)*epsilonxy_interface(i)/rho(j)
      epsxy(j) = epsxy(j) + mass(i)*epsilonxy_interface(j)/rho(i)
      epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
      epszz(j) = epszz(j) + mass(i)*epsilonzz_interface(j)/rho(i)
      epsxz(i) = epsxz(i) + mass(j)*epsilonxz_interface(i)/rho(j)
      epsxz(j) = epsxz(j) + mass(i)*epsilonxz_interface(j)/rho(i)
      epsyz(i) = epsyz(i) + mass(j)*epsilonyz_interface(i)/rho(j)
      epsyz(j) = epsyz(j) + mass(i)*epsilonyz_interface(j)/rho(i)
    endif
  endif
enddo
!$omp end do nowait
!$omp end parallel
Answer (score: 2)
The performance problem you observe comes from the very nature of the algorithm you use. Each thread picks a pair of particles, computes some values, and then modifies the eps?? values (where ?? is xx, yy, zz, etc.) of both particles. Depending on how the pair list is built, this could result in many threads trying to modify the values of neighbouring particles, or even of the same particle, at the same time. The former case leads to false sharing, which brings a big slowdown because cache lines are constantly being invalidated and reloaded from the upper cache levels or from main memory. The latter case leads to completely wrong values of the computed array elements.
The latter problem is easy to fix. One option is to use atomic updates, e.g.
!$OMP ATOMIC UPDATE
epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
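Note that every single statement that updates a shared array element needs its own ATOMIC directive, i.e. both the i-side and the j-side update of each of the six strain components.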
Another option is a CRITICAL construct, e.g.
!$OMP CRITICAL
epsxx(i) = epsxx(i) + mass(j)*epsilonxx_interface(i)/rho(j)
epsxx(j) = epsxx(j) + mass(i)*epsilonxx_interface(j)/rho(i)
epsyy(i) = epsyy(i) + mass(j)*epsilonyy_interface(i)/rho(j)
epsyy(j) = epsyy(j) + mass(i)*epsilonyy_interface(j)/rho(i)
epsxy(i) = epsxy(i) + mass(j)*epsilonxy_interface(i)/rho(j)
epsxy(j) = epsxy(j) + mass(i)*epsilonxy_interface(j)/rho(i)
epszz(i) = epszz(i) + mass(j)*epsilonzz_interface(i)/rho(j)
epszz(j) = epszz(j) + mass(i)*epsilonzz_interface(j)/rho(i)
epsxz(i) = epsxz(i) + mass(j)*epsilonxz_interface(i)/rho(j)
epsxz(j) = epsxz(j) + mass(i)*epsilonxz_interface(j)/rho(i)
epsyz(i) = epsyz(i) + mass(j)*epsilonyz_interface(i)/rho(j)
epsyz(j) = epsyz(j) + mass(i)*epsilonyz_interface(j)/rho(i)
!$OMP END CRITICAL
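Keep in mind, though, that a CRITICAL section this large serialises practically the whole loop body: it restores correctness, but it is likely to make the parallel version even slower.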
A third option is an array reduction, e.g.
!$OMP PARALLEL REDUCTION(+:epsxx,epsyy,epsxy,epszz,...)
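Spelled out, a minimal sketch of the reduction variant could look as follows, assuming the eps?? arrays have fixed size (reductions over allocatable arrays require an OpenMP 3.0+ compiler) and that dx and dvx are small per-pair scratch arrays that can simply be made private. Each thread then accumulates into its own private copy of all six arrays, which costs memory but removes both the races and the false sharing:

!$omp parallel do private(j,d,r,dr,dx,dvx,epsilonxx,epsilonyy,epsilonxy,&
!$omp&                    epsilonzz,epsilonxz,epsilonyz) &
!$omp& reduction(+:epsxx,epsyy,epsxy,epszz,epsxz,epsyz)
do k = 1, niac
  i = pair_i(k)
  j = pair_j(k)
  ! ... compute dx, dr, r, dvx and the strain-rate terms exactly as before ...
  ! all updates of epsxx(i), epsxx(j), ..., epsyz(j) stay unchanged: each
  ! thread writes to its own private copy of the arrays, and the copies are
  ! summed into the shared arrays once at the end of the loop
enddo
!$omp end parallel do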
The former problem requires that you change the algorithm. For example, you could switch to a different pair-list structure, e.g. an array of lists, where the array index is the particle number and each list contains the neighbours of that particle. Sorting the neighbour lists would (kind of) reduce the false sharing. Depending on the geometry of the particle distribution, you could also end up with severely unbalanced work, so you should think about using dynamic loop scheduling; a sketch follows below.
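As an illustration only (ntotal, nnbr and nbr are hypothetical names for the particle count, the per-particle neighbour count and the neighbour indices, and the chunk size of 64 is arbitrary), such a loop could look like this:

!$omp parallel do private(j,n,d,r,dr,dx,dvx) schedule(dynamic,64)
do i = 1, ntotal
  do n = 1, nnbr(i)
    j = nbr(n,i)   ! neighbour indices, ideally sorted for better locality
    ! ... compute the pair contribution exactly as before, but accumulate
    ! only into epsxx(i), epsyy(i), etc.; the mirror contribution to
    ! particle j is added when the loop reaches j's own neighbour list
  enddo
enddo
!$omp end parallel do

Since each thread now writes only to the entries of "its own" particle i, no atomics are needed, and the dynamic schedule absorbs the imbalance of unevenly sized neighbour lists.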