我正在尝试并行化我的代码以进行最小二乘拟合。顺序版本给出了正确的答案。并行版本编译并运行但我的答案不正确。我想知道我是否可以在这里得到任何帮助。
在此处编写代码:
upickle
我上面使用的模块workArray如下:
PROGRAM learn_parallel
use workArrays_mod
use real_precision
!$ USE OMP_LIB, ONLY: OMP_GET_NUM_THREADS, OMP_GET_THREAD_NUM
IMPLICIT NONE
real(kind=r8), dimension(:,:,:), allocatable :: x, coeffs
real(kind=r8), dimension(:), allocatable :: time, alphastar
real(kind=r8) :: xbarlim1
integer :: ix, iy, it, IERR, nt, ncols
integer :: ilaenv
nt = 6
ncols = 2
allocate(x(100, 100, nt), time(nt), coeffs(100,100,ncols))
x = 0
do ix=1,100
do iy=1,100
x(ix, iy, :) = (/ 1.1, 2.1, 3.1, 4.0, 4.9, 5.9 /)
end do
end do
time = (/ 1.2, 2.3, 3.0, 3.8, 4.7, 5.9 /)
allocate (tmp_mat(6, ncols), rtmp_vec1(6), rtmp_vec2(6), &
& itmp_vec1(6), itmp_vec2(6))
nb = ILAENV(1,'DGELS','N',size(tmp_mat, 1),size(tmp_mat, 2),-1,-1)
CALL OMP_SET_DYNAMIC(.FALSE.)
!$OMP PARALLEL DO &
!The SHARED list contains shared variables across all threads.
!$OMP& SHARED(x, time, xbarlim1, coeffs) &
!The PRIVATE list contains uninitialized variables every
!thread should have a private copy of.
!$OMP& PRIVATE(alphastar, IERR) &
!$OMP& DEFAULT (NONE)
do ix = 1, 100
do iy = 1,1 !100
print *, OMP_GET_NUM_THREADS(), ",", OMP_GET_THREAD_NUM()
call lsfit(time, x(ix, iy, :), 0, xbarlim1, alphastar, IERR)
coeffs(ix, iy, :) = alphastar
deallocate (alphastar)
print *, coeffs(ix, iy, :)
end do
end do
!$OMP END PARALLEL DO
deallocate (tmp_mat, rtmp_vec1, rtmp_vec2, itmp_vec1, itmp_vec2)
deallocate (x, time, coeffs)
CONTAINS
SUBROUTINE lsfit(t, u, K, xbarlim1, alphaStar, IERR)
USE workarrays_mod
USE REAL_PRECISION
IMPLICIT NONE
INTEGER, INTENT(IN) :: K
REAL(KIND=R8), INTENT(IN) :: t(:), u(:)
REAL(KIND=R8), INTENT(IN) :: xbarlim1
INTEGER :: i, j, ccol, ncols, lwork, DeAllocateStatus
INTEGER :: info, IERR, sz, m, mn
REAL(KIND=R8) :: sigma, tau1, detR1, detXtX
REAL(KIND=R8), DIMENSION(:,:), ALLOCATABLE:: X
REAL(KIND=R8), DIMENSION(:), ALLOCATABLE :: work
REAL(KIND=R8), DIMENSION(:), ALLOCATABLE, intent(inout) :: alphaStar
M = size(t, 1)
ncols = 2
ALLOCATE (X(m, ncols))
ALLOCATE (alphaStar(ncols))
X(:,1) = (/ (1, i = 1,M) /)
X(:,2) = t
lwork = min(M, ncols) + max(max(M, ncols), 1) * nb
ALLOCATE (work(lwork))
!use QR factorization to solve the system, way more stable
!drawback of DGELS: It is assumed that A has full rank.
tmp_mat(1:M, 1:ncols) = X
rtmp_vec1(1:M) = u
CALL DGELS('N', M, ncols, 1,tmp_mat(1:M, 1:ncols),M, &
& rtmp_vec1(1:M),M, work(1:lwork), &
& lwork, info)
alphaStar = rtmp_vec1(1:ncols)
DEALLOCATE (X, work, stat = DeAllocateStatus)
END SUBROUTINE lsfit
END PROGRAM learn_parallel
线性回归问题的正确答案是 (-0.1432,1.0506)这是我运行此代码预并行化时得到的。