我有一个大约一百万条记录的数据,每条记录有6个浮点数。我想找到共享相同的六个值的记录集,理想情况下我想在Fortran中执行它,因为其余的处理是在Fortran中完成的。对此推荐的方法是什么?最后,我希望从原始索引到新索引的映射,这是这些数据集的压缩版本,没有重复。每条记录都有其他属性,我有兴趣根据六个属性聚合组。
我尝试通过将输出导出为csv来查找这些集合,将其导入MS Access,然后查找这些集合的查询需要10秒左右才能运行。我写了一个代码http://rosettacode.org/wiki/Remove_duplicate_elements#Fortran这个(“线性搜索”?),但是有了百万条记录它在10分钟左右没有完成,我就放弃了这种方法。
方法我现在想的是调整slatec或orderpack的排名/排序例程,我认为这比我的原始代码要好。但我想知道这些事情是否已经完成并且我可以下载,或者是否有更好的方法。
修改
我说“找到重复”,但实际上我需要从原始数据记录映射到这个简化集。我希望像imap(1:n)这样的映射数组,其中imap(1),imap(4),imap(5)具有相同的值,如果那些6浮点pt。原始记录1,4和5中的值是相同的。希望这与我原先所说的并没有太大的偏差......
答案 0 :(得分:1)
这就是我最终要做的......我从ORDERPACK获取了代码mrgrnk
,并根据我的目的进行了调整。下面的子例程findmap
似乎正在按我的意愿行事。
module fndmap
use m_mrgrnk, only:mrgrnk
implicit none
contains
subroutine findmap(stkprm, stkmap )
! given 2-d real array stkprm, find a mapping described below:
!
! (identical records are assigned with same index)
! stkmap(i) == stkmap(j) iff stkprm(:,i) == stkprm(:,j)
! (order conserved)
! if i < j and stkmap(i) /= stkmap(j), then stkmap(i) < stkmap(j)
! (new index are contiguous)
! set(stkmap) == {1,2,..,maxval(stkmap)}
!
real,dimension(:,:),intent(in) :: stkprm
integer,dimension(:), intent(out) :: stkmap
integer, dimension(size(stkprm,2)) :: irngt
integer, dimension(size(stkprm,2)) :: iwork
integer :: nrec, i, j
nrec = size(stkprm,2)
! find rank of each record, duplicate records kept
call ar_mrgrnk(stkprm, irngt)
! construct iwork array, which has index of original array where the
! record are identical, and the index is youguest
i = 1
do while(i<=nrec)
do j=i+1,nrec
if (any(stkprm(:,irngt(i))/=stkprm(:,irngt(j)))) exit
enddo
iwork(irngt(i:j-1)) = minval(irngt(i:j-1))
i = j
enddo
! now construct the map, where stkmap(i) shows index of new array
! with duplicated record eliminated, original order kept
j = 0
do i=1,nrec
if (i==iwork(i)) then
j = j+1
stkmap(i) = j
else
stkmap(i) = stkmap(iwork(i))
endif
enddo
end subroutine
recursive subroutine ar_mrgrnk(xdont, irngt)
! behaves like mrgrnk of ORDERPACK, except that array is 2-d
! each row are ranked by first field, then second and so on
real, dimension(:,:), intent(in) :: xdont
integer, dimension(:), intent(out), target :: irngt
integer, dimension(size(xdont,2)) :: iwork
integer :: nfld,nrec
integer :: i, j
integer, dimension(:), pointer :: ipt
nfld=size(xdont,1)
nrec=size(xdont,2)
! rank by the first field
call mrgrnk(xdont(1,:), irngt)
! if there's only one field, it's done
if (nfld==1) return
! examine the rank to see if multiple record has identical
! values for the first field
i = 1
do while(i<=nrec)
do j=i+1,nrec
if (xdont(1,irngt(i))/=xdont(1,irngt(j))) exit
enddo
! if one-to-one, do nothing
if (j-1>i) then
! if many-to-one,
! gather those many, and rank them
call ar_mrgrnk(xdont(2:,irngt(i:j-1)),iwork)
! rearrange my rank based on those fields to the right
ipt => irngt(i:j-1)
ipt = ipt(iwork(1:j-i))
endif
i = j
enddo
if(associated(ipt)) nullify(ipt)
end subroutine
end module