我正在尝试在R中重新创建此SAS阵列,而不会将我的数据集从宽到长(由于我的数据的大小)而换位。我看过here可以提供帮助,但似乎无法一概而论。
data esoph_pre1;
set ESOPH.Pedsf_esophagus &opts;
*get sequence of esophageal cancer;
array siterwho {*} SITERWHO1-SITERWHO3;
array site {*} SITE1-SITE3;
array yrdx{*} $ YRDX1-YRDX3;
do i = 1 to 3;
*set to 0 at b/g of loop;
esoph_index = .;
*1 rec for each instance of esophageal of the correct SITERWHO and location (SITE);
if (SITERWHO{i} in('21010') OR
site{i} in('151','152','153','154','155','158','159')) and
'2004' <= yrdx{i} <= '2013'
then esoph_index = i;
if esoph_index ne . then output;
end;
drop i;
run;
如果您本质上不熟悉SAS,那么它将遍历3列中的每一列,并确定是否应输出该行并将索引号保存为esoph_index
。
我的问题是-是否有一种方法可以在每次满足条件时输出新行,并放置与该迭代相关联的索引(即,如果在循环的第二次迭代中索引将显示2)?
这是我的尝试和期望的输出,但是希望您能提供以下输入:
#original Data Frame
dx <- data.frame(ID = c(1,2,3),
SITERWHO1 = c('21010',NA,'42322'),
SITERWHO2 = c('21010','21010','56987'),
SITERWHO3 = c(NA,NA,'21010'),
SITE1 = c('159', NA,'160'),
SITE2 = c('151', '232','160'),
SITE3 = c(NA, NA,'154'),
YEARRX1 = c('2005','2001','2004'),
YEARRX2 = c('2006','2007','2009'),
YEARRX3 = c('1998','1989','2004'),
stringsAsFactors = FALSE)
#list of codes
si <- c(as.character(151:159))
#list of years
yr <- c(as.character(2004:2013))
#list of variables names
siter <- paste0("SITERWHO",1:3)
site <- paste0("SITE", 1:3)
yeardx <- paste0("YEARRX",1:3)
#put list of variables together
df <- as.data.frame(t(data.frame(siter = siter, site = site, yeardx = yeardx,
stringsAsFactors = FALSE)),stringsAsFactors = FALSE)
#conditions work one at a time but need to get index on df
tcond <- dx[(dx[df$V1][1] == '21010'|
dx[df$V1][2] == si) &
dx[df$V1][3] == '2005',]
#can't seem to get the loop to work
lscond <- lapply(df, function(x){
dx[(dx[df[['x']]][1] == '21010'
|dx[df[['x']]][2] %in% si ) &
dx[df[['x']]][3] == yr, ]
})
#desired output
desired <- data.frame(ID = c(1,1,2,3),
SITERWHO1 = c('21010','21010',NA,'42322'),
SITERWHO2 = c('21010','21010', '21010','56987'),
SITERWHO3 = c(NA,NA,NA, '21010'),
SITE1 = c('159', '159',NA,'160'),
SITE2 = c('151', '151', '232','160'),
SITE3 = c(NA, NA,NA, '154'),
YEARRX1 = c('2005','2005','2001','2004'),
YEARRX2 = c('2006','2006', '2007','2009'),
YEARRX3 = c('1998','1998','1989','2004'),
Index = c(1,2,2,3),
stringsAsFactors = FALSE)
答案 0 :(得分:1)
library(purrr)
library(data.table) # just for %between% function
vars <- c('SITERWHO', 'SITE', 'YEARRX')
map(1:3, ~pmap_lgl(dx[paste0(vars, .x)], ~
(..1 == '21010'
| ..2 %in% c('151','152','153','154','155','158','159')
) & ..3 %between% c('2004', '2013'))) %>%
transpose %>%
map(which) %>%
imap_dfr(~dx[rep(.y, length(.x)),] %>% mutate(Index = .x))
# ID SITERWHO1 SITERWHO2 SITERWHO3 SITE1 SITE2 SITE3 YEARRX1 YEARRX2 YEARRX3 Index
# 1 1 21010 21010 <NA> 159 151 <NA> 2005 2006 1998 1
# 2 1 21010 21010 <NA> 159 151 <NA> 2005 2006 1998 2
# 3 2 <NA> 21010 <NA> <NA> 232 <NA> 2001 2007 1989 2
# 4 3 42322 56987 21010 160 160 154 2004 2009 2004 3
说明:
在这里,我们检查每一行以1结尾的列的条件。
i <- 1
pmap_lgl(dx[paste0(vars, i)], ~
(..1 == '21010'
| ..2 %in% c('151','152','153','154','155','158','159')
) & ..3 %between% c('2004', '2013'))
# [1] TRUE FALSE FALSE
然后我们还需要对以2和3结尾的用户执行此操作,因此使用map
。
map(1:3, ~pmap_lgl(dx[paste0(vars, .x)], ~
(..1 == '21010'
| ..2 %in% c('151','152','153','154','155','158','159')
) & ..3 %between% c('2004', '2013')))
# [[1]]
# [1] TRUE FALSE FALSE
#
# [[2]]
# [1] TRUE TRUE FALSE
#
# [[3]]
# [1] FALSE FALSE TRUE
您可以看到第一行的1尾列和2尾的列匹配(1 = TRUE,2 = TRUE,3 = FALSE)。但是输出并没有真正按照这种方式分组,需要进行转置。
[[1]]
[1] TRUE TRUE FALSE
[[2]]
[1] FALSE TRUE FALSE
[[3]]
[1] FALSE FALSE TRUE
然后,我们需要对此进行map
which
来获取TRUE
的索引
[[1]]
[1] 1 2
[[2]]
[1] 2
[[3]]
[1] 3
最后,我们需要从数据框中选择相应的行,如有必要,多次选择(因此rep
),并添加新变量(mutate
)
imap_dfr(~dx[rep(.y, length(.x)),] %>% mutate(Index = .x))