我有一个大约 40000 行、400 列的数据集(GRE)。我需要在 LABEL 列中,针对每个参与者(Subject)和每个试验(Trial),识别两个字符串序列:1) pret1 后跟 t1;2) pret2 后跟 t2。pret1、t1、pret2 和 t2 各自出现的次数并不重要。
我想要的是一个新列:对属于某个序列的观察值赋值 0,对不构成序列的观察值赋值 1,对无法评估的观察/行(即不包含 pret1、t1、pret2、t2 中任何一个的行)赋值 NA。下面是一个可重现的例子。
# Reproducible example: 28 observations from 2 subjects, 2 trials per subject
# and 7 rows per trial.
LABEL <- c(
  "vc", "gfda", "gsgs_pret2", "sfgsgt_pret2", "hhjcf_t2", "xa_postt2", "sgs",
  "sgsd", "fgnx_pret1", "wqraffsd_pret1", "zdgn_t1", "with_postt1", "nzf",
  "great_postt2",
  "l", "fjs", "ssmlk_t1", "gjkgj_t1", "djdj_postt1", "ityufhj", "eyhjjfjfhjf",
  "dghjdj_pret2", "gjkt_t2", "kuutt_t2", "truetye_postt2", "fj", "hgfg_pret1",
  "zetytu"
)
Subject <- c(rep(1, 14), rep(2, 14))
Trial <- rep(seq_len(4), each = 7)

# Expected result, stored as strings: "0" = sequence present, "1" = sequence
# absent, "NA" = row carries none of the four markers.
OUTPUT <- c(
  "NA", "NA", "0", "0", "0", "NA", "NA",
  "NA", "0", "0", "0", "NA", "NA", "NA",
  "NA", "NA", "1", "1", "NA", "NA", "NA",
  "0", "0", "0", "NA", "NA", "1", "NA"
)

GRE <- data.frame(
  LABEL = LABEL,
  Subject = Subject,
  Trial = Trial,
  OUTPUT = OUTPUT
)
答案 0 :(得分:1)
这是一个使用 dplyr 的解决方案(dplyr 并非必需,只是我偏好它的语法)。它几乎肯定还可以进一步简化并提高可读性,但至少它给出了预期的输出:
library(dplyr)

# Per (Trial, Subject) group, flag each row by the suffix of its LABEL and
# derive OUTPUT_2:
#   0  - the row carries a pretN/tN label and both pretN and tN (same N) occur
#        somewhere in the group,
#   1  - the row carries a pretN/tN label but its counterpart is absent from
#        the group,
#   NA - the row carries none of the four suffixes.
# Only the presence of both labels within the group is tested; their relative
# order is not.
res <- GRE %>%
  group_by(Trial, Subject) %>%
  mutate(
    pret1 = grepl("_pret1$", LABEL),
    t1 = grepl("_t1$", LABEL),
    pret2 = grepl("_pret2$", LABEL),
    t2 = grepl("_t2$", LABEL),
    seq_ = ((pret1 | t1) & any(pret1) & any(t1)) |
      ((pret2 | t2) & any(pret2) & any(t2)),
    no_seq_ = ((pret1 | t1) & !(any(pret1) & any(t1))) |
      ((pret2 | t2) & !(any(pret2) & any(t2))),
    OUTPUT_2 = case_when(
      seq_ ~ 0L,
      no_seq_ ~ 1L,
      TRUE ~ NA_integer_
    )
  ) %>%
  ungroup()
# Append `%T>% print(n = 28)` (magrittr tee pipe) to also print every row.
# # A tibble: 28 × 11
# LABEL Subject Trial OUTPUT pret1 t1 pret2 t2 seq_ no_seq_ OUTPUT_2
# <fctr> <dbl> <int> <fctr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <int>
# 1 vc 1 1 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 2 gfda 1 1 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 3 gsgs_pret2 1 1 0 FALSE FALSE TRUE FALSE TRUE FALSE 0
# 4 sfgsgt_pret2 1 1 0 FALSE FALSE TRUE FALSE TRUE FALSE 0
# 5 hhjcf_t2 1 1 0 FALSE FALSE FALSE TRUE TRUE FALSE 0
# 6 xa_postt2 1 1 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 7 sgs 1 1 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 8 sgsd 1 2 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 9 fgnx_pret1 1 2 0 TRUE FALSE FALSE FALSE TRUE FALSE 0
# 10 wqraffsd_pret1 1 2 0 TRUE FALSE FALSE FALSE TRUE FALSE 0
# 11 zdgn_t1 1 2 0 FALSE TRUE FALSE FALSE TRUE FALSE 0
# 12 with_postt1 1 2 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 13 nzf 1 2 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 14 great_postt2 1 2 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 15 l 2 3 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 16 fjs 2 3 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 17 ssmlk_t1 2 3 1 FALSE TRUE FALSE FALSE FALSE TRUE 1
# 18 gjkgj_t1 2 3 1 FALSE TRUE FALSE FALSE FALSE TRUE 1
# 19 djdj_postt1 2 3 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 20 ityufhj 2 3 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 21 eyhjjfjfhjf 2 3 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 22 dghjdj_pret2 2 4 0 FALSE FALSE TRUE FALSE TRUE FALSE 0
# 23 gjkt_t2 2 4 0 FALSE FALSE FALSE TRUE TRUE FALSE 0
# 24 kuutt_t2 2 4 0 FALSE FALSE FALSE TRUE TRUE FALSE 0
# 25 truetye_postt2 2 4 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 26 fj 2 4 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# 27 hgfg_pret1 2 4 1 TRUE FALSE FALSE FALSE FALSE TRUE 1
# 28 zetytu 2 4 NA FALSE FALSE FALSE FALSE FALSE FALSE NA
# OUTPUT stores missing values as the literal string "NA". Turn those into real
# NA values before converting to integer, so the comparison does not depend on
# (and does not emit) an "NAs introduced by coercion" warning.
expected <- as.character(res$OUTPUT)
expected[expected %in% "NA"] <- NA_character_
identical(as.integer(expected), res$OUTPUT_2)
# [1] TRUE
答案 1 :(得分:0)
我有这段代码,但我的结果与你的例子不同:
确定哪些标签具有模式
# Row indices of the labels that contain any of the four markers.
havePattern <- grep(pattern = "_t1|_t2|pret1|pret2", x = GRE$LABEL)
创建新列
# Start with NA in every row; rows matching the pattern are filled in later.
GRE[["OUTPUT2"]] <- NA
拆分文字并与字母匹配。如果是一个序列,则函数返回1,如果不是0.仅适用于具有模式
# Score only the rows selected by havePattern; other rows are left untouched.
# For each label: split it into single characters, map every lowercase letter
# to its position in the alphabet (all other characters are dropped), and
# return 1 when any two neighbouring remaining letters are consecutive in the
# alphabet (difference of exactly 1), otherwise 0.
#
# as.character() keeps this working when LABEL is a factor (strsplit() rejects
# non-character input), and vapply() guarantees a numeric vector even when no
# row matches (sapply() would return an empty list in that case).
GRE$OUTPUT2[havePattern] <- vapply(
  as.character(GRE$LABEL[havePattern]),
  function(x) {
    str_sp <- strsplit(x, "")[[1]]
    numb <- na.omit(match(str_sp, letters))
    any(diff(numb) == 1) * 1
  },
  numeric(1),
  USE.NAMES = FALSE
)
GRE
LABEL Subject Trial OUTPUT OUTPUT2
1 vc 1 1 NA NA
2 gfda 1 1 NA NA
3 gsgs_pret2 1 1 0 0
4 sfgsgt_pret2 1 1 0 1
5 hhjcf_t2 1 1 0 0
6 xa_postt2 1 1 NA NA
7 sgs 1 1 NA NA
8 sgsd 1 2 NA NA
9 fgnx_pret1 1 2 0 1
10 wqraffsd_pret1 1 2 0 1
11 zdgn_t1 1 2 0 0
12 with_postt1 1 2 NA NA
13 nzf 1 2 NA NA
14 great_postt2 1 2 NA NA
15 l 2 3 NA NA
16 fjs 2 3 NA NA
17 ssmlk_t1 2 3 1 0
18 gjkgj_t1 2 3 1 1
19 djdj_postt1 2 3 NA NA
20 ityufhj 2 3 NA NA
21 eyhjjfjfhjf 2 3 NA NA
22 dghjdj_pret2 2 4 0 1
23 gjkt_t2 2 4 0 1
24 kuutt_t2 2 4 0 0
25 truetye_postt2 2 4 NA NA
26 fj 2 4 NA NA
27 hgfg_pret1 2 4 1 1
28 zetytu 2 4 NA NA