R中字母串的序列

时间:2017-04-25 14:22:05

标签: r string dataframe sequence

我有一个包含大约 40000 行和 400 列的数据集(GRE)。我需要在 LABEL 列中,针对每个参与者(Subject)和每个试验(Trial),识别两个字符串序列:1) pret1 后跟 t1;2) pret2 后跟 t2。pret1、t1、pret2、t2 各自出现的次数并不重要。

我想要的是一个新列:存在序列的观测值赋值为 0,不存在序列时赋值为 1,无法评估的观测/行(即不包含 pret1、t1、pret2、t2 中任何一个的行)赋值为 NA。下面是一个可重复的例子。

# Reproducible example: 28 labelled rows, one line group per trial
# (2 subjects x 2 trials each x 7 rows).
LABEL <- c(
  # Subject 1, Trial 1
  "vc", "gfda", "gsgs_pret2", "sfgsgt_pret2", "hhjcf_t2", "xa_postt2", "sgs",
  # Subject 1, Trial 2
  "sgsd", "fgnx_pret1", "wqraffsd_pret1", "zdgn_t1", "with_postt1", "nzf",
  "great_postt2",
  # Subject 2, Trial 3
  "l", "fjs", "ssmlk_t1", "gjkgj_t1", "djdj_postt1", "ityufhj", "eyhjjfjfhjf",
  # Subject 2, Trial 4
  "dghjdj_pret2", "gjkt_t2", "kuutt_t2", "truetye_postt2", "fj", "hgfg_pret1",
  "zetytu"
)
Subject <- c(rep(1, 14), rep(2, 14))
Trial <- rep(seq_len(4), each = 7)
# Expected result supplied with the question. Missing values are stored as the
# literal string "NA", not as a real NA.
OUTPUT <- c(
  "NA", "NA", "0", "0", "0", "NA", "NA",
  "NA", "0", "0", "0", "NA", "NA", "NA",
  "NA", "NA", "1", "1", "NA", "NA", "NA",
  "0", "0", "0", "NA", "NA", "1", "NA"
)
GRE <- data.frame(
  LABEL = LABEL,
  Subject = Subject,
  Trial = Trial,
  OUTPUT = OUTPUT
)

2 个答案:

答案 0 :(得分:1)

这是一个使用 dplyr 的解决方案(dplyr 并非必需,只是我偏好它的语法)。它几乎肯定还可以进一步简化并变得更具可读性,但至少它给出了预期的输出:

library(dplyr)
# Within each (Trial, Subject) group, flag the rows whose LABEL ends in one of
# the four markers and code them in OUTPUT_2 as:
#   0  - the group contains both members of the row's pair
#        (pret1 + t1, or pret2 + t2);
#   1  - the group lacks at least one member of the row's pair;
#   NA - the row carries none of the four markers.
# NOTE(review): only co-occurrence inside the group is tested, not that the
# "pre" label comes before its partner - confirm that this is sufficient.
res <- GRE %>%
  group_by(Trial, Subject) %>%
  mutate(
    pret1 = grepl("_pret1$", LABEL),
    t1 = grepl("_t1$", LABEL),
    pret2 = grepl("_pret2$", LABEL),
    t2 = grepl("_t2$", LABEL),
    # Row belongs to a pair that is complete within the group.
    seq_ = ((pret1 | t1) & any(pret1) & any(t1)) |
      ((pret2 | t2) & any(pret2) & any(t2)),
    # Row belongs to a pair that is incomplete within the group.
    no_seq_ = ((pret1 | t1) & !(any(pret1) & any(t1))) |
      ((pret2 | t2) & !(any(pret2) & any(t2))),
    OUTPUT_2 = case_when(
      seq_ ~ 0L,
      no_seq_ ~ 1L,
      TRUE ~ NA_integer_
    )
  ) %>%
  ungroup()

# # A tibble: 28 × 11
#             LABEL Subject Trial OUTPUT pret1    t1 pret2    t2  seq_ no_seq_ OUTPUT_2
# <fctr>   <dbl> <int> <fctr> <lgl> <lgl> <lgl> <lgl> <lgl>   <lgl>    <int>
# 1              vc       1     1     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 2            gfda       1     1     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 3      gsgs_pret2       1     1      0 FALSE FALSE  TRUE FALSE  TRUE   FALSE        0
# 4    sfgsgt_pret2       1     1      0 FALSE FALSE  TRUE FALSE  TRUE   FALSE        0
# 5        hhjcf_t2       1     1      0 FALSE FALSE FALSE  TRUE  TRUE   FALSE        0
# 6       xa_postt2       1     1     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 7             sgs       1     1     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 8            sgsd       1     2     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 9      fgnx_pret1       1     2      0  TRUE FALSE FALSE FALSE  TRUE   FALSE        0
# 10 wqraffsd_pret1       1     2      0  TRUE FALSE FALSE FALSE  TRUE   FALSE        0
# 11        zdgn_t1       1     2      0 FALSE  TRUE FALSE FALSE  TRUE   FALSE        0
# 12    with_postt1       1     2     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 13            nzf       1     2     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 14   great_postt2       1     2     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 15              l       2     3     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 16            fjs       2     3     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 17       ssmlk_t1       2     3      1 FALSE  TRUE FALSE FALSE FALSE    TRUE        1
# 18       gjkgj_t1       2     3      1 FALSE  TRUE FALSE FALSE FALSE    TRUE        1
# 19    djdj_postt1       2     3     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 20        ityufhj       2     3     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 21    eyhjjfjfhjf       2     3     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 22   dghjdj_pret2       2     4      0 FALSE FALSE  TRUE FALSE  TRUE   FALSE        0
# 23        gjkt_t2       2     4      0 FALSE FALSE FALSE  TRUE  TRUE   FALSE        0
# 24       kuutt_t2       2     4      0 FALSE FALSE FALSE  TRUE  TRUE   FALSE        0
# 25 truetye_postt2       2     4     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 26             fj       2     4     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA
# 27     hgfg_pret1       2     4      1  TRUE FALSE FALSE FALSE FALSE    TRUE        1
# 28         zetytu       2     4     NA FALSE FALSE FALSE FALSE FALSE   FALSE       NA

# Compare the expected OUTPUT column with the computed OUTPUT_2 column.
# OUTPUT stores missing values as the literal string "NA"; convert those to a
# real NA first so that as.integer() does not have to coerce unparsable text
# (which would emit "NAs introduced by coercion").
expected <- as.character(res$OUTPUT)
expected[expected %in% "NA"] <- NA_character_
identical(as.integer(expected), res$OUTPUT_2)
# [1] TRUE

答案 1 :(得分:0)

我有这段代码,但我的结果与你的例子不同:

确定哪些标签具有模式

# Row indices of the labels that contain one of the four markers.
havePattern <- which(grepl("_t1|_t2|pret1|pret2", GRE$LABEL))

创建新列

# Add the result column, initialised to NA for every row.
GRE[["OUTPUT2"]] <- NA

拆分文本并与字母表进行匹配。如果存在序列,函数返回 1,否则返回 0。仅应用于具有模式的行。
# For every label selected by havePattern: split it into single characters,
# map each character to its position in `letters` (characters that are not in
# `letters` are dropped), and return 1 when any two neighbouring positions
# differ by exactly +1 (alphabetically consecutive letters), otherwise 0.
#
# as.character() keeps strsplit() working when LABEL is a factor (the
# data.frame() default before R 4.0). vapply() guarantees exactly one numeric
# per label, including a numeric(0) result when nothing matched.
GRE$OUTPUT2[havePattern] <- vapply(
  as.character(GRE$LABEL[havePattern]),
  function(x) {
    str_sp <- strsplit(x, "")[[1]]
    numb <- na.omit(match(str_sp, letters))
    any(diff(numb) == 1) * 1
  },
  numeric(1),
  USE.NAMES = FALSE
)

# Display the GRE data frame.
GRE
            LABEL Subject Trial OUTPUT OUTPUT2
1              vc       1     1     NA      NA
2            gfda       1     1     NA      NA
3      gsgs_pret2       1     1      0       0
4    sfgsgt_pret2       1     1      0       1
5        hhjcf_t2       1     1      0       0
6       xa_postt2       1     1     NA      NA
7             sgs       1     1     NA      NA
8            sgsd       1     2     NA      NA
9      fgnx_pret1       1     2      0       1
10 wqraffsd_pret1       1     2      0       1
11        zdgn_t1       1     2      0       0
12    with_postt1       1     2     NA      NA
13            nzf       1     2     NA      NA
14   great_postt2       1     2     NA      NA
15              l       2     3     NA      NA
16            fjs       2     3     NA      NA
17       ssmlk_t1       2     3      1       0
18       gjkgj_t1       2     3      1       1
19    djdj_postt1       2     3     NA      NA
20        ityufhj       2     3     NA      NA
21    eyhjjfjfhjf       2     3     NA      NA
22   dghjdj_pret2       2     4      0       1
23        gjkt_t2       2     4      0       1
24       kuutt_t2       2     4      0       0
25 truetye_postt2       2     4     NA      NA
26             fj       2     4     NA      NA
27     hgfg_pret1       2     4      1       1
28         zetytu       2     4     NA      NA