如何在R中使用grep()函数来识别变量名

时间:2018-04-17 13:49:47

标签: r reshape

我正在尝试使用grep()函数为两个群体(population)中的每一个生成一个平均得分变量。我的所有代码都能运行,但avgScore.pop1和avgScore.pop2这两个变量的值没有任何区别,我认为这与群体标识符位于变量名末尾有关。

以下是我正在使用的代码示例:

# Reproducible example: average the score and the denominator over all six
# measures, separately for each population, in a wide data frame.
#
# NOTE: no `rm(list = ls())` here -- a script should not wipe the workspace of
# whoever sources it.

# Long-format example data: 4 names x 2 populations x 6 measures = 48 rows.
measure <- rep(paste0("m", 1:6), times = 8)
population <- rep(rep(c("pop1", "pop2"), each = 6), times = 4)
name <- rep(paste0("name", 1:4), each = 12)
facility <- rep(c("fac1", "fac2", "fac3"), times = c(24, 12, 12))

# The seed is reset before each draw, so both vectors are generated from the
# same random stream.
set.seed(12)
denominator <- runif(48, 10, 100)
set.seed(12)
score <- runif(48, 0, 1)

dat <- data.frame(name, facility, population, measure, denominator, score)

# First spread the measures into columns (score.m1, denominator.m1, ...).
wide1 <- reshape(
  data = dat,
  idvar = c("name", "facility", "population"),
  timevar = "measure",
  direction = "wide"
)

# Then spread the populations, giving columns such as score.m1.pop1.
wide2 <- reshape(
  data = wide1,
  idvar = c("name", "facility"),
  timevar = "population",
  direction = "wide"
)

# Row-wise average of every column named <prefix>.<measure>.<pop>.
#
# The third positional argument of grep() is `ignore.case`, not a second
# pattern, so grep("score.", names(wide2), ".pop1") selects the score columns
# of *both* populations. The population suffix has to be part of the single
# regular expression, with the dots escaped and the pattern anchored.
#
# data:   wide data frame whose columns follow the naming scheme above
# prefix: variable prefix, e.g. "score" or "denominator"
# pop:    population suffix, e.g. "pop1"
# Returns a numeric vector with one average per row of `data`.
average_columns <- function(data, prefix, pop) {
  pattern <- paste0("^", prefix, "\\..*\\.", pop, "$")
  cols <- grep(pattern, names(data))
  rowSums(data[, cols, drop = FALSE], na.rm = TRUE) / length(cols)
}

wide2$avgScore.pop1 <- average_columns(wide2, "score", "pop1")
wide2$avgScore.pop2 <- average_columns(wide2, "score", "pop2")

wide2$avgDenom.pop1 <- average_columns(wide2, "denominator", "pop1")
wide2$avgDenom.pop2 <- average_columns(wide2, "denominator", "pop2")

非常感谢任何关于如何针对每个群体、汇总所有指标(measure)的得分和分母的想法!谢谢!

3 个答案:

答案 0 :(得分:3)

你需要的是paste0。grep用于在字符向量中搜索正则表达式模式,而你想做的是把多个字符串粘贴在一起。只需在代码中用paste0代替grep来拼出列名:

# Build the exact column names ("score.m1.pop1", ..., "score.m6.pop1") from the
# measure labels. Pasting names(wide2) into the middle would produce names such
# as "score.name.pop1" that are not columns of wide2, and the subsetting would
# fail with "undefined columns selected".
measures <- paste0("m", 1:6)

wide2$avgScore.pop1 <- rowSums(wide2[, paste0("score.", measures, ".pop1")],
                               na.rm = TRUE) / 6

wide2$avgScore.pop2 <- rowSums(wide2[, paste0("score.", measures, ".pop2")],
                               na.rm = TRUE) / 6

如果你想找到所有以“score”开头、并以“.pop1”结尾的变量,可以在这里使用grep:
# Positions of the columns named score.<measure>.pop1 in wide2.
grep(pattern = "score\\.[^.]+\\.pop1", x = colnames(wide2))

它选出的列与下面这行大致相同(grep返回的是列的位置,paste0返回的是列名):
# Column names score.m1.pop1 ... score.m6.pop1, built from the measure labels
# (names(wide2) must not be pasted in here: that yields non-existent names).
paste0("score.", paste0("m", 1:6), ".pop1")

答案 1 :(得分:2)

您是否有可能在寻找aggregate函数?

> aggregate(score ~ population + measure, data = dat, FUN = sum)
   population measure    score
1        pop1      m1 1.357344
2        pop2      m1 2.062984
3        pop1      m2 2.310233
4        pop2      m2 1.845279
5        pop1      m3 2.096953
6        pop2      m3 1.968227
7        pop1      m4 1.288433
8        pop2      m4 1.705252
9        pop1      m5 1.654866
10       pop2      m5 1.504966
11       pop1      m6 1.774900
12       pop2      m6 2.510683

或者使用dplyr:

library(dplyr)

# Total score for every population/measure combination.
dat %>%
  group_by(population, measure) %>%
  summarise(sum(score))

# A tibble: 12 x 3
# Groups:   population [?]
   population measure `sum(score)`
       <fctr>  <fctr>        <dbl>
 1       pop1      m1     1.357344
 2       pop1      m2     2.310233
 3       pop1      m3     2.096953
 4       pop1      m4     1.288433
 5       pop1      m5     1.654866
 6       pop1      m6     1.774900
 7       pop2      m1     2.062984
 8       pop2      m2     1.845279
 9       pop2      m3     1.968227
10       pop2      m4     1.705252
11       pop2      m5     1.504966
12       pop2      m6     2.510683

答案 2 :(得分:0)

string like '12,34,46,767'

这应该做你想要的。它使用grepl来分别匹配所有以".pop1"和".pop2"结尾的名称,并返回一个逻辑向量来指示所需变量的位置:

wide2$avgScore.pop1 <- rowSums(wide2[, grepl('.pop1', names(wide2))],na.rm=T)/ 6
wide2$avgScore.pop2 <- rowSums(wide2[, grepl('.pop2', names(wide2))], na.rm=T)/ 6

不确定这是否就是您想要的,但如果只是为了得到这些数字,还有更简单的解决方案,直接使用原始数据即可,例如:

aggregate(cbind(score, denominator) ~ name + facility + population, data = dat, FUN = mean)