在R中选择不同列组合的更快捷方式

时间:2018-01-29 19:04:36

标签: r dataframe

我的数据集是:

# Console output of dput(new): a reproducible literal of the example data
# frame `new`. It has 10 rows (row names 12:21) and 22 columns: seven
# descriptive columns (Year, scallopid, Region, Area, Station, Depth, Lat)
# followed by fifteen numeric measurement columns ring1..ring15, many of
# which contain NA. Year and Area are stored as factors.
dput(new)
structure(list(Year = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("1982", "1983", "1985", "1989", "1994", 
"1995", "1998"), class = "factor"), scallopid = 11:20, Region = c("GB", 
"GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB", "GB"), Area = structure(c(3L, 
3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L), .Label = c("Nantucket Lightship", 
"NE GB", "SW GB"), class = "factor"), Station = c("1982288", 
"1982288", "1982288", "1982288", "1982288", "1982329", "1982329", 
"1982329", "1982329", "1982329"), Depth = c(68, 68, 68, 68, 68, 
94, 94, 94, 94, 94), Lat = c(40.833333, 40.833333, 40.833333, 
40.833333, 40.833333, 41.183333, 41.183333, 41.183333, 41.183333, 
41.183333), ring1 = c(79.31, 57.57, 75.13, 79.14, 71.5, 76.75, 
72.06, 59.98, 86.6, 67.7), ring2 = c(104.03, 100.81, 95.95, 109.95, 
105.65, 104.1, 98.19, 93.93, 105.31, 100.57), ring3 = c(124.58, 
122.71, 108.69, 122.14, 119.28, 128.48, 122.2, 110.86, 127.91, 
110.6), ring4 = c(132.44, 129.75, 116.96, NA, NA, 135.48, 128.28, 
119.62, 141.16, 124.22), ring5 = c(NA, NA, 123.42, NA, NA, 141.22, 
135.16, 129.49, 148.86, 132.73), ring6 = c(NA, NA, 129.24, NA, 
NA, 145.51, 140.31, 138.12, 152.15, 138.12), ring7 = c(NA, NA, 
134.44, NA, NA, 148.62, 145.08, 143.71, NA, 141.71), ring8 = c(NA, 
NA, 138.2, NA, NA, 152.3, 147.98, 145.43, NA, 144.9), ring9 = c(NA, 
NA, 140.81, NA, NA, 155.9, 150.36, NA, NA, 145.96), ring10 = c(NA, 
NA, 143.13, NA, NA, 158.5, NA, NA, NA, NA), ring11 = c(NA, NA, 
144.81, NA, NA, NA, NA, NA, NA, NA), ring12 = c(NA, NA, 147.39, 
NA, NA, NA, NA, NA, NA, NA), ring13 = c(NA, NA, 148.74, NA, NA, 
NA, NA, NA, NA, NA), ring14 = c(NA, NA, 149.05, NA, NA, NA, NA, 
NA, NA, NA), ring15 = c(NA, NA, 150.62, NA, NA, NA, NA, NA, NA, 
NA)), .Names = c("Year", "scallopid", "Region", "Area", "Station", 
"Depth", "Lat", "ring1", "ring2", "ring3", "ring4", "ring5", 
"ring6", "ring7", "ring8", "ring9", "ring10", "ring11", "ring12", 
"ring13", "ring14", "ring15"), row.names = 12:21, class = "data.frame")

我想创建一个包含前7列的新数据集,然后是其余列的不同组合。

手动的冗长写法:

# Rings 1 and 2: the seven identifier columns plus ring columns 8 and 9.
gb1 <- new[, 1:9]
names(gb1)[8:9] <- c("ring1", "ring2")

# Rings 2 and 3 (columns 9 and 10), relabelled as ring1/ring2.
gb2 <- new[, c(1:7, 9:10)]
names(gb2)[8:9] <- c("ring1", "ring2")

# Rings 3 and 4 (columns 10 and 11), relabelled as ring1/ring2.
gb3 <- new[, c(1:7, 10:11)]
names(gb3)[8:9] <- c("ring1", "ring2")

依此类推。我对所有列都执行此操作,然后将这些结果重新按行绑定成一个数据框。

有简化的方法吗?

我正在寻找的最终结果是:

  Year scallopid Region         Area Station Depth      Lat  ring1  ring2
2  1982         1    MAB VA/NC Border  198297    50 36.68333  78.56  95.45
21 1982         1    MAB VA/NC Border  198297    50 36.68333  95.45 109.49
22 1982         1    MAB VA/NC Border  198297    50 36.68333 109.49 117.20
23 1982         1    MAB VA/NC Border  198297    50 36.68333 117.20 125.86
24 1982         1    MAB VA/NC Border  198297    50 36.68333 125.86 130.75
25 1982         1    MAB VA/NC Border  198297    50 36.68333 130.75 133.32

对于每个扇贝,第一行的ring1值是原始ring1列的值,ring2值是同一行中下一个环列的值,因此对于第一条记录,它是ring2列的值。对于同一个扇贝的下一行:ring1是ring2列的值,ring2是ring3列的值,依此类推。

2 个答案:

答案 0 :(得分:1)

扩展@joran的建议,我们可以在用gather把数据转成长格式之后,将每个环标记为ring1或ring2,然后就可以按这个新的分组进行操作:

library(tidyverse)

# Stack the fifteen ring columns into long form (key column `ring`, value
# column `value`), then label every measurement by the parity of its ring
# number: odd-numbered rings become "ring1", even-numbered rings "ring2".
new.long <- new %>%
  gather(key = ring, value = value, ring1:ring15) %>%
  group_by(ring) %>%
  mutate(
    ring_group = ifelse(
      # Strip the "ring" prefix to get the ring number, then test for odd.
      as.numeric(gsub("ring", "", ring)) %% 2 == 1,
      "ring1",
      "ring2"
    )
  )

更新:您更新后的示例并没有使用您发布的数据,而您发布的数据中每个scallopid只有一行,所以我不确定下面的代码是否在正确的方向上。请告诉我。

# Build consecutive ring pairs for every scallop. After stacking the ring
# columns, `ring1` holds one ring measurement and `ring2` holds the next
# ring's measurement for the same scallop (NA after the scallop's last ring).
#
# Fixes relative to the earlier version of this snippet:
#   * the pipe after the first step was missing, so gather() was called
#     without a data frame and failed;
#   * only odd-numbered rings were kept, which cannot produce the requested
#     consecutive pairs (ring1/ring2, ring2/ring3, ...);
#   * grouping by `ring` as well as `scallopid` left a single row per group
#     in the posted data, so lead() always returned NA.
new.long <- new %>%
  # Stack ring1..ring15 into a key column `ring` and a value column `ring1`.
  # gather() stacks the columns in order, so within each scallop the rows
  # run ring1, ring2, ..., ring15.
  gather(ring, ring1, ring1:ring15) %>%
  group_by(scallopid) %>%
  # The next ring's measurement for the same scallop.
  mutate(ring2 = lead(ring1)) %>%
  # Drop the grouping so later steps are not silently done per scallop.
  ungroup()

答案 1 :(得分:1)

可以考虑把您手动执行的操作放进一个循环:依次遍历 ring 列名,构建一个数据框列表,最后再把它们按行绑定在一起。最后一次迭代的 ring2 为空,因此函数内部需要一个 if 判断来处理这种情况:

# The first seven columns identify a scallop; every later column is a ring
# measurement.
id_cols <- names(new)[1:7]
ring_cols <- names(new)[8:ncol(new)]

# One data frame per ring column: the identifier columns plus that ring and
# the ring that follows it, always labelled "ring1" and "ring2".
dfList <- lapply(seq_along(ring_cols), function(k) {
  if (k < length(ring_cols)) {
    pair <- new[, c(id_cols, ring_cols[k], ring_cols[k + 1])]
  } else {
    # The last ring has no successor, so its partner column is all NA.
    pair <- new[, c(id_cols, ring_cols[k])]
    pair$ring2 <- NA
  }

  names(pair)[8:9] <- c("ring1", "ring2")
  pair
})

# Stack all per-ring data frames into a single result.
finaldf <- do.call(rbind, dfList)