从数据集查询

时间:2016-01-07 21:50:14

标签: r merge dplyr

我有两个数据集,如下所示。

dat1 <- read.table(header=TRUE, text="
ID  log Dist
ab7 1.1 2
                   ab8  1.6 1.5
                   ab21 3   1
                   ab3  2.05    1.09
                   ab300    1.5 0.45
                   ab4  1.78    1.11
                   ab10 1.9 2
                   ab501    1.5 0.2
                  ")

dat1
     ID  log Dist
1   ab7 1.10 2.00
2   ab8 1.60 1.50
3  ab21 3.00 1.00
4   ab3 2.05 1.09
5 ab300 1.50 0.45
6   ab4 1.78 1.11
7  ab10 1.90 2.00
8 ab501 1.50 0.20


dat2 <- read.table(header=TRUE, text="
ID  LFrom   LTo It1 It2 It3 It4
ab7 1   1.05    47  152 259 140
                  ab7   1.05    1.96    29  45  39  30
                  ab7   1.96    2.35    59  65  47  40
                  ab7   2.35    4.45    27  36  31  37
                  ab7   4.45    5   58  60  60  56
                  ab8   1.1 2.1 88  236 251 145
                  ab8   2.1 3.1 51  66  47  43
                  ab8   3.1 3.5 31  63  46  37
                  ab8   3.5 3.8 58  35  31  51
                  ab8   3.8 3.9 29  40  30  48
                  ab21  1.2 2.1 72  263 331 147
                  ab3   1   2   71  207 290 242
                  ab3   2   3   22  38  64  46
                  ab3   3   4   35  35  43  61
                  ab3   4   4.5 42  37  33  53
                  ab300 1   2   54  65  51  67
                  ab4   1.2 2.1 67  38  54  24
                  ab4   2.1 2.3 67  30  20  50
                  ab4   2.3 9.1 67  27  34  39
                  ab10  1.1 2   64  56  21  34
                  ab501 1   2   47  152 259 140
                  ")

dat2

      ID LFrom  LTo It1 It2 It3 It4
1    ab7  1.00 1.05  47 152 259 140
2    ab7  1.05 1.96  29  45  39  30
3    ab7  1.96 2.35  59  65  47  40
4    ab7  2.35 4.45  27  36  31  37
5    ab7  4.45 5.00  58  60  60  56
6    ab8  1.10 2.10  88 236 251 145
7    ab8  2.10 3.10  51  66  47  43
8    ab8  3.10 3.50  31  63  46  37
9    ab8  3.50 3.80  58  35  31  51
10   ab8  3.80 3.90  29  40  30  48
11  ab21  1.20 2.10  72 263 331 147
12   ab3  1.00 2.00  71 207 290 242
13   ab3  2.00 3.00  22  38  64  46
14   ab3  3.00 4.00  35  35  43  61
15   ab3  4.00 4.50  42  37  33  53
16 ab300  1.00 2.00  54  65  51  67
17   ab4  1.20 2.10  67  38  54  24
18   ab4  2.10 2.30  67  30  20  50
19   ab4  2.30 9.10  67  27  34  39
20  ab10  1.10 2.00  64  56  21  34
21 ab501  1.00 2.00  47 152 259 140

我需要先获取一个与ID匹配的查询数据集,然后检查dat1的第二和第三列以执行查询。条件是检查logLfromLto之间dat2数据的位置。例如,对于ab3log为2.05,Dist为1.09。因此,范围是(2.05, [2.05+1.09=]3.14)。对于row 13中的dat2(Lfrom, Lto)=(2,3)以及row 14中的dat2(Lfrom, Lto)=(3,4)dat1的范围(2.05, 3.14)位于这两行的(2,4)范围内。所以它满足了条件。最终的数据集如下:

      ID  log Dist LFrom  LTo It1 It2 It3 It4
1    ab7 1.10 2.00  1.05 1.96  29  45  39  30
2    ab7 1.10 2.00  1.96 2.35  59  65  47  40
3    ab7 1.10 2.00  2.35 4.45  27  36  31  37
4    ab8 1.60 1.50  1.10 2.10  88 236 251 145
5    ab8 1.60 1.50  2.10 3.10  51  66  47  43
6    ab3 2.05 1.09  2.00 3.00  22  38  64  46
7    ab3 2.05 1.09  3.00 4.00  35  35  43  61
8  ab300 1.50 0.45  1.00 2.00  54  65  51  67
9    ab4 1.78 1.11  1.20 2.10  67  38  54  24
10   ab4 1.78 1.11  2.10 2.30  67  30  20  50
11   ab4 1.78 1.11  2.30 9.10  67  27  34  39
12 ab501 1.50 0.20  1.00 2.00  47 152 259 140

2 个答案:

答案 0 :(得分:2)

以下代码有效:

dat3 <- data.frame()

for(i in 1:nrow(dat1)){

  d <- dat1[i,]

  # filter dat2 with ID
  d2 <- dat2[dat2$ID == d$ID,]

  # filter dat2 with range interference
  r1 <- c(d$log, d$log + d$Dist)
  d2 <- d2[apply(d2[,2:3], 1, function(x){r1[2] > x[1] & x[2] > r1[1]}),]

  # filter dat2 with range and collect data to dat3
  if(nrow(d2) > 0){
    r2 <- range(d2[,2:3])
    if(r1[1] >= r2[1] & r1[2] <= r2[2])
      dat3 <- rbind(dat3, data.frame(ID=d$ID, log=d$log, Dist=d$Dist, d2[,-1]))
  }
}

rownames(dat3) <- 1:nrow(dat3)
print(dat3)

答案 1 :(得分:1)

Dplyr解决方案:

dat2 %>%
  group_by(ID) %>%
  mutate(log = dat1$log[dat1$ID == unique(ID)]) %>%
  mutate(Dist = dat1$Dist[dat1$ID == unique(ID)]) %>%
  mutate(LFromMin = min(LFrom)) %>%
  mutate(LToMax = max(LTo)) %>%
  mutate(upper = log+Dist) %>%
  filter(log > LFromMin & upper<LToMax) %>%
  filter(LFrom >= LFrom[which(LFrom-log== max((LFrom-log)[LFrom-log < 0]))]) %>%
  filter(LTo <= LTo[which(LTo-upper == min((LTo - upper)[LTo-upper>0]))]) %>%
  select(c(ID, log, Dist, LFrom, LTo, It1, It2, It3, It4))