将非独立数据从一个数据框重新整理到另一个数据框

时间:2015-07-28 13:58:40

标签: r dplyr plyr data-manipulation

我正在尝试把一些数据从一个数据框重新整理到另一个数据框。我认为所需的解决方案应该在 plyr 包中,但我一直没能找到完整的解法。

概要

我有一系列 Transects,每个 Transect 由数量不等的 Points 组成。每个 Transect 可以划分为若干个由三个 Points 组成的非独立(首尾相互重叠)分组,每一组构成一个 Leg。

输入数据:我有每个 Transect 上每个 Point 的坐标:

# Subset of Points data
structure(list(Transect = structure(c(73L, 73L, 73L, 73L, 73L, 73L, 72L, 72L, 72L, 72L, 72L, 72L, 23L, 23L, 23L, 14L, 14L, 14L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L),
                                    .Label = c("B", "D", "E", "F", "G", "L1", "L2", "L3", "L4", "L5", "L9", "S101", "S105", "S109", "S116", "S117", "S118", "S119", "S121", "S122", "S123", "S124", "S125", "S126", "T001", "T002", "T003", "T004", "T006", "T007", "T008", "T009", "T010", "T011", "T012", "T013", "T014", "T015", "T016", "T017", "T018", "T019", "T022", "T023", "T024", "T026", "T028", "T029", "T030", "T031", "T032", "T033", "T035", "T039", "T040", "T043", "T049", "T050", "T051", "T056", "T060", "T061", "T062", "T063", "T065", "T066", "T067", "T068", "T072", "T073", "T074", "T075", "T076", "T077", "T078", "T079", "T082N", "T083", "T087", "T088", "T092", "T093", "T095", "T096", "T097"),
                                    class = "factor"),
               Point = c(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21),
               x = c(38.53, 38.53409, 38.53818, 38.53396, 38.52984, 38.53006, 38.45, 38.44936, 38.44942, 38.45324, 38.45743, 38.45382, 38.29102, 38.29013, 38.28935, 37.7798, 37.7803, 37.78109, 38.08238, 38.07932, 38.07534, 38.07143, 38.06737, 38.06339, 38.0596, 38.05605, 38.05261, 38.0489, 38.0444, 38.04113, 38.03668, 38.03237, 38.02786, 38.0234, 38.01895, 38.01524, 38.01481, 38.01465, 38.013), 
               y = c(4.23, 4.22811, 4.22622, 4.22465, 4.22281, 4.22553, 4.22, 4.22445, 4.22897, 4.22659, 4.22481, 4.22239, 5.37832, 5.37391, 5.36949, 5.0068, 5.01126, 5.0157, 4.95384, 4.95693, 4.95914, 4.96122, 4.96315, 4.96527, 4.96772, 4.97052, 4.97344, 4.97601, 4.97695, 4.97998, 4.98097, 4.98002, 4.97972, 4.98019, 4.98, 4.98272, 4.98715, 4.99165, 4.9958)),
          .Names = c("Transect", "Point", "x", "y"),
          row.names = c(NA, -39L),
          class = "data.frame")

以及每个 Transect 中每个 Leg 的标识(ID):

# Subset of Legs IDs
structure(list(Transect = structure(c(73L, 73L, 73L, 72L, 72L, 72L, 23L, 14L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L),
                                    .Label = c("B", "D", "E", "F", "G", "L1", "L2", "L3", "L4", "L5", "L9", "S101", "S105", "S109", "S116", "S117", "S118", "S119", "S121", "S122", "S123", "S124", "S125", "S126", "T001", "T002", "T003", "T004", "T006", "T007", "T008", "T009", "T010", "T011", "T012", "T013", "T014", "T015", "T016", "T017", "T018", "T019", "T022", "T023", "T024", "T026", "T028", "T029", "T030", "T031", "T032", "T033", "T035", "T039", "T040", "T043", "T049", "T050", "T051", "T056", "T060", "T061", "T062", "T063", "T065", "T066", "T067", "T068", "T072", "T073", "T074", "T075", "T076", "T077", "T078", "T079", "T082N", "T083", "T087", "T088", "T092", "T093", "T095", "T096", "T097"),
                                    class = "factor"),
               Leg = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 1L, 1L, 2L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L),
                               .Label = c("A-B", "B-C", "C-A", "C-D", "D-E", "E-F", "F-G", "G-H", "H-I", "I-J", "J-K"),
                               class = "factor")),
          .Names = c("Transect", "Leg"),
          row.names = c(NA, -18L),
          class = "data.frame")

数据类型

所有包含 6 个 Points 的 Transects 都是三角形的(每个顶点以及每条边的中点各有一个 Point),因此:

"Leg" == "A-B" contains "Points" == c(1,2,3)
"Leg" == "B-C" contains "Points" == c(3,4,5)
"Leg" == "C-A" contains "Points" == c(5,6,1)

所有其他 Transects 都是线性的,例如:

"Leg" == "A-B" contains "Points" == c(1,2,3)
"Leg" == "B-C" contains "Points" == c(3,4,5)
"Leg" == "C-D" contains "Points" == c(5,6,7)
"Leg" == "D-E" contains "Points" == c(7,8,9)  etc.

部分解决方案(所需结果示例)

通过将两个输入数据帧子集化为单个 Transects ,我已设法为每种类型的 Transects 实现所需的输出:

# Triangular sites: the Transect has exactly six Points (nrow(tmp) == 6).
# Leg i uses point 2i - 1 as its start, 2i as its midpoint and 2i + 1 as its
# end; the third leg closes the triangle, so its end is point 1 again.
tmp <- Points[Points$Transect == "T076", ]
tmp2 <- Leg[Leg$Transect == "T076", ]

for (i in seq_len(3)) {
  tmp2$Start_x[i] <- tmp$x[2 * i - 1]
  tmp2$Start_y[i] <- tmp$y[2 * i - 1]
  tmp2$Mid_x[i] <- tmp$x[2 * i]
  tmp2$Mid_y[i] <- tmp$y[2 * i]
  # (2 * i) %% 6 + 1 gives 3, 5, 1 for i = 1, 2, 3: the wrap back to point 1
  tmp2$End_x[i] <- tmp$x[(2 * i) %% 6 + 1]
  tmp2$End_y[i] <- tmp$y[(2 * i) %% 6 + 1]
}

# Straight-line sites: any Transect whose number of Points is not six.
# Leg i uses point 2i - 1 as its start, 2i as its midpoint and 2i + 1 as its
# end, so consecutive legs share an end/start point.
tmp <- Points[Points$Transect == "L2", ]
tmp2 <- Leg[Leg$Transect == "L2", ]

for (i in 1:round((nrow(tmp) - 1) / 2)) {
  tmp2$Start_x[i] <- tmp$x[2 * i - 1]
  tmp2$Start_y[i] <- tmp$y[2 * i - 1]
  tmp2$Mid_x[i] <- tmp$x[2 * i]
  tmp2$Mid_y[i] <- tmp$y[2 * i]
  tmp2$End_x[i] <- tmp$x[2 * i + 1]
  tmp2$End_y[i] <- tmp$y[2 * i + 1]
}

在我看来,应该可以组合使用 ddply、d_ply 等函数,按 Transect 拆分完整的数据框,对每个 Transect 应用相应的代码,并返回 Leg 数据框,其中为每个 Leg 新增“开始(Start)”、“中间(Mid)”和“结束(End)”的 x、y 坐标列。

但是我尝试这样做会返回错误,部分原因是:

a)我无法用 ifelse 把三角形 Transects(有 6 个点)与线性 Transects(点数任意)区分开来

b)我无法使plyr函数的组合正确。

返回错误的代码示例

library(plyr)
# Attempt that returns an error (kept exactly as posted). Problems visible in
# the code itself:
#   * `ifelse()` selects between the *values* of its `yes`/`no` arguments, but
#     both arguments here are `for` loops, and a `for` loop evaluates to NULL,
#     so there is no value for `ifelse()` to return. A scalar branch like this
#     needs `if (...) { ... } else { ... }`.
#   * The inner function's value is the `ifelse()` result, not the modified
#     `b`, so the new columns would never be returned to `ddply()`.
#   * `d_ply()` is the plyr variant that discards the function's output.
#   * The nesting applies every Transect group of `Leg.points` to every
#     Transect group of `BTVs` instead of pairing the matching Transects.
#   * `BTVs` and `Leg.points` are not defined in the snippets shown here;
#     presumably they are the full Points and Leg data frames -- TODO confirm.
d_ply(BTVs, "Transect", function(a)
  ddply(Leg.points, "Transect", function(b)
    ifelse(length(a$Point)==6,
           # when == 6 (i.e. triangular sites)
           for (i in 1:3) {
             b$Start_x[i] <- a$x[i+i-1]
             b$Start_y[i] <- a$y[i+i-1]
             b$Mid_x[i]   <- a$x[i+i]
             b$Mid_y[i]   <- a$y[i+i]
             b$End_x[i]   <- ifelse(i==3,
                                    a$x[1],
                                    a$x[i+i+1])
             # NOTE(review): the i == 3 branch below reads a$x[1]; a$y[1] was
             # presumably intended for a y coordinate.
             b$End_y[i]   <- ifelse(i==3,
                                    a$x[1],
                                    a$y[i+i+1])},

           # when != 6 (i.e. straight line sites)
           for (i in 1:round((length(a$Point)-1)/2)) {
             b$Start_x[i] <- a$x[i+i-1]
             b$Start_y[i] <- a$y[i+i-1]
             b$Mid_x[i]   <- a$x[i+i]
             b$Mid_y[i]   <- a$y[i+i]
             b$End_x[i]   <- a$x[i+i+1]
             b$End_y[i]   <- a$y[i+i+1]
           })))

有人可以帮忙吗?提前谢谢!

1 个答案:

答案 0 :(得分:1)

Start from the work that you have already done and wrap the loops in a function.

#' Start, mid and end coordinates of every Leg on one Transect
#'
#' Leg i of a Transect is the overlapping triple of Points 2i - 1 (start),
#' 2i (mid) and 2i + 1 (end). A Transect with exactly six Points is treated as
#' a closed triangle, so its third leg (C-A) ends back on Point 1.
#'
#' Bug fix: the closing leg's End_y used to be read from the x column, which
#' made End_y equal to End_x on every C-A leg (e.g. 38.53 instead of 4.23 for
#' "T076"). It now reads the y coordinate of Point 1.
#'
#' @param point Transect identifier to extract, e.g. "T076".
#' @param points_df Data frame with columns `Transect`, `x` and `y`, one row
#'   per Point. Rows of a Transect are assumed to be in Point order, because
#'   Points are picked by position. Defaults to the global `Points`.
#' @param legs_df Data frame with a `Transect` column, one row per Leg, rows
#'   of a Transect assumed to be in leg order. Defaults to the global `Leg`.
#' @return The rows of `legs_df` belonging to `point`, with the added columns
#'   `Start_x`, `Start_y`, `Mid_x`, `Mid_y`, `End_x` and `End_y`. A leg that
#'   refers to a Point position the Transect does not have gets `NA`; a
#'   Transect without legs yields a zero-row data frame.
start_mid_end <- function(point, points_df = Points, legs_df = Leg) {
  pts <- points_df[points_df$Transect == point, ]
  legs <- legs_df[legs_df$Transect == point, ]

  # Positions of the three Points of each leg, computed for all legs at once.
  # seq_len() keeps this safe when there are no legs (no 1:0 loop).
  leg_no <- seq_len(nrow(legs))
  start_idx <- 2 * leg_no - 1
  mid_idx <- start_idx + 1
  end_idx <- start_idx + 2

  if (nrow(pts) == 6) {
    # Triangular site: the third leg would end on a non-existent Point 7,
    # so wrap it round to Point 1 to close the triangle.
    end_idx[end_idx == 7] <- 1
  }

  # Column order matches the original output layout.
  legs$Start_x <- pts$x[start_idx]
  legs$Start_y <- pts$y[start_idx]
  legs$Mid_x <- pts$x[mid_idx]
  legs$Mid_y <- pts$y[mid_idx]
  legs$End_x <- pts$x[end_idx]
  legs$End_y <- pts$y[end_idx]
  legs
}

Now you can pass in a Transect name and it will return the start/mid/end breakdown of its legs:

start_mid_end("T076")
#  Transect Leg  Start_x Start_y    Mid_x   Mid_y    End_x    End_y
#1     T076 A-B 38.53000 4.23000 38.53409 4.22811 38.53818  4.22622
#2     T076 B-C 38.53818 4.22622 38.53396 4.22465 38.52984  4.22281
#3     T076 C-A 38.52984 4.22281 38.53006 4.22553 38.53000  4.23000

If you would like all of them at once:

points <- as.character(unique(Points$Transect))
do.call(rbind,lapply(points, start_mid_end))
#    Transect Leg  Start_x Start_y    Mid_x   Mid_y    End_x    End_y
# 1      T076 A-B 38.53000 4.23000 38.53409 4.22811 38.53818  4.22622
# 2      T076 B-C 38.53818 4.22622 38.53396 4.22465 38.52984  4.22281
# 3      T076 C-A 38.52984 4.22281 38.53006 4.22553 38.53000  4.23000
# 4      T075 A-B 38.45000 4.22000 38.44936 4.22445 38.44942  4.22897
# 5      T075 B-C 38.44942 4.22897 38.45324 4.22659 38.45743  4.22481
# 6      T075 C-A 38.45743 4.22481 38.45382 4.22239 38.45000  4.22000
# 7      S125 A-B 38.29102 5.37832 38.29013 5.37391 38.28935  5.36949
# 8      S109 A-B 37.77980 5.00680 37.78030 5.01126 37.78109  5.01570
# 9        L2 A-B 38.08238 4.95384 38.07932 4.95693 38.07534  4.95914
# 10       L2 B-C 38.07534 4.95914 38.07143 4.96122 38.06737  4.96315