在R中每隔n列交错合并两个数据框的列

时间:2012-03-01 10:59:47

标签: bash r

我有一个数据框,例如:

lat lon var01 var02 var03 var04 var11 var12 var13 var14 ...

和另一个像:

lat lon var05 var15 var25 ...

所需的输出是:

lat lon var01 var02 var03 var04 var05 var11 var12 var13 var14 var15 ...

我认为这在R中应该很容易,但到目前为止我还没有找到任何办法。我也愿意接受其他语言的解决方案,比如bash。另外,我希望解决方案只有几行代码;用循环等方式我自己知道该怎么做。

提前致谢

编辑:以下解决方案要求正确命名列。想象一下以下情况:

## d1 holds two variables per group (V11, V12 and V21, V22); d2 holds one
## further variable per group under a different prefix (A13, A23)
d1 <- data.frame(
    lat = 1:10, lon = 1:10,
    V11 = runif(10), V12 = rnorm(10),
    V21 = runif(10), V22 = rnorm(10)
)
d2 <- data.frame(
    lat = 1:10, lon = 1:10,
    A13 = runif(10), A23 = rnorm(10)
)

## merge on the columns the two frames share (lat, lon)
res <- merge(d1, d2, by = c("lat", "lon"), sort = FALSE)

## keep the first two columns in front, then sort the rest by column name
res <- res[, c(1:2, 2 + order(colnames(res)[-(1:2)]))]

输出

lat lon        A13        A23        V11        V12        V21        V22
 10  10 0.21269952  0.2670988 0.87532133 -0.6887557 0.60493329 -0.1350546
  1   1 0.61464497 -0.5686687 0.91287592 -0.4149946 0.23962942  0.3981059
  2   2 0.55715954 -0.1351786 0.29360337 -0.3942900 0.05893438 -0.6120264
  3   3 0.32877732  1.1780870 0.45906573 -0.0593134 0.64228826  0.3411197
  4   4 0.45313145 -1.5235668 0.33239467  1.1000254 0.87626921 -1.1293631
  5   5 0.50044097  0.5939462 0.65087047  0.7631757 0.77891468  1.4330237
  6   6 0.18086636  0.3329504 0.25801678 -0.1645236 0.79730883  1.9803999  
  7   7 0.52963060  1.0630998 0.47854525 -0.2533617 0.45527445 -0.3672215
  8   8 0.07527575 -0.3041839 0.76631067  0.6969634 0.41008408 -1.0441346
  9   9 0.27775593  0.3700188 0.08424691  0.5566632 0.81087024  0.5697196

并且所需的输出是:

lat lon V11 V12 A13 V21 V22 A23

3 个答案:

答案 0 :(得分:4)

merge()是适合这项工作的工具。这是一个例子:

## reproducible example data; both frames share the `lat` and `lon` columns
set.seed(1)
d1 <- data.frame(
    lat = 1:10, lon = 1:10,
    V2 = runif(10), V4 = rnorm(10)
)
d2 <- data.frame(
    lat = 1:10, lon = 1:10,
    V1 = runif(10), V3 = rnorm(10)
)

## join on `lat` and `lon`; `sort = FALSE` stops R reordering rows
res <- merge(d1, d2, by = c("lat", "lon"), sort = FALSE)

## keep the two key columns first, then the remaining columns sorted by name
res <- res[, c(1:2, 2 + order(colnames(res)[-(1:2)]))]

给出了:

> res
   lat lon        V1         V2          V3         V4
1    1   1 0.4820801 0.26550866  0.91897737 -0.8204684
2    2   2 0.5995658 0.37212390  0.78213630  0.4874291
3    3   3 0.4935413 0.57285336  0.07456498  0.7383247
4    4   4 0.1862176 0.90820779 -1.98935170  0.5757814
5    5   5 0.8273733 0.20168193  0.61982575 -0.3053884
6    6   6 0.6684667 0.89838968 -0.05612874  1.5117812
7    7   7 0.7942399 0.94467527 -0.15579551  0.3898432
8    8   8 0.1079436 0.66079779 -1.47075238 -0.6212406
9    9   9 0.7237109 0.62911404 -0.47815006 -2.2146999
10  10  10 0.4112744 0.06178627  0.41794156  1.1249309

根据修订版Q进行更新:

## dummy data
set.seed(1)
df3 <- data.frame(matrix(runif(60), ncol = 6))
names(df3) <- paste("df3Var", 1:6, sep = "")
df3 <- cbind.data.frame(lat = 1:10, lon = 1:10, df3)
df4 <- data.frame(matrix(runif(30), ncol = 3))
names(df4) <- paste("df4Var", 1:3, sep = "")
df4 <- cbind.data.frame(lat = 1:10, lon = 1:10, df4)

## merge
res2 <- merge(df3, df4, sort = FALSE)

这给出了:

> head(res2)
  lat lon   df3Var1   df3Var2   df3Var3   df3Var4   df3Var5    df3Var6
1   1   1 0.2655087 0.2059746 0.9347052 0.4820801 0.8209463 0.47761962
2   2   2 0.3721239 0.1765568 0.2121425 0.5995658 0.6470602 0.86120948
3   3   3 0.5728534 0.6870228 0.6516738 0.4935413 0.7829328 0.43809711
4   4   4 0.9082078 0.3841037 0.1255551 0.1862176 0.5530363 0.24479728
5   5   5 0.2016819 0.7698414 0.2672207 0.8273733 0.5297196 0.07067905
6   6   6 0.8983897 0.4976992 0.3861141 0.6684667 0.7893562 0.09946616
    df4Var1   df4Var2   df4Var3
1 0.9128759 0.3390729 0.4346595
2 0.2936034 0.8394404 0.7125147
3 0.4590657 0.3466835 0.3999944
4 0.3323947 0.3337749 0.3253522
5 0.6508705 0.4763512 0.7570871
6 0.2580168 0.8921983 0.2026923
> names(res2)
 [1] "lat"     "lon"     "df3Var1" "df3Var2" "df3Var3" "df3Var4" "df3Var5"
 [8] "df3Var6" "df4Var1" "df4Var2" "df4Var3"

好的,现在请注意列的顺序。假设我们希望每取df3中的2个变量,就接着取df4中的1个变量,并且df3和df4各自的变量在其内部保持原有顺序。为此,我们需要创建一个索引向量ord:

> ord
[1] 1 2 7 3 4 8 5 6 9

然后我们加上2(以跳过合并数据框中的lat和lon两列):

> ord + 2
[1]  3  4  9  5  6 10  7  8 11

一旦明确了所需的序列,我们只需要利用R的向量化工具和一点点数学来生成它。我分两个阶段建立索引: i)首先确定合并数据框的第(1:6) + 2列(即来自df3的列)应该放在ord中的哪些位置,然后ii)用合并数据框中来自第二个数据框的列的索引填充ord中剩余的空位。

## one slot per value column of the merged frame (lat/lon occur in both inputs)
ord <- numeric(length = sum(ncol(df3), ncol(df4)) - 4)
ngrps <- 3    ## number of groups in the interleaved result
ningrps <- 2  ## df3 columns per group; each group ends with one df4 column
## i) slots for the df3 columns: positions 1..ningrps inside each group, with
##    consecutive groups starting (ningrps + 1) slots apart
want <- rep(seq_len(ningrps), ngrps) +
    rep(seq(from = 0, by = ningrps + 1, length.out = ngrps),
        each = ningrps)
ord[want] <- seq_len(prod(ngrps, ningrps))
## ii) slots for the df4 columns: the last position of every group, filled
##     with the trailing ngrps column indices of the merged frame
want <- (ningrps + 1) * seq_len(ngrps)
ord[want] <- seq(to = sum(ncol(df3), ncol(df4)) - 4, by = 1,
                 length.out = ngrps)
res3 <- res2[, c(1:2, ord + 2)]

这给出了:

> head(res3)
  lat lon   df3Var1   df3Var2   df4Var1   df3Var3   df3Var4   df4Var2   df3Var5
1   1   1 0.2655087 0.2059746 0.9128759 0.9347052 0.4820801 0.3390729 0.8209463
2   2   2 0.3721239 0.1765568 0.2936034 0.2121425 0.5995658 0.8394404 0.6470602
3   3   3 0.5728534 0.6870228 0.4590657 0.6516738 0.4935413 0.3466835 0.7829328
4   4   4 0.9082078 0.3841037 0.3323947 0.1255551 0.1862176 0.3337749 0.5530363
5   5   5 0.2016819 0.7698414 0.6508705 0.2672207 0.8273733 0.4763512 0.5297196
6   6   6 0.8983897 0.4976992 0.2580168 0.3861141 0.6684667 0.8921983 0.7893562
     df3Var6   df4Var3
1 0.47761962 0.4346595
2 0.86120948 0.7125147
3 0.43809711 0.3999944
4 0.24479728 0.3253522
5 0.07067905 0.7570871
6 0.09946616 0.2026923

这就是您想要的顺序。现在我们可以把它写成一个小函数:

## Merge two data frames and interleave their value columns.
##
## After merging, the value columns that come first in the merged frame are
## arranged in `ngrps` groups of `ningrps` columns each, and every group is
## followed by one of the trailing `ngrps` columns of the merged frame.
##
## Arguments:
##   x, y     data frames passed to merge(). The first two columns of the
##            merged result are kept in front (assumed to be the key
##            columns, e.g. lat and lon).
##   ngrps    number of groups, i.e. number of trailing columns interleaved.
##   ningrps  number of leading value columns per group.
##   ...      further arguments forwarded to merge(), e.g. `sort = FALSE`.
##
## Returns the merged data frame with its columns reordered. Stops with an
## error if the number of value columns is not ngrps * (ningrps + 1).
myMerge <- function(x, y, ngrps, ningrps, ...) {
    out <- merge(x, y, ...)
    ncols <- ncol(out) - 2
    ## each group takes ningrps slots plus one slot for the interleaved column
    stride <- ningrps + 1
    if (ncols != ngrps * stride) {
        stop("merged data has ", ncols, " value columns, but ngrps = ", ngrps,
             " and ningrps = ", ningrps, " require ", ngrps * stride,
             call. = FALSE)
    }
    ord <- numeric(length = ncols)
    ## slots 1..ningrps within each group, groups starting `stride` apart
    x_slots <- rep(seq_len(ningrps), ngrps) +
        rep(seq(from = 0, by = stride, length.out = ngrps), each = ningrps)
    ord[x_slots] <- seq_len(ngrps * ningrps)
    ## the last slot of every group receives one of the trailing columns
    y_slots <- stride * seq_len(ngrps)
    ord[y_slots] <- seq(to = ncols, by = 1, length.out = ngrps)
    out[, c(1:2, ord + 2)]
}

在上面的df3和df4上使用时,会给出:

> myMerge(df3, df4, ngrps = 3, ningrps = 2, sort = FALSE)
   lat lon    df3Var1   df3Var2    df4Var1    df3Var3   df3Var4   df4Var2
1    1   1 0.26550866 0.2059746 0.91287592 0.93470523 0.4820801 0.3390729
2    2   2 0.37212390 0.1765568 0.29360337 0.21214252 0.5995658 0.8394404
3    3   3 0.57285336 0.6870228 0.45906573 0.65167377 0.4935413 0.3466835
4    4   4 0.90820779 0.3841037 0.33239467 0.12555510 0.1862176 0.3337749
5    5   5 0.20168193 0.7698414 0.65087047 0.26722067 0.8273733 0.4763512
6    6   6 0.89838968 0.4976992 0.25801678 0.38611409 0.6684667 0.8921983
7    7   7 0.94467527 0.7176185 0.47854525 0.01339033 0.7942399 0.8643395
8    8   8 0.66079779 0.9919061 0.76631067 0.38238796 0.1079436 0.3899895
9    9   9 0.62911404 0.3800352 0.08424691 0.86969085 0.7237109 0.7773207
10  10  10 0.06178627 0.7774452 0.87532133 0.34034900 0.4112744 0.9606180
     df3Var5    df3Var6   df4Var3
1  0.8209463 0.47761962 0.4346595
2  0.6470602 0.86120948 0.7125147
3  0.7829328 0.43809711 0.3999944
4  0.5530363 0.24479728 0.3253522
5  0.5297196 0.07067905 0.7570871
6  0.7893562 0.09946616 0.2026923
7  0.0233312 0.31627171 0.7111212
8  0.4772301 0.51863426 0.1216919
9  0.7323137 0.66200508 0.2454885
10 0.6927316 0.40683019 0.1433044

这同样是你想要的结果。您可以进一步改进函数定义,这样就不需要同时指定ngrps和ningrps,因为可以根据其中一个以及df3的列数减2推算出另一个。但我会将其作为读者的练习。

答案 1 :(得分:1)

另一个建议使用的函数是cbind()。您可以指定要与另一个数据框组合的列数以及具体是哪些列。请查看帮助页面中的示例: cbind help page

答案 2 :(得分:0)

您可以将最后一行修改为:

## order the value columns by their names with the first run of letters
## removed, so that e.g. V11, V12, A13 are compared as "11", "12", "13"
res <- res[, c(1:2, order(sub("[[:alpha:]]+", "", colnames(res[, -(1:2)]))) + 2)]

这样(现在)就可以处理由多个字母字符组成的前缀模式。如果您的模式更复杂,那么您需要提供一个示例来说明其复杂程度。正则表达式解决方案可以轻松去掉所有前导字母或所有字母字符,但我们确实需要知道实际问题到底有多复杂。