如何在R中提取行块

时间:2014-07-26 10:49:05

标签: r dataframe

这是我的数据框的一个示例(原始数据约有 10 000 行)。我想根据 VariableC 提取行块:只保留位于两个 FALSE 之间的行,并且只保留行数不少于 10 的"块"(这些块随机分布在数据框中),其余行全部丢弃。换句话说,我想把数据框拆分成若干子数据框(即行块)。另一种做法是新建一列,用不同的数字或字母标记每个块。最终目标是对每个块绘制 VariableA ~ VariableB 的(回归)图,并提取每个块的回归系数和斜率。我知道最后一步怎么做,但找不到提取块的办法。

dput(DF)
structure(list(VariableA = c(-0.427796831, -0.985783635, 0.07381913, 
-0.788768923, 2.088999368, 1.634064399, -0.396180684, 1.242763624, 
-0.925287904, -1.127545153, -1.392674655, -0.988900906, -0.08007986, 
1.123984722, 0.698530819, -0.983565282, 0.568517376, -0.349446274, 
0.451443794, -0.525897224, -0.932426185, -1.026114049, -0.502973503, 
0.779152951, -0.636137726, -0.488850226, 0.281389897, -0.058183652, 
-0.490377469, 0.541441864, 0.101754052, -0.16701156, 0.830697787, 
0.383672008, 0.376444634, 0.377695822, -0.167281753, 0.85629382, 
0.213632586, -0.180474289, 1.008370316, -0.039110304, -0.498537412, 
-2.804652051, -0.308652164, -0.57234963, 0.599951896, 0.52484456, 
0.008141731, -0.355182154, -0.401441593, 1.201478908, 0.656311257, 
0.459034655), VariableB = c(-0.599169932, -0.874625086, -0.879367189, 
0.068133167, -0.800781757, -0.746429115, -0.231178499, -0.905456972, 
0.40165965, 0.664579078, -0.386614574, -0.700272577, 1.844891234, 
0.277616227, 0.560119708, -2.874313318, 0.835592571, -0.66310824, 
0.770336487, 1.547635124, -0.604065751, 1.009519877, -0.54792181, 
-0.904229067, -0.309270319, 0.16088111, 0.325712725, -0.931632811, 
-1.124531146, -0.24012375, -0.887921437, -1.531276383, 1.565233292, 
0.462452663, 0.836271408, -0.721959208, 1.92215585, 0.189964832, 
1.661140854, -1.604886269, -1.237132008, 0.811584528, -0.965798536, 
2.604504203, -1.124331258, 0.240004185, -0.34902354, -0.447056073, 
0.051475583, 0.159486311, -1.86620661, -1.671688795, -1.268626575, 
-1.734731137), VariableC = structure(c(11L, 19L, 9L, 36L, 36L, 
26L, 7L, 24L, 36L, 5L, 17L, 15L, 33L, 30L, 29L, 21L, 31L, 10L, 
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 8L, 
16L, 35L, 25L, 28L, 4L, 32L, 27L, 34L, 18L, 36L, 36L, 14L, 2L, 
13L, 3L, 36L, 23L, 22L, 1L, 20L, 6L, 36L, 12L), .Label = c("-0.019569584", 
"-0.020014785", "-0.033234545", "-0.034426339", "-0.046296608", 
"-0.047020989", "-0.062735918", "-0.078616739", "-0.080554806", 
"-0.101255451", "-0.102696676", "-0.127569648", "-0.143298342", 
"-0.146433595", "-0.168917348", "-0.169828794", "-0.177928923", 
"-0.178536056", "-0.186040872", "-0.22676482", "-0.38578786", 
"0.005961731", "0.007778849", "0.033730665", "0.084612467", "0.088763528", 
"0.104625865", "0.121271604", "0.125865053", "0.140160095", "0.140410995", 
"0.17548741", "0.176481137", "0.187477344", "0.239593108", "FALSE"
), class = "factor")), .Names = c("VariableA", "VariableB", "VariableC"
), class = "data.frame", row.names = c(NA, -54L))

2 个答案:

答案 0 :(得分:3)

这是一种方法:

# NOTE(review): this snippet expects the question's data frame to be available
# as `df` (e.g. after `df <- DF`) -- confirm before running.

# create indicator variable: a running count of the "FALSE" separator rows, so
# every row that follows the same separator gets the same block id
df$ind <- cumsum(df$VariableC == "FALSE")
# remove "FALSE" rows
df_sub <- df[df$VariableC != "FALSE", ]

# run a regression for each unique ind value
# lmList() is provided by lme4 (nlme ships one as well); MASS does not export
# it, so loading MASS here would leave lmList() undefined.
library(lme4)
lmList(VariableA ~ VariableB | ind, data = df_sub)

结果:

Call: lmList(formula = VariableA ~ VariableB | ind, data = df_sub) 
Coefficients:
   (Intercept)   VariableB
0  -0.40531670  0.05261483
2  -0.93213791 -2.80237922
3  -0.26593782  0.31197216
15  0.24240710  0.10646927
17 -0.92256481 -0.65475348
18  0.02793152 -0.22209490
19  0.45903466          NA

Degrees of freedom: 35 total; 21 residual
Residual standard error: 0.6656342

如何绘图?

# Scatter plot of VariableA against VariableB, drawn as one panel per block
# id (`ind`), with a fitted linear-regression line added to each panel.
library(ggplot2)
ggplot(data = df_sub, mapping = aes(VariableB, VariableA)) +
  facet_wrap(~ind) +
  geom_point() +
  geom_smooth(method = "lm")

enter image description here

答案 1 :(得分:1)

你可以这样做:

# Row positions of the "FALSE" separator rows. as.character() makes the
# comparison work whether VariableC is stored as a factor or as character.
falseIdx <- which(as.character(DF$VariableC) == "FALSE")

# Minimum number of rows a block must contain in order to be kept.
minRows <- 10

# at least 2 FALSE's must be present to enclose a block
if (length(falseIdx) >= 2) {
  # Pair every FALSE row with the FALSE row before it. seq_along(...)[-1]
  # yields 2..length(falseIdx), so the last pair is examined as well and the
  # index can never fall back to 1 (which would read falseIdx[0]).
  blocks <- lapply(seq_along(falseIdx)[-1], function(idx) {
    currFalse <- falseIdx[idx]
    prevFalse <- falseIdx[idx - 1]

    # we build a block only if it has at least minRows rows strictly between
    # the two separators
    if (currFalse - prevFalse - 1 >= minRows) {
      DF[(prevFalse + 1):(currFalse - 1), ]
    } else {
      NULL
    }
  })
  # remove nulls (pairs of separators that enclosed too few rows)
  blocks <- Filter(Negate(is.null), blocks)
} else {
  blocks <- list()
}

根据您的示例数据进行计算,blocks只包含一个data.frame

> blocks
[[1]]
    VariableA  VariableB    VariableC
31  0.1017541 -0.8879214 -0.078616739
32 -0.1670116 -1.5312764 -0.169828794
33  0.8306978  1.5652333  0.239593108
34  0.3836720  0.4624527  0.084612467
35  0.3764446  0.8362714  0.121271604
36  0.3776958 -0.7219592 -0.034426339
37 -0.1672818  1.9221558   0.17548741
38  0.8562938  0.1899648  0.104625865
39  0.2136326  1.6611409  0.187477344
40 -0.1804743 -1.6048863 -0.178536056