这是我的数据框的一个例子(原版有~10 000行)。我想基于VariableC
提取行块。我只想在FALSE
之间保留行,并且只保留行数至少为10的"块"(这些块随机分布在数据框中),丢弃其他行。换句话说,我想将我的数据框分成若干子数据框(即行块)。另一种方法是创建一个新列,为每个块分配一个单独的数字或字母。最终目标是为每个块绘制(回归)VariableA
和VariableB
,并提取每个块的回归和斜率系数。我知道如何做最后一部分,但我找不到如何提取块的解决方案。
dput(DF)
structure(list(VariableA = c(-0.427796831, -0.985783635, 0.07381913,
-0.788768923, 2.088999368, 1.634064399, -0.396180684, 1.242763624,
-0.925287904, -1.127545153, -1.392674655, -0.988900906, -0.08007986,
1.123984722, 0.698530819, -0.983565282, 0.568517376, -0.349446274,
0.451443794, -0.525897224, -0.932426185, -1.026114049, -0.502973503,
0.779152951, -0.636137726, -0.488850226, 0.281389897, -0.058183652,
-0.490377469, 0.541441864, 0.101754052, -0.16701156, 0.830697787,
0.383672008, 0.376444634, 0.377695822, -0.167281753, 0.85629382,
0.213632586, -0.180474289, 1.008370316, -0.039110304, -0.498537412,
-2.804652051, -0.308652164, -0.57234963, 0.599951896, 0.52484456,
0.008141731, -0.355182154, -0.401441593, 1.201478908, 0.656311257,
0.459034655), VariableB = c(-0.599169932, -0.874625086, -0.879367189,
0.068133167, -0.800781757, -0.746429115, -0.231178499, -0.905456972,
0.40165965, 0.664579078, -0.386614574, -0.700272577, 1.844891234,
0.277616227, 0.560119708, -2.874313318, 0.835592571, -0.66310824,
0.770336487, 1.547635124, -0.604065751, 1.009519877, -0.54792181,
-0.904229067, -0.309270319, 0.16088111, 0.325712725, -0.931632811,
-1.124531146, -0.24012375, -0.887921437, -1.531276383, 1.565233292,
0.462452663, 0.836271408, -0.721959208, 1.92215585, 0.189964832,
1.661140854, -1.604886269, -1.237132008, 0.811584528, -0.965798536,
2.604504203, -1.124331258, 0.240004185, -0.34902354, -0.447056073,
0.051475583, 0.159486311, -1.86620661, -1.671688795, -1.268626575,
-1.734731137), VariableC = structure(c(11L, 19L, 9L, 36L, 36L,
26L, 7L, 24L, 36L, 5L, 17L, 15L, 33L, 30L, 29L, 21L, 31L, 10L,
36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 36L, 8L,
16L, 35L, 25L, 28L, 4L, 32L, 27L, 34L, 18L, 36L, 36L, 14L, 2L,
13L, 3L, 36L, 23L, 22L, 1L, 20L, 6L, 36L, 12L), .Label = c("-0.019569584",
"-0.020014785", "-0.033234545", "-0.034426339", "-0.046296608",
"-0.047020989", "-0.062735918", "-0.078616739", "-0.080554806",
"-0.101255451", "-0.102696676", "-0.127569648", "-0.143298342",
"-0.146433595", "-0.168917348", "-0.169828794", "-0.177928923",
"-0.178536056", "-0.186040872", "-0.22676482", "-0.38578786",
"0.005961731", "0.007778849", "0.033730665", "0.084612467", "0.088763528",
"0.104625865", "0.121271604", "0.125865053", "0.140160095", "0.140410995",
"0.17548741", "0.176481137", "0.187477344", "0.239593108", "FALSE"
), class = "factor")), .Names = c("VariableA", "VariableB", "VariableC"
), class = "data.frame", row.names = c(NA, -54L))
答案 0 :(得分:3)
这是一种方法:
# Create an indicator variable: a running count of the "FALSE" markers seen so
# far, so all rows between two consecutive markers share the same `ind` value
# (rows before the first marker get 0).
# NOTE(review): `df` is presumably the question's data frame (named `DF` there) - confirm.
df$ind <- cumsum(df$VariableC == "FALSE")
# Remove the "FALSE" marker rows themselves; only the data rows are modelled
df_sub <- df[df$VariableC != "FALSE", ]
# Run a regression of VariableA on VariableB for each unique ind value.
# lmList() is not part of MASS; it is provided by lme4.
library(lme4)
lmList(VariableA ~ VariableB | ind, data = df_sub)
结果:
Call: lmList(formula = VariableA ~ VariableB | ind, data = df_sub)
Coefficients:
(Intercept) VariableB
0 -0.40531670 0.05261483
2 -0.93213791 -2.80237922
3 -0.26593782 0.31197216
15 0.24240710 0.10646927
17 -0.92256481 -0.65475348
18 0.02793152 -0.22209490
19 0.45903466 NA
Degrees of freedom: 35 total; 21 residual
Residual standard error: 0.6656342
如何绘图?
# Scatter plot of VariableA against VariableB, drawn in one panel per `ind`
# value, with a linear-model fit overlaid in every panel.
library(ggplot2)
ggplot(data = df_sub, mapping = aes(x = VariableB, y = VariableA)) +
  facet_wrap(~ ind) +
  geom_point() +
  geom_smooth(method = lm)
答案 1 :(得分:1)
你可以这样做:
# Split DF into "blocks": runs of rows lying strictly between two consecutive
# "FALSE" markers in VariableC. Only runs of at least 10 rows are kept.
# Result: `blocks`, an unnamed list of data frames (empty list if none qualify).

# Row positions of the "FALSE" markers (as.character() because VariableC is a factor)
falseIdx <- which(as.character(DF$VariableC) == "FALSE")
# at least 2 FALSE's must be present to delimit a block
if (length(falseIdx) >= 2) {
  # Walk over every pair of consecutive markers (i, i + 1). seq_len() covers
  # all pairs, including the last one, and is safe when there are exactly two
  # markers (a descending `2:1` sequence would index falseIdx[0]).
  blocks <- lapply(seq_len(length(falseIdx) - 1), function(i) {
    prevFalse <- falseIdx[i]
    currFalse <- falseIdx[i + 1]
    # we build a block only if it has at least 10 rows between the markers
    if (currFalse - prevFalse - 1 >= 10) {
      DF[(prevFalse + 1):(currFalse - 1), , drop = FALSE]
    } else {
      NULL
    }
  })
  # remove the NULL placeholders left by gaps that were too short
  blocks <- Filter(Negate(is.null), blocks)
} else {
  blocks <- list()
}
根据您的示例数据进行计算,blocks
只包含一个data.frame
:
> blocks
[[1]]
VariableA VariableB VariableC
31 0.1017541 -0.8879214 -0.078616739
32 -0.1670116 -1.5312764 -0.169828794
33 0.8306978 1.5652333 0.239593108
34 0.3836720 0.4624527 0.084612467
35 0.3764446 0.8362714 0.121271604
36 0.3776958 -0.7219592 -0.034426339
37 -0.1672818 1.9221558 0.17548741
38 0.8562938 0.1899648 0.104625865
39 0.2136326 1.6611409 0.187477344
40 -0.1804743 -1.6048863 -0.178536056