在每个非 NA 值处将数据框拆分为子集

时间:2015-03-13 19:13:39

标签: r split subset apply na

  1. 我希望根据列 "Height" 将我的数据框拆分为子集。每个子集中有一行具有数值,另有 0 到任意多行为 NA。
  2. 这样之后就可以对各子集应用函数,具体来说:按 "Diameter" 的值对行排序、统计每个子集中的行数、计算 sqrt(sum(Diameter ^ 2)) 等。
  3. 1

    我的数据框如下所示:

    > dput(df[1:300, c("IDD", "Height", "Diameter")])
    structure(list(IDD = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 
    13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 
    29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 
    45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 
    61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 
    77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 
    93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 
    107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 
    120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 
    133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 
    146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 
    159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 
    172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 
    185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 
    198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 
    211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 
    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 
    237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 
    250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 
    263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 
    276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 
    289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300), 
        Height = c(6.7, 4.7, 2.2, NA, 1.3, NA, 1.15, 2.2, 2.5, NA, 
        5.25, NA, 1.8, 3.7, 1.3, 1.85, 2.2, NA, NA, 1.7, 2.6, 2.4, 
        NA, NA, 4.3, 2.25, NA, NA, NA, 5.1, 2.8, NA, 4, 5.8, 6.2, 
        NA, NA, NA, 5.8, 5.2, 2.45, NA, 1.25, 1.3, NA, 1.7, NA, NA, 
        4.4, 1.9, 4.4, NA, NA, NA, 8.5, 1.2, NA, 3.1, NA, 1.1, 1.1, 
        3.7, 5.5, NA, NA, 7.5, 2.4, 1.2, NA, 2.1, NA, 1.3, NA, NA, 
        1.2, 1.8, 6.2, NA, NA, NA, 1.5, 1.9, 1.15, 2.6, NA, 2.8, 
        1.7, 2.4, NA, NA, 5.2, 2.6, 1.3, 2.25, NA, 1.8, NA, 2, NA, 
        2.15, 1.9, NA, NA, NA, NA, NA, NA, 2.2, NA, 7.6, NA, 1.4, 
        NA, 2.8, 1.15, 4, 2.1, NA, NA, 4.2, 1.1, 4.7, 2.5, NA, 6.2, 
        2.4, 3.8, 2.2, 2.8, 7.8, 2.4, 1.3, 2.2, 3.2, 4.2, 3, NA, 
        NA, 3.7, 2.6, NA, 1.8, NA, NA, 2.3, NA, NA, NA, NA, 5.6, 
        2.3, 1.8, 3.6, NA, NA, NA, 1.7, NA, 4.2, 3.1, NA, NA, 3, 
        6, NA, 2.4, NA, NA, NA, 4.4, NA, 4.1, 2.3, 6.2, 2.4, NA, 
        NA, 3, 2.4, NA, 4.3, NA, NA, NA, 5.7, 1.6, NA, NA, 2.6, NA, 
        NA, 2.3, NA, NA, NA, 2, 1.5, NA, 1.15, NA, NA, 1.8, 2.6, 
        1.6, 2.25, 1.25, NA, 1.3, NA, 1.15, 1.1, 1.2, 2.2, NA, 1.35, 
        NA, 1.1, 1.4, NA, NA, 3.7, 1.9, 7.1, 2.15, 1.8, 2.7, 6.8, 
        NA, NA, NA, 2.05, 1.9, 3.4, 3.4, 1.7, 1.7, 1.5, 1.3, 2.5, 
        1.05, 1.05, NA, 1.1, 1.1, 1.5, 1.55, 1.25, 1.1, 1, NA, 2.1, 
        4.4, 1.8, 1.4, NA, 2.6, 1.1, NA, 1.75, NA, NA, 1.2, NA, 2.7, 
        NA, 4.6, NA, NA, 3.9, NA, NA, 4.3, 1.6, NA, 1.5, NA, 3.9, 
        3.2, 2.6, NA, 4.6, 2.65, NA, 1.75, NA, 1.2, 2.15, 1.65, NA, 
        NA, 2.25, 1.1, 1.55, 1.35, NA, 1.2, 2.5, 1.2, 6.1, 1.8), 
        Diameter = c(7.480282325, 4.774648293, 2.387324146, 1.432394488, 
        1.909859317, 1.909859317, 1.273239545, 1.909859317, 1.750704374, 
        1.750704374, 4.13802852, 1.591549431, 2.228169203, 3.660563691, 
        1.432394488, 2.06901426, 2.387324146, 0.795774715, 13.52817016, 
        1.432394488, 2.06901426, 2.387324146, 1.432394488, 1.273239545, 
        3.501408748, 2.387324146, 2.06901426, 1.432394488, 1.273239545, 
        3.501408748, 2.228169203, 1.750704374, 2.864788976, 4.13802852, 
        6.047887837, 4.456338407, 2.546479089, 1.591549431, 3.978873577, 
        3.660563691, 2.387324146, 1.750704374, 2.06901426, 1.432394488, 
        1.750704374, 1.750704374, 1.273239545, 1.273239545, 3.183098862, 
        1.909859317, 3.660563691, 1.909859317, 1.273239545, 1.114084602, 
        8.116902098, 1.591549431, 1.432394488, 2.228169203, 1.909859317, 
        1.432394488, 1.432394488, 3.342253805, 3.342253805, 3.183098862, 
        3.342253805, 5.092958179, 2.06901426, 1.432394488, 1.432394488, 
        1.750704374, 1.273239545, 1.591549431, 1.114084602, 22.75915686, 
        1.750704374, 3.023943919, 5.729577951, 2.705634033, 17.66619868, 
        10.50422624, 1.432394488, 2.228169203, 1.114084602, 1.909859317, 
        1.750704374, 2.864788976, 1.591549431, 2.228169203, 0.954929659, 
        1.114084602, 4.774648293, 1.909859317, 1.114084602, 2.387324146, 
        2.037183272, 1.591549431, 1.432394488, 1.591549431, 1.273239545, 
        2.228169203, 1.909859317, 1.273239545, 1.273239545, 1.591549431, 
        1.273239545, 1.273239545, 0.954929659, 2.546479089, 3.023943919, 
        13.05070533, 5.570423008, 1.591549431, 1.273239545, 2.546479089, 
        1.591549431, 3.023943919, 2.06901426, 1.432394488, 10.3450713, 
        3.342253805, 1.750704374, 5.092958179, 2.705634033, 2.228169203, 
        7.161972439, 1.782535363, 3.023943919, 1.909859317, 1.432394488, 
        6.525352667, 2.387324146, 1.273239545, 1.909859317, 2.06901426, 
        3.501408748, 2.705634033, 1.273239545, 1.273239545, 3.501408748, 
        1.909859317, 1.432394488, 1.909859317, 1.114084602, 1.432394488, 
        2.06901426, 12.5732405, 1.432394488, 21.8042272, 24.19155135, 
        6.843662553, 2.228169203, 2.06901426, 3.342253805, 1.909859317, 
        1.432394488, 2.06901426, 2.228169203, 1.750704374, 3.342253805, 
        2.864788976, 1.273239545, 1.750704374, 2.705634033, 5.729577951, 
        2.06901426, 2.705634033, 1.750704374, 1.591549431, 1.591549431, 
        2.864788976, 1.114084602, 2.705634033, 2.228169203, 5.252113122, 
        2.546479089, 0.954929659, 1.591549431, 2.06901426, 2.228169203, 
        1.273239545, 3.501408748, 0.795774715, 1.273239545, 1.273239545, 
        5.252113122, 1.591549431, 1.432394488, 1.432394488, 2.705634033, 
        1.432394488, 1.591549431, 2.546479089, 1.273239545, 1.432394488, 
        1.432394488, 2.06901426, 1.591549431, 1.432394488, 1.750704374, 
        1.273239545, 1.273239545, 1.909859317, 2.546479089, 0.954929659, 
        2.705634033, 2.06901426, 0.954929659, 1.114084602, 1.273239545, 
        1.273239545, 1.273239545, 1.273239545, 1.909859317, 1.432394488, 
        1.273239545, 1.273239545, 1.909859317, 1.750704374, 5.252113122, 
        1.273239545, 3.501408748, 2.546479089, 7.161972439, 2.228169203, 
        1.909859317, 2.387324146, 4.456338407, 1.591549431, 3.501408748, 
        1.273239545, 1.750704374, 1.909859317, 2.705634033, 3.342253805, 
        1.909859317, 1.750704374, 2.06901426, 2.228169203, 2.546479089, 
        1.273239545, 1.750704374, 0.954929659, 1.591549431, 1.591549431, 
        2.06901426, 1.750704374, 1.591549431, 1.273239545, 1.273239545, 
        1.432394488, 1.909859317, 3.660563691, 2.228169203, 1.750704374, 
        1.273239545, 2.546479089, 2.864788976, 1.114084602, 1.273239545, 
        1.432394488, 1.273239545, 1.750704374, 1.273239545, 2.705634033, 
        0.954929659, 3.501408748, 1.750704374, 1.591549431, 3.023943919, 
        1.909859317, 1.591549431, 3.183098862, 1.750704374, 0.795774715, 
        1.591549431, 1.432394488, 3.501408748, 2.546479089, 2.864788976, 
        2.546479089, 4.13802852, 2.705634033, 2.546479089, 2.387324146, 
        1.591549431, 1.273239545, 2.705634033, 2.387324146, 1.273239545, 
        1.114084602, 2.705634033, 1.591549431, 1.591549431, 1.432394488, 
        1.273239545, 1.591549431, 2.387324146, 1.432394488, 4.933803236, 
        1.909859317)), .Names = c("IDD", "Height", "Diameter"), row.names = c(NA, 
    300L), class = "data.frame")
    

    受到问题 "Select last non-NA column of a list of dataframes" 的回答启发,我尝试编写了以下代码:

    # Attempt to slice Height from the first non-NA value up to one position
    # past the second non-NA value.
    # NOTE(review): apply(..., 1, ...) calls the function once per row, and
    # df["Height"] has a single column, so `r` holds one value per call;
    # which(!is.na(r))[2] is then always NA, which is consistent with the
    # "NA/NaN argument" error reported for the `:` range.
    ind <- apply(df["Height"], 1, function(r) {
         r[ (which(!is.na(r))[1]) : (which(!is.na(r))[2]+1) ] } )
    
    然而,这段代码报错:error in (which(!is.na(r))[1]):(which(!is.na(r))[2] + 1) : NA/NaN Argument

    于是我改用 tapply,结果得到错误 undefined columns selected —— 但 df["Height"] 明明是一个已定义的列,还是我弄错了?我之所以使用 tapply,是因为我觉得这可能是数据框/向量的类型问题。

    2

    得到子集之后,我不知道该如何应用这些函数。具体来说,我不知道如何按 desc(Diameter) 对每个子集的行进行排序。至于茎干数量和平均直径,下面的代码应该可行,对吗?

    # Per (ID, class) group: count the rows (stems) and compute
    # sqrt(sum(Diameter^2)).
    # NOTE(review): the sample data shown above has only IDD, Height and
    # Diameter; `ID` and `class` are presumably expected to exist on `ind`
    # before this call — confirm.
    ddply(ind, .(ID, class), summarise,
        stems = length(ID),
        avg_diameter = sqrt(sum((Diameter)^2)))
    

    感谢您的建议!

1 个答案:

答案 0 :(得分:2)

我认为

library(plyr)

# Every non-NA Height opens a new group; the NA rows that follow it
# share the same running ID.
df$ID <- cumsum(!is.na(df$Height))

# Per group: number of rows (stems) and the square root of the summed
# squared diameters.
dfsum <- ddply(
  df, "ID", summarise,
  stems = length(ID),
  avg_diameter = sqrt(sum(Diameter^2))
)
head(dfsum)
##   ID stems avg_diameter
## 1  1     1     7.480282
## 2  2     1     4.774648

应该可行吧?

要将每个子集的行按 desc(Diameter) 排序,可以使用:

# Sort the rows inside each ID group by decreasing Diameter.
ddply(df, "ID", function(grp) arrange(grp, desc(Diameter)))