df_data是一个包含40 000行和30列的数据框。我在for循环中尝试添加许多列(大约600列),但运行速度非常慢。我也尝试过使用apply等替代方法,却无法得到相同的结果。是否可以一次性设置一行的所有值?
# Spread the strictly positive VAL1..VAL7 values of each row of `df_data`
# into dynamically named columns of `df_i_final`, at the same row index.
# Target column names are <prefix>_<CATEGORIE>_<RANK>_<MD>_<suffix>, where
# prefix/suffix depend on which VAL column the value came from.
value_cols <- paste0("VAL", 1:7)
name_prefix <- c("1", "2", "3", "4", "5", "aa", "aa")
name_suffix <- c("a", "b", "c", "d", "e", "fg", "fg")
# NOTE(review): VAL6 and VAL7 map to the same "aa_..._fg" column, so a
# positive VAL7 overwrites VAL6 within a row -- confirm this is intended.

df_i_final <- data.frame()
start <- Sys.time()
# seq_len() rather than 1:nrow(): an empty df_data runs zero iterations
# instead of iterating over c(1, 0).
for (i in seq_len(nrow(df_data))) {
  row <- df_data[i, ]

  # `[[` extracts plain scalars. isTRUE() treats NA as "not positive", so a
  # missing value is skipped instead of aborting the whole loop.
  is_positive <- vapply(
    value_cols,
    function(col) isTRUE(row[[col]] > 0),
    logical(1)
  )

  if (any(is_positive)) {
    df_i_final[i, "IDENT"] <- row[["IDENT"]]

    # Shared middle part of every target column name for this row. Pasting
    # the extracted vectors (not one-column data frames) yields factor
    # labels rather than their integer codes.
    key <- paste(row[["CATEGORIE"]], row[["RANK"]], row[["MD"]], sep = "_")

    # Ascending k keeps the original write order (VAL1 first, VAL7 last).
    for (k in which(is_positive)) {
      varname <- paste(name_prefix[k], key, name_suffix[k], sep = "_")
      df_i_final[i, varname] <- row[[value_cols[k]]]
    }
  }
}
process_time <- Sys.time() - start
print(format(process_time))
编辑:我添加一些示例,然后尝试使用您的代码来创建列,但是当它设置值时,它应该只更新具有相同类别名称的列...
# Small reproducible sample: one row per ID/CATEGORIE pair plus measures.
df2 <- data.frame(
  ID = c("1100455", "1100455", "1100455", "1100455", "1100455", "1100464", "1100464"),
  CATEGORIE = c("10110", "10160", "10604", "11220", "90310", "10110", "10140"),
  RANK = c("1", "1", "1", "1", "0", "1", "1"),
  MD = c("0", "0", "0", "3", "4", "0", "0"),
  PROD3 = c(2345.00, 1114.58, 501.40, 0.00, 0.00, 2720.00, 636.80),
  VALUE3 = c(540.00, 0.00, 0.00, 0.00, 0.00, 0.00, 0.00),
  AREA3 = c(563.76, 0.00, 17.35, 0.00, 0.00, 0.00, 0.00),
  LONG3 = c(4100, 2100, 1740, 265, 0, 3978, 940)
)

# Source column(s) whose values are copied into the new columns.
nm1 <- "PROD3"
nm1

# TRUE for rows where at least one source column is strictly positive.
i1 <- Reduce(`|`, lapply(df2[nm1], function(column) column > 0))

# One new column per distinct CATEGORIE/RANK/MD combination.
newvars <- unique(
  paste("aa", df2$CATEGORIE, df2$RANK, df2$MD, "ta", sep = "_")
)
newvars

# Create the new columns empty, then fill the selected rows.
df2[newvars] <- NA
# The single source vector is recycled across every column in `newvars`,
# so each selected row receives its PROD3 value in all of the new columns.
df2[i1, newvars] <- df2[i1, nm1]
df2
在这里,所有类别名称不同的新列都被更新了,而实际上应该只更新与该行类别值匹配的那一列(所以第一行应该只更新aa_10110_1_0_ta):
ID CATEGORIE RANK MD PROD3 VALUE3 AREA3 LONG3 aa_10110_1_0_ta aa_10160_1_0_ta aa_10604_1_0_ta aa_11220_1_3_ta aa_90310_0_4_ta aa_10140_1_0_ta
1 1100455 10110 1 0 2345.00 540 563.76 4100 2345.00 2345.00 2345.00 2345.00 2345.00 2345.00
答案 0 :(得分:2)
我们可以将其向量化。用lapply遍历感兴趣的列,检查各值是否大于0,再用Reduce和`|`将结果归约为单个逻辑向量,然后用它来创建带有相应值的新列:
# Vectorised alternative to a row-by-row loop: every strictly positive VALk
# is written only into the column named after that row's own
# CATEGORIE/RANK/MD key, instead of into every new column.
# NOTE(review): df_i_final is used here as the input frame and must already
# contain VAL1..VAL7, CATEGORIE, RANK and MD -- confirm against the caller.
# NOTE(review): assumes the VAL columns are numeric -- confirm.
nm1 <- paste0("VAL", 1:7)
# Name parts per VAL column: <prefix>_<CATEGORIE>_<RANK>_<MD>_<suffix>.
name_prefix <- c("1", "2", "3", "4", "5", "aa", "aa")
name_suffix <- c("a", "b", "c", "d", "e", "fg", "fg")

# Per-column masks of strictly positive values (NA counts as not positive),
# OR-ed together into one flag per row.
positive <- lapply(df_i_final[nm1], function(v) !is.na(v) & v > 0)
i1 <- Reduce(`|`, positive)

# (row, col) positions of the cells to copy, sorted row by row so that new
# columns appear in order of first use and, where two VAL columns share a
# target name, the higher-numbered one is written last.
cells <- which(do.call(cbind, positive), arr.ind = TRUE)
cells <- cells[order(cells[, "row"], cells[, "col"]), , drop = FALSE]

key <- paste(
  df_i_final[["CATEGORIE"]], df_i_final[["RANK"]], df_i_final[["MD"]],
  sep = "_"
)
# Target column name for each cell, built from that cell's own row key.
target <- paste(
  name_prefix[cells[, "col"]], key[cells[, "row"]], name_suffix[cells[, "col"]],
  sep = "_"
)
newvars <- unique(target)

if (any(i1)) {
  vals <- as.matrix(df_i_final[nm1])
  # Fill a plain matrix by (row, column) index pairs, then attach it, because
  # data frame replacement does not accept a numeric index matrix.
  filled <- matrix(NA, nrow = nrow(df_i_final), ncol = length(newvars))
  filled[cbind(cells[, "row"], match(target, newvars))] <- vals[cells]
  df_i_final[newvars] <- as.data.frame(filled)
}