假设我有一个如下数据框,其中包含类(低,中,高)和一些值。因此,在第一行中,0.7和0.25是高的几率,0.99是中等的几率,0.11是低的几率。
col1 col2 col3 col4 col5 col6 col7 col8
1 High 0.7 Medium 0.99 High 0.25 Low 0.11
2 Low 0.9 Low 0.19 Low 0.29 Low 0.49
3 High 0.6 High 0.16 Low 0.46 Medium 0.63
4 High 0.8 Low 0.71 Low 0.28 High 0.20
我想找到一行中每个类的计数,以及每个类的平均值。
通过对"Low"、"Medium"、"High"三个条件分别使用 rowSums,我能够找到出现次数最多的类。但是要读取紧随其后那一列的值,我只能用 for 循环来实现:
# Walk over the first seven columns of row i; whenever a cell holds the label
# "Medium", add the value stored in the column right after it to the total.
for (k in seq_len(7)) {
  is_medium <- df[i, k] == "Medium"
  if (is_medium) {
    chancemedium <- chancemedium + df[i, k + 1]
  }
}
有更有效的方法吗?
下面是我的版本,它使用rowSums查找计数,并使用迭代for循环查看下一个值。
# Example data. Columns come in label/chance pairs: every odd column holds a
# class label ("Low", "Medium" or "High") and the even column right after it
# holds the chance attached to that label.
col1 <- c("High", "Low", "High", "High")
col2 <- c(0.7, 0.9, 0.6, 0.8)
col3 <- c("High", "Low", "High", "Low")
col4 <- c(0.7, 0.19, 0.16, 0.71)
col5 <- c("High", "Low", "Low", "Low")
col6 <- c(0.71, 0.29, 0.46, 0.28)
col7 <- c("Low", "Low", "Low", "High")
col8 <- c(0.11, 0.49, 0.63, 0.20)
df <- data.frame(col1, col2, col3, col4, col5, col6, col7, col8,
                 stringsAsFactors = FALSE)

# Order matters only for exact ties: the class listed last wins them.
classes <- c("Low", "Medium", "High")

# Split the pairs once, before the result columns are appended to df.
label_cols <- seq(1, ncol(df) - 1, by = 2)
labels <- as.matrix(df[label_cols])
chances <- as.matrix(df[label_cols + 1])

# Decide the final class of a single row.
#
# row_labels:  character vector with the class labels found in the row.
# row_chances: numeric vector with the chance that belongs to each label.
#
# Returns a list with two elements: `class`, the winning label, and `chance`,
# the mean chance of that label within the row.
pick_final_class <- function(row_labels, row_chances) {
  counts <- vapply(classes, function(cl) sum(row_labels == cl), integer(1))
  totals <- vapply(
    classes,
    function(cl) sum(row_chances[row_labels == cl]),
    numeric(1)
  )
  # The most frequent class wins. Classes with equal counts are separated by
  # their summed chance, and a remaining exact tie goes to the class that
  # comes last in `classes`.
  ranking <- order(counts, totals, seq_along(classes), decreasing = TRUE)
  winner <- ranking[1]
  list(
    class = classes[winner],
    chance = totals[[winner]] / counts[[winner]]
  )
}

# One decision per row; seq_len() keeps this safe for a data frame with no rows.
winners <- lapply(seq_len(nrow(df)), function(i) {
  pick_final_class(labels[i, ], chances[i, ])
})
df$finalclass <- vapply(winners, function(w) w[["class"]], character(1))
df$finalchance <- vapply(winners, function(w) w[["chance"]], numeric(1))
答案 0 :(得分:1)
假设各列以"键/值"对的形式交替出现,先把数据集("df")拆分为只含值的数据集("df1")和只含键的数据集("df2")。
# Even-numbered columns carry the values, odd-numbered columns carry the keys.
value_idx <- seq(from = 2, to = ncol(df), by = 2)
df1 <- df[, value_idx, drop = FALSE]
key_idx <- seq(from = 1, to = ncol(df), by = 2)
df2 <- df[, key_idx, drop = FALSE]
要获得每一行中各个类("High"、"Low"、"Medium")的"计数",我们可以使用 apply 并设置 MARGIN=1。通过把一行中的元素转换为"因子"(factor)并显式指定水平(levels),即使某个水平在该行中没有出现,也能得到它的计数(为 0)。
# Tabulate the labels of every row. Fixing the factor levels keeps a zero
# count for a class that does not occur in a row. apply() returns one column
# per row of df2, so the result is transposed back to one row per input row.
counts_by_row <- apply(df2, 1, function(row_labels) {
  table(factor(row_labels, levels = c("High", "Low", "Medium")))
})
t(counts_by_row)
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
或者,也可以使用 qdapTools 包中的便捷函数 mtabulate 来完成。
library(qdapTools)
# Transpose first so that each original row of df2 becomes a column, then let
# mtabulate() count the labels.
rows_as_columns <- as.data.frame(t(df2))
mtabulate(rows_as_columns)
# High Low Medium
#1 2 1 1
#2 0 4 0
#3 2 1 1
#4 2 2 0
要按行求出各个类对应值的"均值"(mean),我们可以用 sapply 遍历数据集("df1")的各行,并在每一行上使用分组聚合函数 tapply。
# For row i, average the values of df1 grouped by the labels found in the same
# row of df2.
mean_by_class <- function(i) {
  row_values <- unlist(df1[i, ])
  row_labels <- unlist(df2[i, ])
  tapply(row_values, row_labels, FUN = mean)
}
sapply(seq_len(nrow(df1)), mean_by_class)
#[[1]]
# High Low Medium
# 0.475 0.110 0.990
#[[2]]
# Low
#0.4675
#[[3]]
# High Low Medium
# 0.38 0.46 0.63
#[[4]]
# High Low
#0.500 0.495
或者,我们可以使用 ave,把"df1"中的每个元素替换为它所在组(同一行、同一类)的平均值。
# Replace every value by the mean of its group, where a group is defined by
# the class label together with the row index.
value_matrix <- as.matrix(df1)
label_matrix <- as.matrix(df2)
ave(value_matrix, label_matrix, row(df2))
# col2 col4 col6 col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000
# Sample data used by the answer: each label column (col1, col3, col5, col7)
# is followed by the numeric column holding the value for that label.
df <- structure(
  list(
    col1 = c("High", "Low", "High", "High"),
    col2 = c(0.7, 0.9, 0.6, 0.8),
    col3 = c("Medium", "Low", "High", "Low"),
    col4 = c(0.99, 0.19, 0.16, 0.71),
    col5 = c("High", "Low", "Low", "Low"),
    col6 = c(0.25, 0.29, 0.46, 0.28),
    col7 = c("Low", "Low", "Medium", "High"),
    col8 = c(0.11, 0.49, 0.63, 0.2)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4")
)