根据前一列中的条件查看数据框中的下一列

时间:2015-02-24 16:09:35

标签: r rstudio

假设我有一个如下数据框,其中包含类(低,中,高)和一些值。因此,在第一行中,0.7和0.25是高的几率,0.99是中等的几率,0.11是低的几率。

    col1 col2   col3 col4 col5 col6   col7 col8 
1   High  0.7 Medium 0.99 High 0.25    Low 0.11 
2    Low  0.9    Low 0.19  Low 0.29    Low 0.49 
3   High  0.6   High 0.16  Low 0.46 Medium 0.63 
4   High  0.8    Low 0.71  Low 0.28   High 0.20

我想找到一行中每个类的计数,以及每个类的平均值。

通过对"Low"、"Medium"、"High"这三个条件分别使用 rowSums,我能够找到出现次数最多的类。但是要读取紧随其后的那一列的值,我目前只能用 for 循环来实现:

# Walk over columns 1..7 of row `i`; whenever a cell holds the label "Medium",
# add the value stored in the column right after it to the running total.
for (j in 1:7) {
  is_medium <- df[i, j] == "Medium"
  if (is_medium) {
    chancemedium <- chancemedium + df[i, j + 1]
  }
}

有更有效的方法吗?

下面是我的版本,它使用rowSums查找计数,并使用迭代for循环查看下一个值。

# Example data: four (class, chance) pairs per row, stored in alternating
# columns -- odd columns hold the class label, the following even column holds
# the chance attached to that label.
col1 <- c("High", "Low", "High", "High")
col2 <- c(0.7, 0.9, 0.6, 0.8)
col3 <- c("High", "Low", "High", "Low")
col4 <- c(0.7, 0.19, 0.16, 0.71)
col5 <- c("High", "Low", "Low", "Low")
col6 <- c(0.71, 0.29, 0.46, 0.28)
col7 <- c("Low", "Low", "Low", "High")
col8 <- c(0.11, 0.49, 0.63, 0.20)

df <- data.frame(col1, col2, col3, col4, col5, col6, col7, col8)

# Known classes. The order matters: when two candidate classes have exactly
# the same summed chance, the one listed later wins.
classes <- c("Low", "Medium", "High")

# Create the result columns up front with the right types.
df$finalclass <- rep(NA_character_, nrow(df))
df$finalchance <- rep(NA_real_, nrow(df))

for (i in seq_len(nrow(df))) {
  # How often each class occurs in this row (numeric cells never match a
  # label, so only the label columns contribute).
  counts <- vapply(
    classes,
    function(cl) rowSums(df[i, 1:8] == cl),
    numeric(1)
  )

  # Summed chance per class: the label in column j is followed by its chance
  # in column j + 1, so only the odd columns need to be visited.
  chances <- c(Low = 0, Medium = 0, High = 0)
  for (j in seq(1, 7, by = 2)) {
    label <- as.character(df[i, j])
    if (label %in% classes) {
      chances[label] <- chances[label] + df[i, j + 1]
    }
  }

  highestcount <- max(counts)

  # Only the most frequent classes can win. If several share the highest
  # count, the tie is broken by the larger summed chance.
  candidates <- which(counts == highestcount)
  best <- which(chances[candidates] == max(chances[candidates]))
  winner <- candidates[max(best)]

  df$finalclass[i] <- classes[winner]
  # Summed chance divided by the winner's count, i.e. its mean chance.
  df$finalchance[i] <- unname(chances[winner]) / highestcount
}

1 个答案:

答案 0 :(得分:1)

假设各列以"键/值"对的形式交替出现,将数据集("df")拆分为值数据集("df1")和键数据集("df2")。

# Even-numbered columns hold the values, odd-numbered columns hold the keys.
value_cols <- seq(2, ncol(df), by = 2)
key_cols <- seq(1, ncol(df), by = 2)
df1 <- df[value_cols]
df2 <- df[key_cols]

要得到每一行中各个类别("High"、"Low"、"Medium")的"计数",可以使用 MARGIN=1 的 apply。先将每行的元素转换为"因子"(factor)并显式指定 levels,这样即使某个类别在该行中没有出现,也能得到它的计数(为 0)。

 # Fixing the factor levels makes classes that are absent from a row show up
 # with a count of 0 instead of being dropped.
 class_levels <- c("High", "Low", "Medium")
 count_classes <- function(x) table(factor(x, levels = class_levels))
 t(apply(df2, MARGIN = 1, FUN = count_classes))
 #  High Low Medium
 #1    2   1      1
 #2    0   4      0
 #3    2   1      1
 #4    2   2      0

或者,也可以使用 qdapTools 包中的便捷函数 mtabulate 来完成。

 library(qdapTools)
 # Transpose so that every original row becomes one column to tabulate.
 keys_by_row <- as.data.frame(t(df2))
 mtabulate(keys_by_row)
 #  High Low Medium
 #1    2   1      1
 #2    0   4      0
 #3    2   1      1
 #4    2   2      0

要按行求出不同类别对应值的"平均值"(mean),可以用 sapply 遍历数据集("df1")的各行,并使用聚合函数 tapply。

# Mean of the values in row `i` of df1, grouped by the class labels found in
# the same row of df2.
row_class_means <- function(i) {
  tapply(unlist(df1[i, ]), unlist(df2[i, ]), FUN = mean)
}
sapply(seq_len(nrow(df1)), row_class_means)
#[[1]]
# High    Low Medium 
# 0.475  0.110  0.990 

#[[2]]
# Low 
#0.4675 

#[[3]]
# High    Low Medium 
#  0.38   0.46   0.63 

#[[4]]
# High   Low 
#0.500 0.495 

或者,我们可以使用 ave,把"df1"中的每个元素替换为其所在组(同一行、同一类别)的平均值。

# Group every value by (class label, row index) and replace it with the mean
# of its group.
values <- as.matrix(df1)
labels <- as.matrix(df2)
ave(values, labels, row(df2))
#   col2   col4   col6   col8
#1 0.4750 0.9900 0.4750 0.1100
#2 0.4675 0.4675 0.4675 0.4675
#3 0.3800 0.3800 0.4600 0.6300
#4 0.5000 0.4950 0.4950 0.5000

数据

# Sample data: label columns (col1, col3, col5, col7) alternate with the
# numeric columns holding their values (col2, col4, col6, col8).
df <- structure(
  list(
    col1 = c("High", "Low", "High", "High"),
    col2 = c(0.7, 0.9, 0.6, 0.8),
    col3 = c("Medium", "Low", "High", "Low"),
    col4 = c(0.99, 0.19, 0.16, 0.71),
    col5 = c("High", "Low", "Low", "Low"),
    col6 = c(0.25, 0.29, 0.46, 0.28),
    col7 = c("Low", "Low", "Medium", "High"),
    col8 = c(0.11, 0.49, 0.63, 0.2)
  ),
  class = "data.frame",
  row.names = c("1", "2", "3", "4")
)