使用R中的阈值进行子集化并使用for循环返回相邻行

时间:2016-10-12 20:30:42

标签: r data.table

所以我因为能够使用R建立基于阈值的表而难以解决这个问题。我想做的就是groupby(set_a,set_b)并根据是否选择value rank_of_values大于.80(同时使用一些粘贴)来构建表。我的标准是,选择max rank_of_values并在表中返回value,如果表中为true。如果该部分为假,则在表格中附加表格中的第一个和第二个rank_of_values = value。我之前在set_b中只有一个小组的情况下能够解决这个问题,但现在有多个小组我遇到了麻烦。我也试图弄清楚警告,但是因为长度错误而被重击但是到目前为止没有运气。任何帮助和建议将不胜感激。

library(data.table)


#create sample data
set_a <- c("a","a","a","a","b","b","b","b","c","c","c","c","a","a","a","a","b","b","b","b","c","c","c","c","a","a","a","a","b","b","b","b","c","c","c","c")
set_b <- c("red","red","red","red","red","red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","green","green","green","green","green","green","green","green","green","green","green","green")
#value <- c(sample(1:100,size = 36,replace = T))
value <- c(19,15,3,62,61,17,2,31,16,511,2,64,81,51,58,94,81,79,23,35,9,40,54,78,78,56,11,58,99,74,36,58,5,47,39,98)
a = data.frame(set_a,set_b,value)

a = data.table(a)

#This function calculates the average of the counts
a = a[,rank_of_values:= value/sum(value), by=list(set_a,set_b)]
head(a)


mn <- c() #create matrix to fill
mn0 <- c() #temp matrix
colu = unique(as.character(a$set_a))
colu2 = unique(as.character(a$set_b))

for (i in seq_along(colu))
{
  #subset the data table for the set_a:
  t = subset(a, set_a == colu[i])

  for (i in seq_along(colu2)){
    t2 =  subset(t, set_b == colu2[i])
    #subsetting the data by the set_b
    if(t2$rank_of_values > .8){
      mn <- cbind(as.character(t2$set_a[i]),paste0(t2$set_b[i],"_a"),t2$value[i])
      mn0 <- rbind(mn,mn0)
      mn2 <- cbind(as.character(t2$set_a[i]),paste0(t2$set_b[i],"_b"),t2$value[i])
      mn0 <- rbind(mn0,mn2)

    }
    else
    {
      t2[order(-rank_of_values)][,.SD[1:2]] #create a second data table to select for
      #order the data.table and  

      mn3 <- cbind(as.character(t2$set_a[1]),paste0(t2$set_b[1],"_a"),t2$value[1])

      mn0 <- rbind(mn0,mn3)
      mn4 <- cbind(as.character(t2$set_a[2]),paste0(t2$set_b[2],"_b"),t2$value[2])
      mn0 <- rbind(mn0,mn4)
    }
  }
}

mn0

当前结果:

     [,1] [,2]      [,3] 
 [1,] "a"  "red_a"   "19" 
 [2,] "a"  "red_b"   "15" 
 [3,] "a"  "blue_a"  "81" 
 [4,] "a"  "blue_b"  "51" 
 [5,] "a"  "green_a" "78" 
 [6,] "a"  "green_b" "56" 
 [7,] "b"  "red_a"   "61" 
 [8,] "b"  "red_b"   "17" 
 [9,] "b"  "blue_a"  "81" 
[10,] "b"  "blue_b"  "79" 
[11,] "b"  "green_a" "99" 
[12,] "b"  "green_b" "74" 
[13,] "c"  "red_a"   "16" 
[14,] "c"  "red_b"   "511"
[15,] "c"  "blue_a"  "9"  
[16,] "c"  "blue_b"  "40" 
[17,] "c"  "green_a" "5"  
[18,] "c"  "green_b" "47" 

期望的结果:

        [,1] [,2]      [,3] 
 [1,] "a"  "red_a"   "62" 
 [2,] "a"  "red_b"   "19" 
 [3,] "a"  "blue_a"  "94" 
 [4,] "a"  "blue_b"  "81" 
 [5,] "a"  "green_a" "78" 
 [6,] "a"  "green_b" "58" 
 [7,] "b"  "red_a"   "61" 
 [8,] "b"  "red_b"   "31" 
 [9,] "b"  "blue_a"  "81" 
[10,] "b"  "blue_b"  "79" 
[11,] "b"  "green_a" "99" 
[12,] "b"  "green_b" "74" 
[13,] "c"  "red_a"   "511"
[14,] "c"  "red_b"   "64" 
[15,] "c"  "blue_a"  "78" 
[16,] "c"  "blue_b"  "54" 
[17,] "c"  "green_a" "98" 
[18,] "c"  "green_b" "47" 

收到警告消息:

Warning messages:
1: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
2: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
3: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
4: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
5: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
6: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
7: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
8: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used
9: In if (t2$rank_of_values > 0.8) { :
  the condition has length > 1 and only the first element will be used

1 个答案:

答案 0 :(得分:0)

在搜索论坛后,我找到了解决问题的方法。

mn <- c() #create matrix to fill
mn0 <- c() #temp matrix
colu = unique(as.character(a$set_a))
colu2 = unique(as.character(a$set_b))

for (i in colu){
  for (j in colu2){

    t = a[a$set_a %in% i & a$set_b %in% j,][order(-rank_of_values)]
    for(z in t$rank_of_values[[nrow(t)]]){
      if(t$rank_of_values[[z]] > .8){
        print(z)
        mn <- cbind(as.character(t$set_a[[z]]),paste0(t$set_b[[z]],"_a"),t$value[[z]])
        mn0 <- rbind(mn,mn0)
        mn2 <- cbind(as.character(t$set_a[[z]]),paste0(t$set_b[[z]],"_b"),t$value[[z]])
        mn0 <- rbind(mn0,mn2)
      }
      else{
         #create a second data table to select for
        #order the data.table and  

        mn3 <- cbind(as.character(t$set_a[[1]]),paste0(t$set_b[[1]],"_a"),t$value[[1]])

        mn0 <- rbind(mn0,mn3)
        mn4 <- cbind(as.character(t$set_a[[2]]),paste0(t$set_b[[2]],"_b"),t$value[[2]])
        mn0 <- rbind(mn0,mn4)
      }
    }
  }
}

<强>结果

     [,1] [,2]      [,3] 
 [1,] "a"  "red_a"   "62" 
 [2,] "a"  "red_b"   "19" 
 [3,] "a"  "blue_a"  "94" 
 [4,] "a"  "blue_b"  "81" 
 [5,] "a"  "green_a" "78" 
 [6,] "a"  "green_b" "58" 
 [7,] "b"  "red_a"   "61" 
 [8,] "b"  "red_b"   "31" 
 [9,] "b"  "blue_a"  "81" 
[10,] "b"  "blue_b"  "79" 
[11,] "b"  "green_a" "99" 
[12,] "b"  "green_b" "74" 
[13,] "c"  "red_a"   "511"
[14,] "c"  "red_b"   "64" 
[15,] "c"  "blue_a"  "78" 
[16,] "c"  "blue_b"  "54" 
[17,] "c"  "green_a" "98" 
[18,] "c"  "green_b" "47"