所以我因为能够使用R建立基于阈值的表而难以解决这个问题。我想做的就是groupby(set_a,set_b
)并根据是否选择value
rank_of_values
大于.80(同时使用一些粘贴)来构建表。我的标准是,选择max rank_of_values
并在表中返回value
,如果表中为true。如果该部分为假,则在表格中附加表格中的第一个和第二个rank_of_values
= value
。我之前在set_b
中只有一个小组的情况下能够解决这个问题,但现在有多个小组我遇到了麻烦。我也试图弄清楚警告,但是因为长度错误而被重击但是到目前为止没有运气。任何帮助和建议将不胜感激。
library(data.table)
#create sample data
set_a <- c("a","a","a","a","b","b","b","b","c","c","c","c","a","a","a","a","b","b","b","b","c","c","c","c","a","a","a","a","b","b","b","b","c","c","c","c")
set_b <- c("red","red","red","red","red","red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","green","green","green","green","green","green","green","green","green","green","green","green")
#value <- c(sample(1:100,size = 36,replace = T))
value <- c(19,15,3,62,61,17,2,31,16,511,2,64,81,51,58,94,81,79,23,35,9,40,54,78,78,56,11,58,99,74,36,58,5,47,39,98)
a = data.frame(set_a,set_b,value)
a = data.table(a)
#This function calculates the average of the counts
a = a[,rank_of_values:= value/sum(value), by=list(set_a,set_b)]
head(a)
mn <- c() #create matrix to fill
mn0 <- c() #temp matrix
colu = unique(as.character(a$set_a))
colu2 = unique(as.character(a$set_b))
for (i in seq_along(colu))
{
#subset the data table for the set_a:
t = subset(a, set_a == colu[i])
for (i in seq_along(colu2)){
t2 = subset(t, set_b == colu2[i])
#subsetting the data by the set_b
if(t2$rank_of_values > .8){
mn <- cbind(as.character(t2$set_a[i]),paste0(t2$set_b[i],"_a"),t2$value[i])
mn0 <- rbind(mn,mn0)
mn2 <- cbind(as.character(t2$set_a[i]),paste0(t2$set_b[i],"_b"),t2$value[i])
mn0 <- rbind(mn0,mn2)
}
else
{
t2[order(-rank_of_values)][,.SD[1:2]] #create a second data table to select for
#order the data.table and
mn3 <- cbind(as.character(t2$set_a[1]),paste0(t2$set_b[1],"_a"),t2$value[1])
mn0 <- rbind(mn0,mn3)
mn4 <- cbind(as.character(t2$set_a[2]),paste0(t2$set_b[2],"_b"),t2$value[2])
mn0 <- rbind(mn0,mn4)
}
}
}
mn0
当前结果:
[,1] [,2] [,3]
[1,] "a" "red_a" "19"
[2,] "a" "red_b" "15"
[3,] "a" "blue_a" "81"
[4,] "a" "blue_b" "51"
[5,] "a" "green_a" "78"
[6,] "a" "green_b" "56"
[7,] "b" "red_a" "61"
[8,] "b" "red_b" "17"
[9,] "b" "blue_a" "81"
[10,] "b" "blue_b" "79"
[11,] "b" "green_a" "99"
[12,] "b" "green_b" "74"
[13,] "c" "red_a" "16"
[14,] "c" "red_b" "511"
[15,] "c" "blue_a" "9"
[16,] "c" "blue_b" "40"
[17,] "c" "green_a" "5"
[18,] "c" "green_b" "47"
期望的结果:
[,1] [,2] [,3]
[1,] "a" "red_a" "62"
[2,] "a" "red_b" "19"
[3,] "a" "blue_a" "94"
[4,] "a" "blue_b" "81"
[5,] "a" "green_a" "78"
[6,] "a" "green_b" "58"
[7,] "b" "red_a" "61"
[8,] "b" "red_b" "31"
[9,] "b" "blue_a" "81"
[10,] "b" "blue_b" "79"
[11,] "b" "green_a" "99"
[12,] "b" "green_b" "74"
[13,] "c" "red_a" "511"
[14,] "c" "red_b" "64"
[15,] "c" "blue_a" "78"
[16,] "c" "blue_b" "54"
[17,] "c" "green_a" "98"
[18,] "c" "green_b" "47"
收到警告消息:
Warning messages:
1: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
2: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
3: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
4: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
5: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
6: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
7: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
8: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
9: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
答案 0 :(得分:0)
在搜索论坛后,我找到了解决问题的方法。
mn <- c() #create matrix to fill
mn0 <- c() #temp matrix
colu = unique(as.character(a$set_a))
colu2 = unique(as.character(a$set_b))
for (i in colu){
for (j in colu2){
t = a[a$set_a %in% i & a$set_b %in% j,][order(-rank_of_values)]
for(z in t$rank_of_values[[nrow(t)]]){
if(t$rank_of_values[[z]] > .8){
print(z)
mn <- cbind(as.character(t$set_a[[z]]),paste0(t$set_b[[z]],"_a"),t$value[[z]])
mn0 <- rbind(mn,mn0)
mn2 <- cbind(as.character(t$set_a[[z]]),paste0(t$set_b[[z]],"_b"),t$value[[z]])
mn0 <- rbind(mn0,mn2)
}
else{
#create a second data table to select for
#order the data.table and
mn3 <- cbind(as.character(t$set_a[[1]]),paste0(t$set_b[[1]],"_a"),t$value[[1]])
mn0 <- rbind(mn0,mn3)
mn4 <- cbind(as.character(t$set_a[[2]]),paste0(t$set_b[[2]],"_b"),t$value[[2]])
mn0 <- rbind(mn0,mn4)
}
}
}
}
<强>结果强>
[,1] [,2] [,3]
[1,] "a" "red_a" "62"
[2,] "a" "red_b" "19"
[3,] "a" "blue_a" "94"
[4,] "a" "blue_b" "81"
[5,] "a" "green_a" "78"
[6,] "a" "green_b" "58"
[7,] "b" "red_a" "61"
[8,] "b" "red_b" "31"
[9,] "b" "blue_a" "81"
[10,] "b" "blue_b" "79"
[11,] "b" "green_a" "99"
[12,] "b" "green_b" "74"
[13,] "c" "red_a" "511"
[14,] "c" "red_b" "64"
[15,] "c" "blue_a" "78"
[16,] "c" "blue_b" "54"
[17,] "c" "green_a" "98"
[18,] "c" "green_b" "47"