我有许多 data.frame,每个都包含一个因子(factor)列。我想确保它们都使用同一组水平(levels)。正确的做法是什么?
在下面的代码中,您将看到我先从所有可能的取值得到完整的水平集合,再借助一个小的辅助函数 changeFactor 为每个数据框重新生成因子。我希望有更好的方法来做到这一点。
# Three pools of simulated values, each drawn under its own seed and rounded
# to two decimals.
set.seed(1234)
b <- round(runif(100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(50, min = 15, max = 18), digits = 2)

#.. all potential levels
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)

# Frequency table of one random draw per group, tagged with its label p.
# The draws happen in the order t1..t6, so the random stream is consumed
# in the same sequence as before.
t1 <- cbind(as.data.frame(table(sample(b, 5))), p = "A")
t2 <- cbind(as.data.frame(table(sample(b, 1))), p = "B")
t3 <- cbind(as.data.frame(table(sample(b, 1))), p = "C")
t4 <- cbind(as.data.frame(table(sample(b, 8))), p = "D")
t5 <- cbind(as.data.frame(table(sample(b2, 20))), p = "E")
t6 <- cbind(as.data.frame(table(sample(b3, 18))), p = "F")

# Stack the tables in a shuffled group order (B, C, F, D, E, A).
d <- rbind(data.frame(), t2, t3, t6, t4, t5, t1)
#.. out of order bins
# ggplot() and friends come from ggplot2; attach it before the first plot.
library(ggplot2)
# Bar chart of Freq per value of Var1, coloured by group p, with one facet
# row per group. margins is spelled TRUE because T is an ordinary variable
# that can be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("out of order bins")
#' Rebuild a factor of numeric labels on a shared set of levels.
#'
#' @param t Factor (or other vector) whose labels are numbers.
#' @param lvls Levels the returned factor should carry.
#' @return A factor with levels `lvls`; values not found in `lvls` become NA.
changeFactor <- function(t, lvls) {
  # Convert through character first: as.numeric() applied directly to a
  # factor yields its internal integer codes, not the labelled values.
  factor(as.numeric(as.character(t)), levels = lvls)
}
# Put Var1 of every table on the full, shared set of levels.
t1[["Var1"]] <- changeFactor(t1[["Var1"]], lvls)
t2[["Var1"]] <- changeFactor(t2[["Var1"]], lvls)
t3[["Var1"]] <- changeFactor(t3[["Var1"]], lvls)
t4[["Var1"]] <- changeFactor(t4[["Var1"]], lvls)
t5[["Var1"]] <- changeFactor(t5[["Var1"]], lvls)
t6[["Var1"]] <- changeFactor(t6[["Var1"]], lvls)

# Rebuild the combined frame in the same shuffled group order as before.
d <- rbind(data.frame(), t2, t3, t6, t4, t5, t1)
#.. in order bins
# Same chart as for the unordered data, drawn from the re-levelled d.
# margins is spelled TRUE because T is an ordinary variable that can be
# reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")
答案 0 :(得分:2)
我认为,在你不知道全部“真实”水平的情况下,用 as.character 来“读取”因子是最好的做法。
但是,既然你确实知道这些水平(它们都保存在 lvls 中),为什么不在构建 ti$Var1 向量时就直接使用它们呢?也就是说,与其这样写:
# as.data.frame(table(...)) automatically creates the factor ti$Var1, whose
# levels are just the values found inside this sample.
ti <- as.data.frame(table(sample(b, 5)))
# Replace it with a new factor: each value of the previous one is read back
# as text and assigned its level from lvls.
ti$Var1 <- factor(as.character(ti$Var1), levels = lvls)
(这正是你目前的做法),不如直接这样写:
tab <- table(sample(b, 5))
# Build the factor on the levels from lvls right away, and choose the column
# names explicitly.
ti <- data.frame(
  myVar = factor(names(tab), levels = lvls),
  myFreq = as.numeric(tab)
)
(这才是你最终想要的结果,而且还能让你更好地控制 ti 各列的名称。)
或者,如果你希望未被抽到的水平也以计数为 0 的行保留下来,可以这样写:
# Associate each drawn value with its level from lvls straight away.
factoredSample <- factor(sample(b, 5), lvls)
# table() then also counts the levels that do not occur in factoredSample.
ti <- as.data.frame(table(factoredSample))
(顺便说一下,我不知道这样写是否只是为了便于提问;但如果你的脚本里真的需要处理这么多几乎相同的 data.frame,那么你很可能用错了数据结构。)
答案 1 :(得分:2)
简短回答:把数据保存在一个列表(list)中,并学习使用 *apply 系列函数。
# Three pools of simulated values, each drawn under its own seed and rounded
# to two decimals.
set.seed(1234)
b <- round(runif(100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(50, min = 15, max = 18), digits = 2)

#.. all potential levels
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)
# Keep character columns as plain strings when data frames are built
# (this is already the default from R 4.0 onwards).
options(stringsAsFactors = FALSE)

#' Tabulate a random draw and tag it with a group label.
#'
#' @param x Vector of values to draw from.
#' @param y Number of values to draw (without replacement).
#' @param z Label stored in column `p` of every row.
#' @return A data frame with columns `Var1` (the distinct drawn values),
#'   `Freq` (how often each was drawn) and `p`.
f <- function(x, y, z) {
  # Draw by position instead of calling sample(x, y): sample() treats a
  # single number x >= 1 as the range 1:x, so a one-element pool would yield
  # values that are not in the pool. For longer x both forms pick the same
  # elements.
  # The draw stays inline inside table() so the first column keeps its
  # default name Var1.
  cbind(data.frame(table(x[sample.int(length(x), size = y)])), p = z)
}
# One call of f per group: the pool to draw from, the number of draws and
# the group label are matched up position by position.
datl <- Map(
  f,
  list(b, b, b, b, b2, b3),
  c(5, 1, 1, 8, 20, 18),
  LETTERS[1:6]
)
#' Rebuild a factor of numeric labels on a shared set of levels.
#'
#' @param t Factor (or other vector) whose labels are numbers.
#' @param lvls Levels the returned factor should carry.
#' @return A factor with levels `lvls`; values not found in `lvls` become NA.
changeFactor <- function(t, lvls) {
  # Convert through character first: as.numeric() applied directly to a
  # factor yields its internal integer codes, not the labelled values.
  factor(as.numeric(as.character(t)), levels = lvls)
}
# Re-level every factor column found anywhere in the list. The `f =` below
# is rapply()'s own argument name, not the sampling helper f().
datl <- rapply(
  datl,
  f = function(x) changeFactor(x, lvls),
  classes = "factor",
  how = "replace"
)
# Pass each element through data.frame() before the frames are stacked.
datl <- lapply(datl, data.frame)

# Stack the frames in a shuffled group order.
d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])
#.. in order bins
# ggplot() and friends come from ggplot2; attach it before plotting.
library(ggplot2)
# Bar chart of Freq per value of Var1, coloured by group p, with one facet
# row per group. margins is spelled TRUE because T is an ordinary variable
# that can be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")
详细回答:
# Three pools of simulated values, each drawn under its own seed and rounded
# to two decimals.
set.seed(1234)
b <- round(runif(100, min = 1, max = 10), digits = 2)
set.seed(2345)
b2 <- round(runif(100, min = 11, max = 20), digits = 2)
set.seed(3456)
b3 <- round(runif(50, min = 15, max = 18), digits = 2)

#.. all potential levels
bt <- factor(sort(c(b, b2, b3)))
lvls <- levels(bt)
首先,我不希望字符串被意外地转换成因子(factor),所以设置 stringsAsFactors = FALSE。
然后编写一个函数 f 来完成你想做的事情,并检查它是否正常工作:
# Keep character columns as plain strings when data frames are built
# (this is already the default from R 4.0 onwards).
options(stringsAsFactors = FALSE)

#' Tabulate a random draw and tag it with a group label.
#'
#' @param x Vector of values to draw from.
#' @param y Number of values to draw (without replacement).
#' @param z Label stored in column `p` of every row.
#' @return A data frame with columns `Var1` (the distinct drawn values),
#'   `Freq` (how often each was drawn) and `p`.
f <- function(x, y, z) {
  # Draw by position instead of calling sample(x, y): sample() treats a
  # single number x >= 1 as the range 1:x, so a one-element pool would yield
  # values that are not in the pool. For longer x both forms pick the same
  # elements.
  # The draw stays inline inside table() so the first column keeps its
  # default name Var1.
  cbind(data.frame(table(x[sample.int(length(x), size = y)])), p = z)
}
# Try the helper on one group; the lines below are the printed result.
f(b, 5, "A")
#   Var1 Freq p
# 1 1.13    1 A
# 2 1.46    1 A
# 3 2.09    1 A
# 4  2.5    1 A
# 5 7.02    1 A
看起来没有问题,接下来只需用 Map 把它应用到各组参数上,并检查输出:
# One call of f per group: the pool to draw from, the number of draws and
# the group label are matched up position by position.
datl <- Map(f, list(b, b, b, b, b2, b3), c(5, 1, 1, 8, 20, 18), LETTERS[1:6])
# Inspect the result; the listing below is the (truncated) output of str().
str(datl)
# List of 6
#  $ :'data.frame': 5 obs. of 3 variables:
#   ..$ Var1: Factor w/ 5 levels "2.02","3.09",..: 1 2 3 4 5
#   ..$ Freq: int [1:5] 1 1 1 1 1
#   ..$ p   : chr [1:5] "A" "A" "A" "A" ...
#  $ :'data.frame': 1 obs. of 3 variables:
#   ..$ Var1: Factor w/ 1 level "1.63": 1
#   ..$ Freq: int 1
#   ..$ p   : chr "B"
# ...
然后把所有数据合并起来,并用 ggplot 绘图:
d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])
library(ggplot2)
#.. out of order bins
# Bar chart of Freq per value of Var1, coloured by group p, with one facet
# row per group. margins is spelled TRUE because T is an ordinary variable
# that can be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("out of order bins")
#' Rebuild a factor of numeric labels on a shared set of levels.
#'
#' @param t Factor (or other vector) whose labels are numbers.
#' @param lvls Levels the returned factor should carry.
#' @return A factor with levels `lvls`; values not found in `lvls` become NA.
changeFactor <- function(t, lvls) {
  # Convert through character first: as.numeric() applied directly to a
  # factor yields its internal integer codes, not the labelled values.
  factor(as.numeric(as.character(t)), levels = lvls)
}
同样,先在一个数据框上确认这个函数的行为符合预期:
# Try the helper on the first frame only; the lines below are the printed
# result, now carrying the full set of levels.
changeFactor(datl[[1]][["Var1"]], lvls)
# [1] 2.02 3.09 3.79 3.89 8.3
# 234 Levels: 1.09 1.12 1.13 1.24 1.36 1.38 1.41 1.46 1.63 1.66 1.81 1.95 ... 19.86
然后一次性把它应用到所有数据框上,并检查输出:
# Re-level every factor column found anywhere in the list. The `f =` below
# is rapply()'s own argument name, not the sampling helper f().
datl <- rapply(
  datl,
  f = function(x) changeFactor(x, lvls),
  classes = "factor",
  how = "replace"
)
# Pass each element through data.frame(); the str() listing below shows the
# elements as data frames with Var1 on all 234 levels.
datl <- lapply(datl, data.frame)
str(datl)
# List of 6
#  $ :'data.frame': 5 obs. of 3 variables:
#   ..$ Var1: Factor w/ 234 levels "1.09","1.12",..: 13 28 41 45 81
#   ..$ Freq: int [1:5] 1 1 1 1 1
#   ..$ p   : chr [1:5] "A" "A" "A" "A" ...
#  $ :'data.frame': 1 obs. of 3 variables:
#   ..$ Var1: Factor w/ 234 levels "1.09","1.12",..: 9
#   ..$ Freq: int 1
#   ..$ p   : chr "B"
# ...
再次组合并绘制
# Stack the re-levelled frames in the same shuffled group order as before.
d <- do.call(rbind, datl[c(2, 3, 6, 4, 5, 1)])
#.. in order bins
# Same chart as for the unordered data. margins is spelled TRUE because T is
# an ordinary variable that can be reassigned.
ggplot(d, aes(x = factor(Var1), fill = factor(p))) +
  geom_bar(aes(weight = Freq)) +
  facet_grid(p ~ ., margins = TRUE) +
  ggtitle("in order bins")