# Creating a dataframe: two categorical variables (x, y) that contain rarely
# occurring levels, plus one quantitative variable (z).
x <- sample(LETTERS[1:4], 1000, replace = TRUE, prob = c(0.1, 0.2, 0.69, 0.01))
y <- sample(LETTERS[6:9], 1000, replace = TRUE, prob = c(0.1, 0.2, 0.68, 0.02))
z <- sample(16500:350000, 1000, replace = TRUE)
# Since R 4.0.0 data.frame() keeps strings as character by default. The code
# further down calls levels(df$y), which only works on factors, so ask for
# factors explicitly.
df <- data.frame(x, y, z, stringsAsFactors = TRUE)
我的数据集中的分类变量包含一些很少出现的水平(level)。
假设我需要每个水平中至少有100个案例,这可以使用 regclass 包的 suggest_levels
函数来完成。该函数根据某个定量变量的平均值,或某个二水平分类变量在各水平中所占的百分比,来确定哪些水平彼此相似并可以合并。
# Load regclass, the package that provides suggest_levels().
library(regclass)
# Ask suggest_levels() to merge the levels of x into 3 groups, using z as the
# quantitative variable, and to return the recoded level of every row.
t <- suggest_levels(
  z ~ x,
  data = df,
  target = 3,
  recode = TRUE
)
# Overwrite x with the merged levels and count the cases in each new level.
df$x <- t$newlevels
table(df$x)
可以看到,上面的“目标”(target)是我手动设为 3 的。为了在给定最小案例数的情况下自动得到合适的水平数,我使用了以下循环:
# Lower the number of merged levels of y step by step until every level holds
# at least 100 cases; the resulting number of levels is collected in f.
#
# Count the levels through table() rather than levels(): levels() returns NULL
# for a character column (the data.frame() default since R 4.0.0), which would
# start the search at target = -1.
p <- length(table(df$y)) - 1
k <- min(table(df$y))
f <- c()
# p >= 1 guarantees termination: without it the target would keep decreasing
# to 0 and below if no merge ever reached 100 cases per level.
while (k < 100 && p >= 1) {
  # Bare column names are resolved through `data`, which is the calling form
  # suggest_levels() is used with elsewhere (z ~ x, data = df).
  SL_1 <- suggest_levels(z ~ y, data = df, target = p, recode = TRUE)
  j <- SL_1$newlevels
  p <- p - 1
  k <- min(table(j))
  if (k >= 100) {
    f <- append(f, length(unique(j)), after = length(f))
  }
}
是否有任何函数可以根据所需的最小案例数来查找级别数?现在我已经有了这个,但它似乎不起作用:
#' Find the number of levels left after merging rare levels of a column.
#'
#' Starting from one level fewer than the column currently has, the requested
#' number of merged levels is lowered one at a time until every merged level
#' contains at least `number` cases.
#'
#' @param colname Name (string) of the categorical column to merge.
#' @param number Minimum number of cases every level must have. Defaults to
#'   100, the threshold that used to be hard-coded.
#' @param data Data frame holding the columns. Defaults to the global `df`,
#'   which is what the function always read before.
#' @param response Name (string) of the quantitative column used by
#'   suggest_levels() to compare levels. Defaults to "z".
#' @return The number of distinct levels of the first merge that satisfies
#'   the threshold; the current number of levels if no merge is needed;
#'   `NA_integer_` if no merge satisfies the threshold.
foo <- function(colname, number = 100, data = df, response = "z") {
  stopifnot(
    is.character(colname), length(colname) == 1L,
    colname %in% names(data)
  )
  # table() works for both factor and character columns, whereas levels()
  # returns NULL for character columns.
  counts <- table(data[[colname]])

  # Nothing to merge: every level already has enough cases.
  if (min(counts) >= number) {
    return(length(counts))
  }

  # Build `response ~ colname` from the names so that suggest_levels() can
  # look both columns up in `data`.
  model_formula <- stats::reformulate(colname, response = response)

  # Try n - 1, n - 2, ..., 1 merged levels. The previous version returned
  # from inside the loop after the very first attempt.
  for (target in rev(seq_len(length(counts) - 1L))) {
    merged <- suggest_levels(
      model_formula,
      data = data,
      target = target,
      recode = TRUE
    )$newlevels
    if (min(table(merged)) >= number) {
      return(length(unique(merged)))
    }
  }

  NA_integer_
}
我是编程新手,所以任何帮助都会受到赞赏。
答案 0 :(得分:0)
如果允许,我想提出一种与你不同的方法。通常,在R中使用循环是一种“禁忌”(no-no),应尽量避免。
您想要过滤掉少于100个案例的组,这里的“组”由x和y两个变量的取值组合来定义。为此,我建议使用tidyverse
方法。
# dplyr provides %>%, group_by(), mutate(), n(), ungroup() and filter().
library(dplyr)

df_filter <- df %>%
  # Group by every combination of x and y.
  group_by(x, y) %>%
  # N = number of cases in the group the row belongs to.
  mutate(N = n()) %>%
  # Drop the grouping so that later verbs act on the whole table again.
  ungroup() %>%
  # Keep groups with at least 100 cases (>= so that exactly 100 is kept).
  filter(N >= 100)
结果:
# Print a per-column summary of the filtered data (x, y, z and the count N).
summary(df_filter)
x y z N
A: 0 F: 0 Min. : 17111 Min. :125.0
B:144 G:125 1st Qu.:101317 1st Qu.:144.0
C:604 H:623 Median :176031 Median :479.0
D: 0 I: 0 Mean :180700 Mean :355.4
3rd Qu.:263174 3rd Qu.:479.0
Max. :349127 Max. :479.0
编辑:下面改用 regclass::suggest_levels()
函数来实现。请注意,我不熟悉 suggest_levels()
函数,因此直接沿用了您的代码。
# library() stops with an error when a package is missing; require() would
# only return FALSE and let the script fail later with a confusing message.
library(tidyverse)
library(regclass)
library(magrittr)
# Creating new factor: the levels of x merged into 3 groups by
# suggest_levels().
df$NewFactor <- as.factor(
  suggest_levels(z ~ x, data = df, target = 3, recode = TRUE)$newlevels
)
# Keep only the merged levels with at least 100 cases (>= so that a level
# with exactly 100 cases is kept), and drop the grouping afterwards.
df_filter <- df %>%
  group_by(NewFactor) %>%
  mutate(N = n()) %>%
  ungroup() %>%
  filter(N >= 100)
summary(df_filter)
x y z NewFactor N
A: 93 F:111 Min. : 16553 A:297 Min. :297.0
B:204 G:198 1st Qu.:101965 B:688 1st Qu.:297.0
C:688 H:653 Median :183983 C: 0 Median :688.0
D: 0 I: 23 Mean :185751 Mean :570.1
3rd Qu.:267289 3rd Qu.:688.0
Max. :349379 Max. :688.0
# Load the packages with library(): unlike require(), it raises an error
# immediately when a package is not installed.
library(tidyverse)
library(magrittr)
library(regclass)
# Creating the sample dataset (the sample size is 2500 here). The seed makes
# the draw reproducible.
set.seed(123)
df <- data.frame(
  x = sample(
    LETTERS[1:4], 2500,
    replace = TRUE, prob = c(0.1, 0.2, 0.69, 0.01)
  ),
  y = sample(
    LETTERS[6:9], 2500,
    replace = TRUE, prob = c(0.1, 0.2, 0.68, 0.02)
  ),
  z = sample(16500:350000, 2500, replace = TRUE),
  # Since R 4.0.0 strings stay character by default; request factors so that
  # x and y are categorical variables with levels.
  stringsAsFactors = TRUE
)
# Randomly split the rows of df into 3 parts.
set.seed(123)
nums <- c(60, 20, 20)
assignments <- rep_len(NA, nrow(df))
# rep() yields 60 ones, 20 twos and 20 threes (100 labels); the assignment
# recycles them over the shuffled row indices of df.
assignments[sample.int(nrow(df))] <- rep(c(1, 2, 3), times = nums)
SplitDF <- split(df, f = assignments)
#' Merge the levels of `x` and drop merged levels with too few cases.
#'
#' @param df Data frame with a categorical column `x` and a quantitative
#'   column `z`.
#' @param min_cases Minimum number of rows a merged level needs in order to
#'   be kept. Defaults to 10.
#' @param target Number of merged levels requested from suggest_levels().
#'   Defaults to 3.
#' @return A plain data frame: the rows of `df` whose merged level has at
#'   least `min_cases` cases, with the added columns `NewFactor` (merged
#'   level, unused levels dropped) and `N` (cases in that level).
FilterOut <- function(df, min_cases = 10, target = 3) {
  # Creating new factor
  df$NewFactor <- as.factor(
    suggest_levels(z ~ x, data = df, target = target, recode = TRUE)$newlevels
  )
  df %>%
    group_by(NewFactor) %>%
    mutate(N = n()) %>%
    ungroup() %>%
    # Levels with fewer than min_cases cases are removed; >= keeps a level
    # that has exactly min_cases cases.
    filter(N >= min_cases) %>%
    as.data.frame() %>%
    # Rebuild the factor to drop levels that no longer have any cases.
    mutate(NewFactor = factor(NewFactor))
}
#' Fit the example model on one dataset and return its coefficient table.
#'
#' The input is first passed through FilterOut(); a linear regression of `z`
#' on `NewFactor` is then fitted (regression is used just for the example) and
#' tidied with broom::tidy().
#'
#' @param df Data frame accepted by FilterOut().
#' @return The output of broom::tidy() for the fitted model.
ModelFunction <- function(df) {
  filtered <- FilterOut(df)
  fit <- with(filtered, lm(z ~ NewFactor))
  broom::tidy(fit)
}
# Filter the three datasets (kept so the filtered data can be inspected).
FilterCases <- lapply(SplitDF, FilterOut)
# Creating the model for the three datasets. ModelFunction() calls FilterOut()
# on its input itself, so it is given the unfiltered splits; passing
# FilterCases would run FilterOut() and suggest_levels() a second time on data
# that has already been filtered.
SetModel <- lapply(SplitDF, ModelFunction)
SetModel
$`1`
term estimate std.error statistic p.value
1 (Intercept) 181708.111 2775.383 65.4713598 0.0000000
2 NewFactorB 8117.848 6286.739 1.2912653 0.1968111
3 NewFactorC 13992.806 27869.233 0.5020879 0.6156795
$`2`
term estimate std.error statistic p.value
1 (Intercept) 178142.0976 14499.60 12.28600217 1.905931e-30
2 NewFactorB 777.3922 15346.54 0.05065586 9.596203e-01
3 NewFactorC 4435.7520 16926.89 0.26205361 7.933898e-01
$`3`
term estimate std.error statistic p.value
1 (Intercept) 169001.400 14125.24 11.9644945 3.626713e-29
2 NewFactorB 5949.759 16835.49 0.3534057 7.239342e-01
3 NewFactorC 9378.655 15010.76 0.6247956 5.323920e-01