while循环作为一个函数来确定给定r中级别的最小案例数的级别数

时间:2018-04-23 10:14:11

标签: r

# Creating a dataframe.
x <- sample( LETTERS[1:4], 1000, replace=TRUE, prob=c(0.1, 0.2, 0.69, 0.01))
y <- sample( LETTERS[6:9], 1000, replace=TRUE, prob=c(0.1, 0.2, 0.68, 0.02))
z <- sample(16500:350000, 1000, replace=TRUE)
df<- data.frame(x, y, z)

我的数据集中的分类变量具有很少发生的级别 让我们说我需要在每个级别中至少有100个案例,而这个案例可以使用suggest_levels函数来完成。该函数根据某个定量变量的平均值或两级分类变量的每个级别的百分比来确定彼此相似的级别。

library("regclass")

t <- suggest_levels(z~x,data=df,target=3,recode=TRUE)
df$x <- t$newlevels
table(df$x)

我可以看到我设定了#34;目标&#34;手动选择3 为了能够在给定最小数量的情况下获得完美数量的级别,我将使用以下循环:

p <- (length(levels(df$y)) - 1)
k <- min(table(df$y))
f <- c()

while (k < 100) {
    SL_1 <- suggest_levels(df$z ~ df$y, data = df, target = p, recode = TRUE)
    j <- SL_1$newlevels
    p = p - 1
    k <- min(table(j))

    if (k >= 100) {
        f <- append(f, length(unique(SL_1$newlevels)), after = length(f))
    }
}

是否有任何函数可以根据所需的最小案例数来查找级别数?现在我已经有了这个,但它似乎不起作用:

foo <- function(colname, number) {

    number <- (length(levels(df[[colname]])) - 1)
    d <- min(table(df[[colname]]))
    num_of_levels <- c()

    while (d < 100) {
        SL_1 <- suggest_levels(df$z ~ df[[colname]], data = df, target = number, 
            recode = TRUE)
        j <- SL_1$newlevels
        number = number - 1
        d <- min(table(j))
        if (d >= 100) {
            num_of_levels <- append(num_of_levels, length(unique(SL_1$newlevels)), after = length(num_of_levels))
        }
        return(num_of_levels)
    }
}

我是编程新手,所以任何帮助都会受到赞赏。

1 个答案:

答案 0 :(得分:0)

如果允许,我想提出一种与你不同的方法。通常,在R中使用循环是“不是不”。

您尝试过滤少于100个案例的群组。在x和y变量下,组被视为一个案例。为此,我建议使用tidyverse方法。

df_filter <- df %>% 
  #Grouping under the variables x and Y 
  group_by(x, y) %>%
  #Creating a new variable that counts the number of cases per group. 
  mutate(N = n()) %>% 
  #Filter groups with less than 100 cases. 
  filter(N> 100)

结果:

summary(df_filter)

 x       y             z                N        
 A:  0   F:  0   Min.   : 17111   Min.   :125.0  
 B:144   G:125   1st Qu.:101317   1st Qu.:144.0  
 C:604   H:623   Median :176031   Median :479.0  
 D:  0   I:  0   Mean   :180700   Mean   :355.4  
                 3rd Qu.:263174   3rd Qu.:479.0  
                 Max.   :349127   Max.   :479.0 

使用regclass::suggest_levels()功能进行编辑。

请注意,我不熟悉suggest_levels()函数,因此我使用了您的代码。

require(tidyverse)
require(regclass)
require(magrittr)

#Creating new factor
df$NewFactor <- as.factor(
  suggest_levels(z~x,data=df,target=3,recode=TRUE)$newlevels)

#filter factors with less than 100 cases
df_filter <- df %>% 
  group_by(NewFactor) %>% 
  mutate(N = n()) %>% 
  filter(N > 100) 

summary(df_filter)

 x       y             z          NewFactor       N        
 A: 93   F:111   Min.   : 16553   A:297     Min.   :297.0  
 B:204   G:198   1st Qu.:101965   B:688     1st Qu.:297.0  
 C:688   H:653   Median :183983   C:  0     Median :688.0  
 D:  0   I: 23   Mean   :185751             Mean   :570.1  
                 3rd Qu.:267289             3rd Qu.:688.0  
                 Max.   :349379             Max.   :688.0  

附加编辑

require(tidyverse)
require(magrittr)
require(regclass)

#Creating the Sample dataset (I changed the sample to 2500)
set.seed(123)
df <- data.frame(x = sample( LETTERS[1:4], 2500, replace=TRUE, prob=c(0.1, 0.2, 0.69, 0.01)),
                y = sample( LETTERS[6:9], 2500, replace=TRUE, prob=c(0.1, 0.2, 0.68, 0.02)), 
                z = sample(16500:350000, 2500, replace=TRUE))


# Split into 3 parts
set.seed(123)
nums <- c(60, 20, 20)
assignments <- rep(NA, nrow(df))
assignments[sample(nrow(df))] <- rep(c(1, 2, 3), nums)
SplitDF <- split(df, assignments)

#Creating function to filter out factors with less than 10 cases
FilterOut <- function(df) {

  #Creating new factor
  df$NewFactor <- as.factor(
    suggest_levels(z~x,data=df, target=3,recode=TRUE)$newlevels)

  #filter factors with less than 10 cases
  df_filter <- df %>% 
    group_by(NewFactor) %>% 
    mutate(N = n()) %>% 
    filter(N > 10) %>% 
    as.data.frame() %>% 
    #to drop levels without cases
    mutate(NewFactor = factor(NewFactor))

  return(df_filter)
}

#Creating function to get model. I used regression just for the example
ModelFunction <- function(df) {
  Model <- FilterOut(df) %$%
  lm(z ~ NewFactor) %>% 
  broom::tidy()

  return(Model) }

#Filter the three datasets
FilterCases <- lapply(SplitDF, FilterOut)

#Creating the model for the three datasets
SetModel <- lapply(FilterCases, ModelFunction)

SetModel 



$`1`
         term   estimate std.error  statistic   p.value
1 (Intercept) 181708.111  2775.383 65.4713598 0.0000000
2  NewFactorB   8117.848  6286.739  1.2912653 0.1968111
3  NewFactorC  13992.806 27869.233  0.5020879 0.6156795

$`2`
         term    estimate std.error   statistic      p.value
1 (Intercept) 178142.0976  14499.60 12.28600217 1.905931e-30
2  NewFactorB    777.3922  15346.54  0.05065586 9.596203e-01
3  NewFactorC   4435.7520  16926.89  0.26205361 7.933898e-01

$`3`
         term   estimate std.error  statistic      p.value
1 (Intercept) 169001.400  14125.24 11.9644945 3.626713e-29
2  NewFactorB   5949.759  16835.49  0.3534057 7.239342e-01
3  NewFactorC   9378.655  15010.76  0.6247956 5.323920e-01