减少 R 中分类变量的类别数量

时间:2017-09-12 21:43:05

标签: r

我只想保留出现次数最多的前 3 个类别,并把其余类别统一设置为 "other"。逐行执行代码时可以正常工作,但把同样的代码写成函数后就不起作用了。

# Build a reproducible 20-row example whose categorical columns contain
# blank ("") and whitespace-only (" ") entries alongside real categories.
set.seed(1)
col1 <- sample(
  c(
    "ar2-15", "ar16-29", "ar30-44", "ar30-440",
    "ar300-44", "ar300-440", "", " "
  ),
  size = 20,
  replace = TRUE
)
col2 <- sample(c("Y", "N"), size = 20, replace = TRUE)
col3 <- sample(
  c("A", "B", "C", "aa", "bba", "zz", " ", "", "dd"),
  size = 20,
  replace = TRUE
)
my_data <- data.frame(col1 = col1, col2 = col2, col3 = col3)

# Show the structure of the assembled data frame.
str(my_data)

## this works
# Normalise col1 to trimmed character values so " " and "" become the same.
my_data$col1 <- trimws(as.character(my_data$col1))
# Blank entries get the sentinel "-999" so they count as their own category.
my_data$col1[my_data$col1 %in% ""] <- "-999"
# Names of the three most frequent values; exclude = NULL leaves nothing
# (including NA) out of the tally.
top_3 <- rownames(
  sort(table(my_data$col1, exclude = NULL), decreasing = TRUE)[1:3]
)
# Everything outside the top three is lumped into "other", then the column
# is stored as a factor.
my_data$col1 <- as.factor(
  replace(my_data$col1, !(my_data$col1 %in% top_3), "other")
)
## In function form this does not work: the call at the bottom leaves
## my_data unchanged.
## Why: every `df[[col_name]] <- ...` below modifies the function's own copy
## of `df`, and the function never returns that copy. Its last expression is
## an assignment, so its value (the recoded factor, not the data frame) is
## returned invisibly.
my_fn <- function(df, col_name) {
  # Coerce the column to character so the string operations below apply.
  df[[col_name]] <- as.character(df[[col_name]]);
  # Strip surrounding whitespace so " " and "" are treated alike.
  df[[col_name]] <- trimws(df[[col_name]]);
  # Replace blank entries with a sentinel; the numeric -999 is coerced to the
  # string "-999" because the column is character at this point.
  df[[col_name]][which(df[[col_name]]=="")] <- -999;
  # Names of the three most frequent values; exclude = NULL leaves nothing
  # (including NA) out of the tally.
  top_3 <- rownames(sort(table(df[[col_name]],exclude = NULL),decreasing = T)[1:3])
  # Keep values that are in the top three, label everything else "other".
  df[[col_name]] <- ifelse(df[[col_name]] %in% top_3, df[[col_name]], "other");
  # Store the result as a factor. This is the last expression, so the local
  # `df` is discarded when the function exits.
  df[[col_name]] <- as.factor(df[[col_name]])
}
# The result is not assigned to anything, so my_data keeps its old col1.
my_fn(my_data,"col1")

1 个答案:

答案 0 :(得分:3)

请在函数末尾添加 `return(df)`。R 函数内部对 `df` 的修改只作用于局部副本,必须把修改后的数据框返回,并在调用处重新赋值(例如 `my_data <- my_fn(my_data, "col1")`)。

#' Collapse a column to its most frequent categories
#'
#' Converts `df[[col_name]]` to trimmed character values, recodes blank
#' strings as the sentinel `"-999"`, keeps the `n` most frequent values and
#' replaces every other value with `other`. The column is stored as a factor.
#'
#' @param df A data frame (or list) holding the column.
#' @param col_name Name of the column to recode.
#' @param n Number of most frequent categories to keep. Defaults to 3.
#' @param other Label given to all remaining categories. Defaults to "other".
#' @return `df` with `col_name` replaced by the recoded factor. Assign the
#'   result back, e.g. `my_data <- my_fn(my_data, "col1")`.
my_fn <- function(df, col_name, n = 3L, other = "other") {
  column <- df[[col_name]]
  if (is.null(column)) {
    stop("column not found: ", col_name, call. = FALSE)
  }

  values <- trimws(as.character(column))
  # Blank entries get a sentinel so they are counted as their own category.
  values[values %in% ""] <- "-999"

  # exclude = NULL leaves nothing (including NA) out of the tally; head()
  # simply returns fewer names when the column has fewer than n categories.
  counts <- sort(table(values, exclude = NULL), decreasing = TRUE)
  keep <- head(names(counts), n)

  values[!(values %in% keep)] <- other
  df[[col_name]] <- as.factor(values)
  df
}

如果您只想返回被修改的那一列,请改为在函数末尾添加 `return(df[[col_name]])`,并在调用时把结果赋值回原来的列:

#' Recode a column to its most frequent categories and return that column
#'
#' Converts `df[[col_name]]` to trimmed character values, recodes blank
#' strings as the sentinel `"-999"`, keeps the `n` most frequent values and
#' replaces every other value with `other`.
#'
#' @param df A data frame (or list) holding the column.
#' @param col_name Name of the column to recode.
#' @param n Number of most frequent categories to keep. Defaults to 3.
#' @param other Label given to all remaining categories. Defaults to "other".
#' @return The recoded column as a factor (not the whole data frame). Assign
#'   it back, e.g. `my_data$col1 <- my_fn(my_data, "col1")`.
my_fn <- function(df, col_name, n = 3L, other = "other") {
  column <- df[[col_name]]
  if (is.null(column)) {
    stop("column not found: ", col_name, call. = FALSE)
  }

  values <- trimws(as.character(column))
  # Blank entries get a sentinel so they are counted as their own category.
  values[values %in% ""] <- "-999"

  # exclude = NULL leaves nothing (including NA) out of the tally; head()
  # simply returns fewer names when the column has fewer than n categories.
  counts <- sort(table(values, exclude = NULL), decreasing = TRUE)
  keep <- head(names(counts), n)

  values[!(values %in% keep)] <- other
  as.factor(values)
}

# Write the recoded factor returned by my_fn() back into the data frame.
my_data[["col1"]] <- my_fn(my_data, "col1")