我只想保留出现次数最多的前 3 个类别,并将其余类别统一设置为"other"。逐行运行代码时一切正常,但把同样的代码写成函数后就不起作用了。
# Build a reproducible 20-row example data frame. The three sample() calls
# must stay in this order so the seeded random stream yields the same values.
set.seed(1)
col1 <- sample(
  x = c("ar2-15", "ar16-29", "ar30-44", "ar30-440", "ar300-44", "ar300-440",
        "", " "),
  size = 20,
  replace = TRUE
)
col2 <- sample(x = c("Y", "N"), size = 20, replace = TRUE)
col3 <- sample(
  x = c("A", "B", "C", "aa", "bba", "zz", " ", "", "dd"),
  size = 20,
  replace = TRUE
)
my_data <- data.frame(col1 = col1, col2 = col2, col3 = col3)
# Show the column types of the freshly built data frame.
str(my_data)
## this works
# Normalise col1 to plain character with surrounding whitespace removed.
my_data$col1 <- trimws(as.character(my_data$col1))
# Blank entries get an explicit sentinel label. It is written as a string
# because the column is character; a numeric -999 would be silently coerced.
my_data$col1[which(my_data$col1 == "")] <- "-999"
# Tally every value (exclude = NULL also counts NA) and keep the names of the
# three most frequent ones. head() never reads past the end, so having fewer
# than three categories cannot introduce NA names the way [1:3] does.
top_3 <- utils::head(
  names(sort(table(my_data$col1, exclude = NULL), decreasing = TRUE)),
  3
)
# Everything outside the top three is lumped into "other".
my_data$col1 <- as.factor(
  ifelse(my_data$col1 %in% top_3, my_data$col1, "other")
)
## in function form this does not work
# my_fn: trims the column `col_name` of `df`, relabels blank entries, keeps the
# three most frequent values and replaces every other value with "other".
#
# Why it "does not work": all assignments below modify the function's local
# copy of `df`, and `df` is never returned. The last expression is an
# assignment, so the call returns that assigned value invisibly (nothing is
# printed) and the caller's data frame is left untouched.
my_fn <- function(df, col_name) {
# Work on character values with leading/trailing whitespace removed.
df[[col_name]] <- as.character(df[[col_name]]);
df[[col_name]] <- trimws(df[[col_name]]);
# The column is character here, so -999 is stored as the string "-999".
df[[col_name]][which(df[[col_name]]=="")] <- -999;
# exclude = NULL makes table() count NA as well; after sorting by decreasing
# count, the names of the first three entries are taken.
top_3 <- rownames(sort(table(df[[col_name]],exclude = NULL),decreasing = T)[1:3])
# Values outside the top three become "other".
df[[col_name]] <- ifelse(df[[col_name]] %in% top_3, df[[col_name]], "other");
# Final statement is an assignment: its value is returned invisibly, not `df`.
df[[col_name]] <- as.factor(df[[col_name]])
}
# The result is not assigned to anything, so my_data is unchanged by this call.
my_fn(my_data,"col1")
答案 0 :(得分:3)
在函数末尾添加 return(df)。
R 按值传递参数:函数内部对 df 的修改只作用于局部副本,因此必须把修改后的 df 返回,并在调用处重新赋值,例如 my_data <- my_fn(my_data, "col1")。
# Keep the most frequent values of one column and lump the rest into "other".
#
# Arguments:
#   df        data frame containing the column to recode.
#   col_name  name of the column to recode, as used with `df[[col_name]]`.
#   n         how many of the most frequent values to keep (default 3).
#
# Returns a copy of `df` in which `col_name` is a factor: values are trimmed of
# surrounding whitespace, blank entries are relabelled "-999", the `n` most
# frequent values are kept and all remaining values become "other". The
# caller's data frame is not modified; assign the result back to keep it.
my_fn <- function(df, col_name, n = 3) {
  values <- trimws(as.character(df[[col_name]]))
  # Blank entries get an explicit sentinel label. It is written as a string
  # because `values` is character; a numeric -999 would be silently coerced.
  values[which(values == "")] <- "-999"
  # exclude = NULL makes table() count NA as a category too.
  counts <- sort(table(values, exclude = NULL), decreasing = TRUE)
  # head() never reads past the end, so a column with fewer than `n`
  # categories cannot introduce NA names the way `[1:3]` indexing does.
  top_levels <- utils::head(names(counts), n)
  df[[col_name]] <- as.factor(ifelse(values %in% top_levels, values, "other"))
  return(df)
}
如果您只想返回修改后的那一列,请改为在函数末尾添加 return(df[[col_name]]),并把返回值赋给原数据框中对应的列:
# Recode one column so that only its most frequent values are kept.
#
# Arguments:
#   df        data frame containing the column to recode.
#   col_name  name of the column to recode, as used with `df[[col_name]]`.
#   n         how many of the most frequent values to keep (default 3).
#
# Returns only the recoded column, as a factor: values are trimmed of
# surrounding whitespace, blank entries are relabelled "-999", the `n` most
# frequent values are kept and all remaining values become "other". `df`
# itself is not modified; assign the result to the column to store it.
my_fn <- function(df, col_name, n = 3) {
  values <- trimws(as.character(df[[col_name]]))
  # Blank entries get an explicit sentinel label. It is written as a string
  # because `values` is character; a numeric -999 would be silently coerced.
  values[which(values == "")] <- "-999"
  # exclude = NULL makes table() count NA as a category too.
  counts <- sort(table(values, exclude = NULL), decreasing = TRUE)
  # head() never reads past the end, so a column with fewer than `n`
  # categories cannot introduce NA names the way `[1:3]` indexing does.
  top_levels <- utils::head(names(counts), n)
  df[[col_name]] <- as.factor(ifelse(values %in% top_levels, values, "other"))
  return(df[[col_name]])
}
# Store the recoded column back into the data frame.
my_data$col1 <- my_fn(my_data, "col1")