我试图使用glm()
函数基于内置的虹膜数据集制作二元分类模型。
首先,过滤数据集,以便在二进制分类中仅使用Species Versicolor和Virginica。我在代码中收到警告消息。
有没有办法修复代码以摆脱这些警告消息?警告邮件位于ROC.train
代码和ggplot()
代码下。代码如下:
>library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557
JackStat推荐:
> library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos) eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
Error: unexpected symbol in:
"result <- data.frame(true_pos, false_pos, model.name)
result %>% dplyr::arrange(false_pos, true_pos) eps"
> }
Error: unexpected '}' in "}"
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557
代码仍然显示错误消息。 我把这些代码放在正确的位置吗?
eps <- 1e-15; test_pred = pmax(pmin(test_pred, 1 - eps), eps)
我编辑了代码以排除0和1。但是,我仍然会遇到错误。我现在可以做些什么来修复错误?
> library(dplyr)
> library(forcats)
> library(ggplot2)
> iris.small <- datasets::iris %>%
+ dplyr::filter(Species != "setosa") %>%
+ dplyr::mutate(Species = fct_drop(Species)) %>%
+ dplyr::group_by(Species) %>%
+ dplyr::summarize(avg_sl = mean(Sepal.Length),
+ avg_sw = mean(Sepal.Width),
+ avg_pl = mean(Petal.Length),
+ avg_pw = mean(Petal.Width))
> set.seed(2016-11-14)
> iris.big <- data_frame(Species = as.factor(c(rep("versicolor", 500), rep("virginica", 500))),
+ sl = c(rnorm(500, iris.small$avg_sl[1]), rnorm(500, iris.small$avg_sl[2])),
+ sw = c(rnorm(500, iris.small$avg_sw[1]), rnorm(500, iris.small$avg_sw[2])),
+ pl = c(rnorm(500, iris.small$avg_pl[1]), rnorm(500, iris.small$avg_pl[2])),
+ pw = c(rnorm(500, iris.small$avg_pw[1]), rnorm(500, iris.small$avg_pw[2])))
> iris.big$sl[iris.big$sl==0] <-0.0000000001
> iris.big$sw[iris.big$sw==0] <-0.0000000001
> iris.big$pl[iris.big$pl==0] <-0.0000000001
> iris.big$pw[iris.big$pw==0] <-0.0000000001
> iris.big$sl[iris.big$sl==1] <-0.99999999
> iris.big$sw[iris.big$sw==1] <-0.99999999
> iris.big$pl[iris.big$pl==1] <-0.99999999
> iris.big$pw[iris.big$pw==1] <-0.99999999
> train_fraction <- 0.5 #fraction of data for training purposes
> n_obs <- nrow(iris.big)
> train_size <- floor(train_fraction * nrow(iris.big))
> train_indices <- sample(n_obs, size=train_size, replace=TRUE) #sample(x, size, replace = FALSE, prob = NULL)x Either a (numeric, complex, character or logical) vector of more than one element from which to choose, or a positive integer.size non-negative integer giving the number of items to choose. replace Should sampling be with replacement? prob A vector of probability weights for obtaining the elements of the vector being sampled
> train_data <- iris.big[train_indices, ]
> test_data <- iris.big[-train_indices, ]
> glm.out.train <- glm(Species ~ sl + sw + pl + pw, data=train_data, family = "binomial")
> test_pred <- predict(glm.out.train, test_data, type='response')
> calc_ROC <- function(probabilities, known_truth, model.name=NULL)
+ {
+ outcome <- as.numeric(factor(known_truth))-1
+ pos <- sum(outcome) # total known positives
+ neg <- sum(1-outcome) # total known negatives
+ pos_probs <- outcome*probabilities # probabilities for known positives
+ neg_probs <- (1-outcome)*probabilities # probabilities for known negatives
+ true_pos <- sapply(probabilities,
+ function(x) sum(pos_probs>=x)/pos) # true pos. rate
+ false_pos <- sapply(probabilities,
+ function(x) sum(neg_probs>=x)/neg)
+ if (is.null(model.name))
+ result <- data.frame(true_pos, false_pos)
+ else
+ result <- data.frame(true_pos, false_pos, model.name)
+ result %>% dplyr::arrange(false_pos, true_pos)
+ }
> ROC.train <- calc_ROC(probabilities=test_pred, known_truth=train_data$Species, model.name="train")
Warning messages:
1: In outcome * probabilities :
longer object length is not a multiple of shorter object length
2: In (1 - outcome) * probabilities :
longer object length is not a multiple of shorter object length
> ROC.test <- calc_ROC(probabilities=test_pred, known_truth=test_data$Species, model.name="test")
> ROCs <- rbind(ROC.train, ROC.test)
> ggplot(ROCs, aes(x=false_pos, y=true_pos, color=model.name)) + geom_line() + xlim(0, 0.25)
Warning message:
Removed 745 rows containing missing values (geom_path).
> ROCs %>% dplyr::group_by(model.name) %>% dplyr::mutate(delta=false_pos-lag(false_pos)) %>% dplyr::summarize(AUC=sum(delta*true_pos, na.rm=T)) %>% dplyr::arrange(desc(AUC))
# A tibble: 2 × 2
model.name AUC
<fctr> <dbl>
1 test 0.8700770
2 train 0.7329557