测试数据含有新的因子水平时,logit 预测会报错,而 C5 预测不会出错

时间:2016-02-07 12:36:03

标签: r logistic-regression predict

我不清楚这两种模型分别如何处理因子水平,但 logit 模型无法预测,并给出一条提示存在新因子水平的错误信息;而使用 C5 进行预测时一切正常。我是从同一个数据框拆分出训练集和测试集的,并且两者的因子水平相互一致。有人能解释这种行为并给出解决办法吗?据我所知,测试集中出现的新水平没有对应的系数可用。但我认为把它们设为 NULL 应该没问题。

下面是一些代码,我用它让保留集(hold)与训练集(train)的因子水平保持一致。tr 是将被拆分为训练集和测试集的数据集。

# Sample data set `tr` (dput()-style literal): 40 rows, 10 columns.
# `production_year` is an integer vector; the other nine columns are factors
# stored as integer codes plus a `.Label` vector of level names.
# `Category1` (levels "0"/"1") is used below as the response of the model.
tr=structure(
        list(
            production_year = c(
                2007L, 2010L, 2010L, 2008L,
                2007L, 2008L, 2008L, 2008L, 2007L, 2011L, 2009L, 2009L, 2009L,
                2008L, 2007L, 2007L, 2010L, 2009L, 2008L, 2008L, 2010L, 2010L,
                2007L, 2010L, 2009L, 2008L, 2007L, 2007L, 2008L, 2007L, 2010L,
                2011L, 2010L, 2007L, 2009L, 2009L, 2008L, 2008L, 2010L, 2011L
            ), movie_sequel = structure(
                c(
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            ), creative_type = structure(
                c(
                    1L,
                    4L, 1L, 4L, 5L, 1L, 1L, 6L, 2L, 1L, 6L, 1L, 1L, 1L, 1L, 1L, 1L,
                    1L, 8L, 1L, 7L, 1L, 1L, 3L, 1L, 1L, 2L, 4L, 4L, 1L, 1L, 4L, 5L,
                    5L, 1L, 4L, 1L, 1L, 1L, 1L
                ), .Label = c(
                    "Contemporary Fiction",
                    "Dramatization", "Factual", "Fantasy", "Historical Fiction",
                    "Kids Fiction", "Science Fiction", "Super Hero"
                ), class = "factor"
            ),
            source = structure(
                c(
                    6L, 2L, 6L, 7L, 2L, 6L, 6L, 6L, 4L,
                    6L, 2L, 7L, 6L, 6L, 6L, 3L, 6L, 6L, 1L, 2L, 6L, 5L, 6L, 5L,
                    5L, 6L, 4L, 2L, 2L, 6L, 6L, 2L, 7L, 4L, 6L, 5L, 6L, 2L, 6L,
                    6L
                ), .Label = c(
                    "Based on Comic/Graphic Novel", "Based on Fiction Book/Short Story",
                    "Based on Folk Tale/Legend/Fairytale", "Based on Real Life Events",
                    "Based on TV", "Original Screenplay", "Remake"
                ), class = "factor"
            ),
            production_method = structure(
                c(
                    3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    2L, 3L, 3L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 3L,
                    3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
                    3L, 3L, 3L
                ), .Label = c(
                    "Animation/Live Action", "Digital Animation",
                    "Live Action", "Stop-Motion Animation"
                ), class = "factor"
            ),
            genre = structure(
                c(
                    3L, 1L, 4L, 5L, 1L, 4L, 3L, 3L, 4L, 5L,
                    2L, 7L, 6L, 5L, 7L, 3L, 3L, 7L, 1L, 7L, 7L, 3L, 4L, 3L, 3L,
                    6L, 4L, 2L, 1L, 2L, 6L, 4L, 7L, 1L, 4L, 2L, 3L, 7L, 7L, 5L
                ), .Label = c(
                    "Action", "Adventure", "Comedy", "Drama", "Horror",
                    "Romantic Comedy", "Thriller/Suspense"
                ), class = "factor"
            ),
            language = structure(
                c(
                    2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
                    2L
                ), .Label = c("Danish", "English"), class = "factor"
            ),
            movie_board_rating_display_name = structure(
                c(
                    3L, 3L, 3L,
                    2L, 2L, 3L, 2L, 1L, 3L, 2L, 1L, 3L, 2L, 3L, 3L, 2L, 3L, 3L,
                    3L, 3L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 1L, 2L, 3L, 2L, 2L, 3L,
                    2L, 3L, 1L, 2L, 3L, 3L, 2L
                ), .Label = c("PG", "PG-13", "R"), class = "factor"
            ), movie_release_pattern_display_name = structure(
                c(
                    4L,
                    4L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L, 3L, 4L,
                    3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 1L, 4L,
                    4L, 4L, 2L, 3L, 4L, 4L, 4L, 3L, 4L
                ), .Label = c("Exclusive",
                              "Expands Wide", "Limited", "Wide"), class = "factor"
            ), Category1 = structure(
                c(
                    1L,
                    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
                    1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
                    2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
                ), .Label = c("0", "1"), class = "factor"
            )
        ), .Names = c(
            "production_year",
            "movie_sequel", "creative_type", "source", "production_method",
            "genre", "language", "movie_board_rating_display_name", "movie_release_pattern_display_name",
            "Category1"
        ), row.names = c(
            506L, 474L, 1011L, 569L, 737L, 1124L,
            602L, 717L, 747L, 977L, 284L, 620L, 100L, 301L, 514L, 865L, 828L,
            283L, 921L, 839L, 15L, 937L, 931L, 201L, 273L, 507L, 1180L, 689L,
            276L, 649L, 603L, 22L, 555L, 974L, 552L, 500L, 216L, 312L, 796L,
            682L
        ), class = "data.frame"
    )

    # Split the 40-row sample into a training part and a hold-out part.
    train <- tr[1:25, ]  # training data
    hold <- tr[26:40, ]  # test data

    # Row subsetting keeps every factor level of `tr`, including levels that no
    # longer occur in the training rows. glm() only estimates coefficients for
    # levels that are actually present, so drop the unused ones explicitly to
    # make levels(train[[col]]) reflect what the model can handle.
    train <- droplevels(train)

    # Re-encode each hold-out factor against the training levels. Values whose
    # level never occurs in `train` become NA, so predict() yields NA for those
    # rows instead of stopping with "factor ... has new levels".
    for (col in names(train)) {
      if (is.factor(train[[col]])) {
        hold[[col]] <- factor(hold[[col]], levels = levels(train[[col]]))
      }
    }

    m.glm <- glm(Category1 ~ ., data = train, family = "binomial")
    labels <- hold$Category1  # keep the true classes for later evaluation
    hold$Category1 <- NULL
    # Default type is "link" (log-odds); rows with an unseen level come back NA
    # because predict() uses na.action = na.pass by default.
    p <- predict(m.glm, hold)

全部因子水平(完整数据集中的一行):

structure(list(production_year = 2011L, movie_sequel = structure(1L, .Label = c("0", 
"1"), class = "factor"), creative_type = structure(5L, .Label = c("Contemporary Fiction", 
"Dramatization", "Factual", "Fantasy", "Historical Fiction", 
"Kids Fiction", "Multiple Creative Types", "Science Fiction", 
"Super Hero"), class = "factor"), source = structure(14L, .Label = c("Based on Comic/Graphic Novel", 
"Based on Factual Book/Article", "Based on Fiction Book/Short Story", 
"Based on Folk Tale/Legend/Fairytale", "Based on Game", "Based on Musical or Opera", 
"Based on Play", "Based on Real Life Events", "Based on Short Film", 
"Based on Theme Park Ride", "Based on Toy", "Based on TV", "Compilation", 
"Original Screenplay", "Remake", "Spin-Off"), class = "factor"), 
    production_method = structure(4L, .Label = c("Animation/Live Action", 
    "Digital Animation", "Hand Animation", "Live Action", "Multiple Production Methods", 
    "Stop-Motion Animation"), class = "factor"), genre = structure(13L, .Label = c("Action", 
    "Adventure", "Black Comedy", "Comedy", "Concert/Performance", 
    "Documentary", "Drama", "Horror", "Multiple Genres", "Musical", 
    "Romantic Comedy", "Thriller/Suspense", "Western"), class = "factor"), 
    language = structure(3L, .Label = c("Arabic", "Danish", "English", 
    "Farsi", "French", "German", "Hebrew", "Hindi", "Italian", 
    "Japanese", "Norwegian", "Polish", "Portuguese", "Silent", 
    "Spanish", "Swedish"), class = "factor"), movie_board_rating_display_name = structure(6L, .Label = c("G", 
    "NC-17", "Not Rated", "PG", "PG-13", "R"), class = "factor"), 
    movie_release_pattern_display_name = structure(7L, .Label = c("Exclusive", 
    "Expands Wide", "IMAX", "Limited", "Oscar Qualifying Run", 
    "Special Engagement", "Wide"), class = "factor"), Category1 = structure(1L, .Label = c("0", 
    "1"), class = "factor")), .Names = c("production_year", "movie_sequel", 
"creative_type", "source", "production_method", "genre", "language", 
"movie_board_rating_display_name", "movie_release_pattern_display_name", 
"Category1"), row.names = 304L, class = "data.frame")

1 个答案:

答案 0 :(得分:1)

在我看来,您必须排除那些包含拟合模型时未出现过的因子水平的行。

# Release patterns that occur only in the hold-out rows, never in training.
unseen_patterns <- c("Exclusive", "Expands Wide")
# Keep just the hold-out rows whose release pattern the model has seen.
seen_rows <- !(hold$movie_release_pattern_display_name %in% unseen_patterns)
predict(m.glm, hold[seen_rows, ])