Question

我希望使用mnlogit包来拟合模型，并使用它来进行样本外预测。我使用mnlogit附带的钓鱼数据设置了一个玩具示例：

library(data.table)
library(mnlogit)

# Load the Fish data set shipped with mnlogit and convert it to a data.table.
data(Fish, package="mnlogit")
fish_dt <- data.table(Fish)
rm(Fish)

# Draw half of the choice-situation ids (chid) at random for training.
# The seed is fixed so the split is reproducible.
unique_id <- unique(fish_dt[, chid])
set.seed(54321)
train_id <- sample(unique_id, size=0.5*length(unique_id))

# Split on chid: rows whose chid is in train_id go to `train`,
# all remaining rows go to `test`.
setkey(fish_dt, chid, alt)
train <- fish_dt[J(train_id)]
test <- fish_dt[!J(train_id)]
setkey(train, chid, alt)
setkey(test, chid, alt)
stopifnot(nrow(train) + nrow(test) == nrow(fish_dt))  # Partition fish_dt

# Fit the model on the training half only.
mnlogit_formula <- mode ~ catch | income
mnlogit_model <- mnlogit(mnlogit_formula, data=train, choiceVar="alt")

# In-sample predictions: with probability=FALSE, predict() returns the
# predicted alternative for each choice situation rather than probabilities.
train_predictions <- predict(mnlogit_model, probability=FALSE)
stopifnot(length(train_predictions) == length(unique(train[, chid])))  # One per choice
# Compare against the alternative actually chosen (rows where mode is TRUE).
mean(subset(train, mode)[, alt] == train_predictions)  # Around 0.42 accuracy in sample

## Would like to do the same out of sample, i.e. with data table "test"
## NOTE: the two calls below are kept as-is because they reproduce the
## reported error; choiceVar is not passed to predict() here.
test_predictions <- predict(mnlogit_model, newdata=test, probability=FALSE)  # Error
test_predictions <- predict(mnlogit_model, newdata=as.data.frame(test), probability=FALSE)  # Same error

我得到的错误是：

Error in `colnames<-`(`*tmp*`, value = list(chid = c(1L, 2L, 3L, 4L, 5L, : length of 'dimnames' [2] not equal to array extent

我在Ubuntu 14.04.2 LTS上运行R版本3.0.2（2013-09-25）。

是我错误地使用了这个包，还是这是一个 bug？

编辑：参见评论。我尝试从 "test" 数据表中删除 "mode" 列，但这给了我一个 "newdata 必须具有与训练数据相同的列" 的错误：

# Remove the response column from `test` (`:=` modifies `test` in place),
# then retry the out-of-sample prediction without it.
test[, mode := NULL]
mnlogit_predictions <- predict(mnlogit_model, newdata=test, probability=FALSE)  # Error

编辑：这是我使用mlogit包的示例（类似但对于大问题可能会慢得多）：

library(data.table)
library(mlogit)

# The Fish data set is taken from the mnlogit package so that both
# examples run on the same data.
data(Fish, package="mnlogit")
fish_dt <- data.table(Fish)
rm(Fish)

# Draw half of the choice-situation ids (chid) at random for training.
# The seed is fixed so the split is reproducible.
unique_id <- unique(fish_dt[, chid])
set.seed(54321)
train_id <- sample(unique_id, size=0.5*length(unique_id))

# Split on chid: rows whose chid is in train_id go to `train`,
# all remaining rows go to `test`.
setkey(fish_dt, chid, alt)
train <- fish_dt[J(train_id)]
test <- fish_dt[!J(train_id)]
setkey(train, chid, alt)
setkey(test, chid, alt)
stopifnot(nrow(train) + nrow(test) == nrow(fish_dt))  # Partition fish_dt

# Wrap both halves with mlogit.data(), declaring the choice indicator
# ("mode"), the long shape, and the chid / alt index columns.
train_mlogit <- mlogit.data(train, choice="mode", shape="long",
                            chid.var="chid", alt.var="alt")
test_mlogit <- mlogit.data(test, choice="mode", shape="long",
                           chid.var="chid", alt.var="alt")

# Fit on the training half only.
model_formula <- mode ~ catch | income
mlogit_model <- mlogit(model_formula, data=train_mlogit)

## In-sample performance
# The prediction result is used below as a matrix with one row per choice
# situation and one named column per alternative.
train_predictions <- predict(mlogit_model, newdata=train_mlogit)
stopifnot(nrow(train_predictions) == length(unique(train[, chid])))  # One per choice
# For each row, keep the name of the column holding the largest value.
train_predictions <- colnames(train_predictions)[apply(train_predictions, 1, which.max)]
# Compare against the alternative actually chosen (rows where mode is TRUE).
mean(subset(train, mode)[, alt] == train_predictions)  # Around 0.42 accuracy in sample

## Out-of-sample performance
# Same steps as above, applied to the held-out half.
test_predictions <- predict(mlogit_model, newdata=test_mlogit)
test_predictions <- colnames(test_predictions)[apply(test_predictions, 1, which.max)]
mean(subset(test, mode)[, alt] == test_predictions)  # Around 0.41 accuracy out of sample

我想做到这一点，但用mnlogit而不是mlogit。

Answer 1

predict 可以用于 mnlogit 对象。但是，你的 'test' 对象是一个数据表，而不是 mlogit.data 对象，所以你需要在调用 predict 时传入 choiceVar。

# Pass choiceVar so predict() knows which column of newdata identifies the
# alternatives. NOTE(review): the first call leaves `probability` at its
# default, so it presumably returns probabilities rather than predicted
# alternatives -- confirm against the mnlogit documentation.
test_predictions <- predict(mnlogit_model, newdata=test, choiceVar="alt")  # works
test_predictions <- predict(mnlogit_model, newdata=as.data.frame(test), probability=FALSE, choiceVar="alt")  # this also works

由于

使用newdata调用predict时出现mnlogit错误

1 个答案: