我使用R中的StMoMo软件包拟合了人类死亡率数据库中的美国死亡率数据(在注册了免费帐户https://www.mortality.org/cgi-bin/hmd/hmd_download.php后从此处下载了数据)。模型拟合度不是很好。我已经研究了软件包的源代码,但是并没有真正找到任何有用的信息来帮助我提高模型的准确性。 (我注意到它正在使用广义非线性模型拟合模型参数)。
因此,我要求提供任何可以帮助我提高准确性的建议。
我的代码如下:
library("StMoMo")
label="U.S.A."
变量“ mx”包含中央死亡率的数据;数据除以性别,年龄和年份。可以提取队列效果。
mx <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Mx_1x1.txt", skip = 2, header = TRUE,
na.strings = "."), TRUE)
mx = mx[mx$Year<=2014 & mx$Year>=1977,]
变量“流行”包含人口数据(处于危险中的接触数)除以性别,年龄和年份。请注意,该路径应指向从HMD下载的USA文件夹中的USA死亡率数据。另请注意,该年份仅限于1977-2014年。尽管HMD拥有1933-2016年的数据。
pop <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Exposures_1x1.txt", skip = 2, header = TRUE,
na.strings = "."), TRUE)
pop = pop[pop$Year<=2014 & pop$Year>=1977,]
obj <- list(type = "mortality", label = label, lambda = 0)
obj$year <- sort(unique(mx[, 1]))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
mnames <- names(mx)[-c(1, 2)]
n.mort <- length(mnames)
obj$rate <- obj$pop <- list()
for (i in 1:n.mort) {
obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
obj$rate[[i]][obj$rate[[i]] < 0] <- NA
obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
obj$pop[[i]][obj$pop[[i]] < 0] <- NA
dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age,
obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m]))
obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2]
USATrain = (structure(obj, class = "demogdata"))
这里,我们阅读了完整的数据以与预测进行比较。
label="U.S.A."
mx <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Mx_1x1.txt", skip = 2, header = TRUE,
na.strings = "."), TRUE)
pop <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Exposures_1x1.txt", skip = 2, header = TRUE,
na.strings = "."), TRUE)
obj <- list(type = "mortality", label = label, lambda = 0)
obj$year <- sort(unique(mx[, 1]))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
mnames <- names(mx)[-c(1, 2)]
n.mort <- length(mnames)
obj$rate <- obj$pop <- list()
for (i in 1:n.mort) {
obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
obj$rate[[i]][obj$rate[[i]] < 0] <- NA
obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
obj$pop[[i]][obj$pop[[i]] < 0] <- NA
dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age,
obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m]))
obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2]
FullData = (structure(obj, class = "demogdata"))
设置约束函数。我适合30-95岁。
usaMale = StMoMoData(USATrain, series = "male")
usaMaleStMoMo <- central2initial(usaMale)
ages.fit <- 30:95
LC = lc(link = "logit")
APC = apc(link = "logit")
M7 = m7()
M6 = m6()
M8 = m8(xc = 89)
CBD = cbd()
RH <- rh(link = "logit", cohortAgeFun = "1")
f2 <- function(x, ages) mean(ages) - x
constPlat <- function(ax, bx, kt, b0x, gc, wxt, ages){
nYears <- dim(wxt)[2]
x <- ages
t <- 1:nYears
c <- (1 - tail(ages, 1)):(nYears - ages[1])
xbar <- mean(x)
phiReg <- lm(gc ~ 1 + c + I(c ^ 2), na.action = na.omit)
phi <- coef(phiReg)
gc <- gc - phi[1] - phi[2] * c - phi[3] * c ^ 2
kt[2, ] <- kt[2, ] + 2 * phi[3] * t
kt[1, ] <- kt[1, ] + phi[2] * t + phi[3] * (t ^ 2 - 2 * xbar * t)
ax <- ax + phi[1] - phi[2] * x + phi[3] * x ^ 2
ci <- rowMeans(kt, na.rm = TRUE)
ax <- ax + ci[1] + ci[2] * (xbar - x)
kt[1, ] <- kt[1, ] - ci[1]
kt[2, ] <- kt[2, ] - ci[2]
list(ax = ax, bx = bx, kt = kt, b0x = b0x, gc = gc)
}
PLAT <- StMoMo(link = "logit", staticAgeFun = TRUE, periodAgeFun = c("1", f2), cohortAgeFun = "1", constFun = constPlat)
wxt <- genWeightMat(ages = ages.fit, years = usaMaleStMoMo$years, clip = 3)
拟合模型。
LCfit <- fit(LC, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
APCfit <- fit(APC, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
CBDfit <- fit(CBD, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
M7fit <- fit(M7, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
M6fit <- fit(M6, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
M8fit <- fit(M8, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
PLATfit <- fit(PLAT, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
RHfit <- fit(RH, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt, start.ax = LCfit$ax, start.bx = LCfit$bx, start.kt = LCfit$kt)
预测50年
LCfor <- forecast(LCfit, h = 50)
CBDfor <- forecast(CBDfit, h = 50)
APCfor <- forecast(APCfit, h = 50, gc.order = c(1, 1, 0))
RHfor <- forecast(RHfit, h = 50, gc.order = c(1, 1, 0))
M7for <- forecast(M7fit, h = 50, gc.order = c(2, 0, 0))
M6for <- forecast(M6fit, h = 50, gc.order = c(2, 0, 0))
M8for <- forecast(M8fit, h = 50, gc.order = c(2, 0, 0))
PLATfor <- forecast(PLATfit, h = 50, gc.order = c(2, 0, 0))
掌握2015年美国男性的实际死亡率和所有模型的预测死亡率。我正在将2015年的预计汇率与2015年的实际汇率进行比较。
actual2015 = FullData$rate$male[ages.fit+1,c("2015")]
LC2015 = LCfor$rates[,1]
CBD2015 = CBDfor$rates[,1]
APC2015 = APCfor$rates[,1]
RH2015 = RHfor$rates[,1]
M72015 = M7for$rates[,1]
M62015 = M6for$rates[,1]
M82015 = M8for$rates[,1]
PLAT2015 = PLATfor$rates[,1]
合并结果。我承认下面的代码很丑陋,均方误差可能要好于平均百分比误差。但是,我认为误差百分比偏离下图所示的事实表明该模型不太适合。此外,残差图还表明数据中存在无法解释的结构。
aicbic = matrix(ncol = 8, nrow = 4)
colnames(aicbic) = c("LC", "CBD", "APC", "RH", "M7", "PLAT", "M6", "M8")
rownames(aicbic) = c("Number Parameters", "AIC", "BIC", "Average Pct Error for t+1")
aicbic[1,1]=LCfit$npar
aicbic[1,2]=CBDfit$npar
aicbic[1,3]=APCfit$npar
aicbic[1,4]=RHfit$npar
aicbic[1,5]=M7fit$npar
aicbic[1,6]=PLATfit$npar
aicbic[1,7]=M6fit$npar
aicbic[1,8]=M8fit$npar
aicbic[2,1]=AIC(LCfit)
aicbic[2,2]=AIC(CBDfit)
aicbic[2,3]=AIC(APCfit)
aicbic[2,4]=AIC(RHfit)
aicbic[2,5]=AIC(M7fit)
aicbic[2,6]=AIC(PLATfit)
aicbic[2,7]=AIC(M6fit)
aicbic[2,8]=AIC(M8fit)
aicbic[3,1]=BIC(LCfit)
aicbic[3,2]=BIC(CBDfit)
aicbic[3,3]=BIC(APCfit)
aicbic[3,4]=BIC(RHfit)
aicbic[3,5]=BIC(M7fit)
aicbic[3,6]=BIC(PLATfit)
aicbic[3,7]=BIC(M6fit)
aicbic[3,8]=BIC(M8fit)
aicbic[4,1]=mean((LC2015 - actual2015)/actual2015)*100
aicbic[4,2]=mean((CBD2015 - actual2015)/actual2015)*100
aicbic[4,3]=mean((APC2015 - actual2015)/actual2015)*100
aicbic[4,4]=mean((RH2015 - actual2015)/actual2015)*100
aicbic[4,5]=mean((M72015 - actual2015)/actual2015)*100
aicbic[4,6]=mean((PLAT2015 - actual2015)/actual2015)*100
aicbic[4,7]=mean((M62015 - actual2015)/actual2015)*100
aicbic[4,8]=mean((M82015 - actual2015)/actual2015)*100
View(aicbic)
剩余图。
残差图的代码。 CBDRes =残差(CBDfit) plot(CBDRes,type =“ colormap”,main =“ CBD Residuals”)
LCRes = residuals(LCfit)
plot(LCRes, type = "colourmap", main = "Lee-Carter Residuals")
APCRes = residuals(APCfit)
plot(APCRes, type = "colourmap", main = "Age-Period-Cohort Residuals")
RHRes = residuals(RHfit)
plot(RHRes, type = "colourmap", main = "Renshaw and Haberman Residuals")
M7Res = residuals(M7fit)
plot(M7Res, type = "colourmap", main = "M7 Residuals")
PLATRes = residuals(PLATfit)
plot(PLATRes, type = "colourmap", main = "Plat Residuals")
M6Res = residuals(M6fit)
plot(M6Res, type = "colourmap", main = "M6 Residuals")
M8Res = residuals(M8fit)
plot(M8Res, type = "colourmap", main = "M8 Residuals")