r-与人类死亡率数据库上的StMoMo软件包匹配的模型较差

时间:2018-12-12 17:25:30

标签: r data-science

我使用R中的StMoMo软件包拟合了人类死亡率数据库中的美国死亡率数据(在注册了免费帐户https://www.mortality.org/cgi-bin/hmd/hmd_download.php后从此处下载了数据)。模型拟合度不是很好。我已经研究了软件包的源代码,但是并没有真正找到任何有用的信息来帮助我提高模型的准确性。 (我注意到它正在使用广义非线性模型拟合模型参数)。

因此,我要求提供任何可以帮助我提高准确性的建议。

我的代码如下:

library("StMoMo")

label="U.S.A."

变量“ mx”包含中央死亡率的数据;数据除以性别,年龄和年份。可以提取队列效果。

mx <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Mx_1x1.txt", skip = 2, header = TRUE, 
    na.strings = "."), TRUE)
mx = mx[mx$Year<=2014 & mx$Year>=1977,]

变量“流行”包含人口数据(处于危险中的接触数)除以性别,年龄和年份。请注意,该路径应指向从HMD下载的USA文件夹中的USA死亡率数据。另请注意,该年份仅限于1977-2014年。尽管HMD拥有1933-2016年的数据。

pop <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Exposures_1x1.txt", skip = 2, header = TRUE, 
    na.strings = "."), TRUE)
pop = pop[pop$Year<=2014 & pop$Year>=1977,]
obj <- list(type = "mortality", label = label, lambda = 0)
obj$year <- sort(unique(mx[, 1]))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
mnames <- names(mx)[-c(1, 2)]
n.mort <- length(mnames)
obj$rate <- obj$pop <- list()
for (i in 1:n.mort) {
    obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
    obj$rate[[i]][obj$rate[[i]] < 0] <- NA
    obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
    obj$pop[[i]][obj$pop[[i]] < 0] <- NA
    dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age, 
        obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m])) 
    obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2]
USATrain = (structure(obj, class = "demogdata"))

这里,我们阅读了完整的数据以与预测进行比较。

label="U.S.A."

mx <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Mx_1x1.txt", skip = 2, header = TRUE, 
    na.strings = "."), TRUE)

pop <- try(utils::read.table("C:/Users/myself/Downloads/USA/STATS/Exposures_1x1.txt", skip = 2, header = TRUE, 
    na.strings = "."), TRUE)
obj <- list(type = "mortality", label = label, lambda = 0)
obj$year <- sort(unique(mx[, 1]))
n <- length(obj$year)
m <- length(unique(mx[, 2]))
obj$age <- mx[1:m, 2]
mnames <- names(mx)[-c(1, 2)]
n.mort <- length(mnames)
obj$rate <- obj$pop <- list()
for (i in 1:n.mort) {
    obj$rate[[i]] <- matrix(mx[, i + 2], nrow = m, ncol = n)
    obj$rate[[i]][obj$rate[[i]] < 0] <- NA
    obj$pop[[i]] <- matrix(pop[, i + 2], nrow = m, ncol = n)
    obj$pop[[i]][obj$pop[[i]] < 0] <- NA
    dimnames(obj$rate[[i]]) <- dimnames(obj$pop[[i]]) <- list(obj$age, 
        obj$year)
}
names(obj$pop) = names(obj$rate) <- tolower(mnames)
obj$age <- as.numeric(as.character(obj$age))
if (is.na(obj$age[m])) 
    obj$age[m] <- 2 * obj$age[m - 1] - obj$age[m - 2]
FullData = (structure(obj, class = "demogdata"))

设置约束函数。我适合30-95岁。

usaMale = StMoMoData(USATrain, series = "male")
usaMaleStMoMo <- central2initial(usaMale)
ages.fit <- 30:95
LC = lc(link = "logit")
APC = apc(link = "logit")
M7 = m7()
M6 = m6()
M8 = m8(xc = 89)
CBD = cbd()
RH <- rh(link = "logit", cohortAgeFun = "1")

f2 <- function(x, ages) mean(ages) - x

constPlat <- function(ax, bx, kt, b0x, gc, wxt, ages){
 nYears <- dim(wxt)[2]
 x <- ages
 t <- 1:nYears
 c <- (1 - tail(ages, 1)):(nYears - ages[1])
 xbar <- mean(x)
 phiReg <- lm(gc ~ 1 + c + I(c ^ 2), na.action = na.omit)
 phi <- coef(phiReg)
 gc <- gc - phi[1] - phi[2] * c - phi[3] * c ^ 2
 kt[2, ] <- kt[2, ] + 2 * phi[3] * t
 kt[1, ] <- kt[1, ] + phi[2] * t + phi[3] * (t ^ 2 - 2 * xbar * t)
 ax <- ax + phi[1] - phi[2] * x + phi[3] * x ^ 2
 ci <- rowMeans(kt, na.rm = TRUE)
 ax <- ax + ci[1] + ci[2] * (xbar - x)
 kt[1, ] <- kt[1, ] - ci[1]
 kt[2, ] <- kt[2, ] - ci[2]
 list(ax = ax, bx = bx, kt = kt, b0x = b0x, gc = gc)
 }



PLAT <- StMoMo(link = "logit", staticAgeFun = TRUE, periodAgeFun = c("1", f2), cohortAgeFun = "1", constFun = constPlat)

wxt <- genWeightMat(ages = ages.fit, years = usaMaleStMoMo$years, clip = 3)

拟合模型。

LCfit <- fit(LC, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
APCfit <- fit(APC, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
CBDfit <- fit(CBD, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
M7fit <- fit(M7, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
M6fit <- fit(M6, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
M8fit <- fit(M8, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
PLATfit <- fit(PLAT, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt)
RHfit <- fit(RH, data = usaMaleStMoMo, ages.fit = ages.fit, wxt = wxt, start.ax = LCfit$ax, start.bx = LCfit$bx, start.kt = LCfit$kt)

预测50年

LCfor <- forecast(LCfit, h = 50)
CBDfor <- forecast(CBDfit, h = 50)
APCfor <- forecast(APCfit, h = 50, gc.order = c(1, 1, 0))
RHfor <- forecast(RHfit, h = 50, gc.order = c(1, 1, 0))
M7for <- forecast(M7fit, h = 50, gc.order = c(2, 0, 0))
M6for <- forecast(M6fit, h = 50, gc.order = c(2, 0, 0))
M8for <- forecast(M8fit, h = 50, gc.order = c(2, 0, 0))
PLATfor <- forecast(PLATfit, h = 50, gc.order = c(2, 0, 0))

掌握2015年美国男性的实际死亡率和所有模型的预测死亡率。我正在将2015年的预计汇率与2015年的实际汇率进行比较。

actual2015 = FullData$rate$male[ages.fit+1,c("2015")]
LC2015 = LCfor$rates[,1]
CBD2015 = CBDfor$rates[,1]
APC2015 = APCfor$rates[,1]
RH2015 =  RHfor$rates[,1]
M72015 = M7for$rates[,1]
M62015 = M6for$rates[,1]
M82015 = M8for$rates[,1]
PLAT2015 = PLATfor$rates[,1]

合并结果。我承认下面的代码很丑陋,均方误差可能要好于平均百分比误差。但是,我认为误差百分比偏离下图所示的事实表明该模型不太适合。此外,残差图还表明数据中存在无法解释的结构。

aicbic = matrix(ncol = 8, nrow = 4)
colnames(aicbic) = c("LC", "CBD", "APC", "RH", "M7", "PLAT", "M6", "M8")
rownames(aicbic) = c("Number Parameters", "AIC", "BIC", "Average Pct Error for t+1")

aicbic[1,1]=LCfit$npar
aicbic[1,2]=CBDfit$npar
aicbic[1,3]=APCfit$npar
aicbic[1,4]=RHfit$npar
aicbic[1,5]=M7fit$npar
aicbic[1,6]=PLATfit$npar
aicbic[1,7]=M6fit$npar
aicbic[1,8]=M8fit$npar
aicbic[2,1]=AIC(LCfit)
aicbic[2,2]=AIC(CBDfit)
aicbic[2,3]=AIC(APCfit)
aicbic[2,4]=AIC(RHfit)
aicbic[2,5]=AIC(M7fit)
aicbic[2,6]=AIC(PLATfit)
aicbic[2,7]=AIC(M6fit)
aicbic[2,8]=AIC(M8fit)
aicbic[3,1]=BIC(LCfit)
aicbic[3,2]=BIC(CBDfit)
aicbic[3,3]=BIC(APCfit)
aicbic[3,4]=BIC(RHfit)
aicbic[3,5]=BIC(M7fit)
aicbic[3,6]=BIC(PLATfit)
aicbic[3,7]=BIC(M6fit)
aicbic[3,8]=BIC(M8fit)
aicbic[4,1]=mean((LC2015 - actual2015)/actual2015)*100
aicbic[4,2]=mean((CBD2015 - actual2015)/actual2015)*100
aicbic[4,3]=mean((APC2015 - actual2015)/actual2015)*100
aicbic[4,4]=mean((RH2015 - actual2015)/actual2015)*100
aicbic[4,5]=mean((M72015 - actual2015)/actual2015)*100
aicbic[4,6]=mean((PLAT2015 - actual2015)/actual2015)*100
aicbic[4,7]=mean((M62015 - actual2015)/actual2015)*100
aicbic[4,8]=mean((M82015 - actual2015)/actual2015)*100

View(aicbic)

Here is the result

剩余图。

For the best model in terms of AIC and BIC, the Plat model, exhibits the following residuals.

残差图的代码。     CBDRes =残差(CBDfit)     plot(CBDRes,type =“ colormap”,main =“ CBD Residuals”)

LCRes = residuals(LCfit)
plot(LCRes, type = "colourmap", main = "Lee-Carter Residuals")

APCRes = residuals(APCfit)
plot(APCRes, type = "colourmap", main = "Age-Period-Cohort Residuals")

RHRes = residuals(RHfit)
plot(RHRes, type = "colourmap", main = "Renshaw and Haberman Residuals")

M7Res = residuals(M7fit)
plot(M7Res, type = "colourmap", main = "M7 Residuals")

PLATRes = residuals(PLATfit)
plot(PLATRes, type = "colourmap", main = "Plat Residuals")

M6Res = residuals(M6fit)
plot(M6Res, type = "colourmap", main = "M6 Residuals")

M8Res = residuals(M8fit)
plot(M8Res, type = "colourmap", main = "M8 Residuals")

0 个答案:

没有答案