我正在为一个只包含正数、规模相对较小的样本寻找合适的分布。数据样本记为“xxx”:
1610.0 560.0 70.0 14000.0 96550.0 630.0 1505.0 1592.5 717.5 32657.5 10830.0 93770.0 1015.0 17.5 115127.5 1472.5 45840.0 33500.0 98000.0 9955.0 1500.0 36000.0
似乎某种混合分布可能适合这些数据,但我从未有过寻找或拟合混合分布的经验。
我已经尝试了指数分布、威布尔(Weibull)分布、伽玛分布和对数正态分布,因为已知它们适合对索赔金额建模。结果,它们全部被 ks.test() 拒绝。
# Visual and formal check of normality for the sample xxx.
qqnorm(xxx)
qqline(xxx, col = "steelblue")
shapiro.test(xxx) # p-value = 3.514e-05
# data is not normally distributed.

# Density-scaled histogram with fixed 10000-wide bins and no labels.
# `freq = FALSE` replaces `prob=T`: `T` is an ordinary variable that can be
# reassigned, and `prob` relied on partial matching of `probability`.
hist(
  xxx,
  freq = FALSE,
  col = "gray",
  breaks = seq(0, 125000, by = 10000),
  xlab = "", ylab = "", main = ""
)
# Exponential candidate: Q-Q plot, fit, density overlay and KS test.
# NOTE(review): qqexp() is not a base R function -- confirm which package
# provides it.
qqexp(xxx)

# Fit the exponential distribution; the only estimate is the rate.
fitt2 <- fitdistr(xxx, densfun = "exponential")

# Density-scaled histogram with the fitted exponential density on top.
hist(xxx, breaks = 10, freq = FALSE)
curve(
  dexp(x, rate = fitt2$estimate),
  from = 0,
  add = TRUE,
  col = "red"
)

# Kolmogorov-Smirnov test against the fitted exponential.
# p-value = 0.0001932 < 0.05, distribution rejected
ks.test(xxx, "pexp", rate = fitt2$estimate)
# Gamma candidate, fitted twice: once with fitdist() (for its diagnostic
# plots) and once with fitdistr() (for the KS test).
fitt.gamma <- fitdist(
  xxx,
  distr = "gamma",
  method = "mle",
  lower = c(0, 0),
  start = list(scale = 1, shape = 1)
)
plot(fitt.gamma)

fdfgg <- fitdistr(xxx, "gamma", list(shape = 1, rate = 0.1), lower = 0.01)

# Pass shape and rate as separate named arguments. Handing the whole
# estimate vector to ks.test() as one positional argument makes pgamma()
# treat c(shape, rate) as a length-2 `shape` recycled over the observations,
# with the rate left at its default of 1; that was the source of the
# earlier p-value of 2.2e-16.
# Caveat: the KS p-value assumes the parameters were specified in advance,
# not estimated from the same data, so treat it as approximate.
ks.test(
  xxx, "pgamma",
  shape = fdfgg$estimate[["shape"]],
  rate = fdfgg$estimate[["rate"]]
)
# Log-normal candidate: fitdistr() fit for the KS test, fitdist() fit for
# the diagnostic plots.
fitt1l <- fitdistr(xxx, "lognormal")

# The original call referenced `fit1l`, which is never defined (typo for
# `fitt1l`), and passed the whole estimate vector as a single argument.
# meanlog and sdlog must be supplied separately, otherwise plnorm() takes
# c(meanlog, sdlog) as a recycled `meanlog` and leaves sdlog at 1.
# Caveat: the KS p-value assumes the parameters were specified in advance,
# not estimated from the same data, so treat it as approximate.
ks.test(
  xxx, "plnorm",
  meanlog = fitt1l$estimate[["meanlog"]],
  sdlog = fitt1l$estimate[["sdlog"]]
)

fitt2l <- fitdist(xxx, "lnorm", method = "mle") # the graphs look ok
plot(fitt2l)
# Weibull candidate: fitdistr() fit for the KS test, fitdist() fit for the
# diagnostic plots.
fitt1w <- fitdistr(xxx, "weibull")

# Supply shape and scale separately; passing the whole estimate vector as
# one argument makes pweibull() use c(shape, scale) as a recycled `shape`
# and leaves scale at its default of 1.
# Caveat: the KS p-value assumes the parameters were specified in advance,
# not estimated from the same data, so treat it as approximate.
ks.test(
  xxx, "pweibull",
  shape = fitt1w$estimate[["shape"]],
  scale = fitt1w$estimate[["scale"]]
)

fitt2w <- fitdist(xxx, "weibull", method = "mle", lower = c(0, 0))
# Plot the fitdist() object; the original plotted `fitt1w`, the fitdistr()
# result, leaving `fitt2w` unused.
plot(fitt2w)
# Another post provided code for fitting an extreme value distribution and a mixture of two gamma distributions. Judging by the histogram, neither gives a good fit here, but I used that code without changing anything, which may be the reason for the poor fit:
# Overlay two candidate densities on a histogram of xxx: a generalized
# extreme value (GEV) fit and a mixture of two gamma distributions.
# NOTE(review): fgev()/dgev() are assumed to come from the evd package, and
# flexmix(), FLXMRglm(), parameters() and prior() from the flexmix package --
# confirm both are loaded before this point.

# lines() can only add to an existing plot, and `xval` was never defined in
# the pasted code. Draw a density-scaled histogram first and build the grid
# on which both fitted densities are evaluated.
hist(xxx, freq = FALSE, breaks = 10)
xval <- seq(min(xxx), max(xxx), length.out = 200)

# GEV fit
git2 <- fgev(xxx)
param2 <- git2$estimate
loc <- param2[["loc"]]
scal <- param2[["scale"]]
shape <- param2[["shape"]]
lines(
  xval,
  dgev(xval, loc = loc, scale = scal, shape = shape),
  col = "blue", lwd = 2
)

# mixture of two Gamma distributions
# (the stray `enter code here` paste artifact after the closing parenthesis
# was a syntax error and has been removed)
git3 <- flexmix(
  xxx ~ 1,
  k = 2,
  model = list(FLXMRglm(family = "Gamma"), FLXMRglm(family = "Gamma"))
)
param3 <- parameters(git3)[[1]]
interc <- param3[1, ]
shape <- param3[2, ] # note: overwrites the GEV shape stored above
lambda <- prior(git3)

# The density below indexes two components; fail with a clear message if
# the fit returned fewer.
if (length(lambda) < 2) {
  stop(
    "flexmix() returned fewer than two components; ",
    "cannot draw a two-gamma mixture",
    call. = FALSE
  )
}

# Mixture density: prior-weighted sum of the two gamma densities. The rate
# is intercept * shape, i.e. the intercept is treated as 1 / mean
# (presumably the Gamma family's default inverse link -- confirm).
yval <- lambda[[1]] * dgamma(xval, shape = shape[[1]],
                             rate = interc[[1]] * shape[[1]]) +
  lambda[[2]] * dgamma(xval, shape = shape[[2]],
                       rate = interc[[2]] * shape[[2]])
lines(xval, yval, col = "darkred", lwd = 2)