分位数和数据间隔进行比较

时间:2015-04-01 14:35:28

标签: r plot confidence-interval quantile

我有一个关于使用分位数来确定曲线包络的问题。这就是我正在做的事情:我有一个连续变量("var"),我使用 cut 将其变为离散变量("var_cut"),另外还有一个由该连续变量得到的相关变量("modvar")。我正在做的是绘制 modvar ~ var_cut,并想了解 modvar 的可变性。这是我选择的方法:

# Sample data frame `df`: `var` is the continuous variable and `modvar` the
# related variable whose variability is being examined.
df <- structure(list(var = c(0.1968, 0.2263667, 0.1769, 0.2318, 0.2001333, 
    0.2382667, 0.2005, 0.2022667, 0.1699333, 0.2115667, 0.212, 0.2218667, 
    0.2327333, 0.2224333, 0.1690333, 0.1961333, 0.1756667, 0.2268333, 
    0.1938667, 0.1983, 0.1914333, 0.1745333, 0.2382, 0.2068333, 0.2509333, 
    0.221, 0.2075667, 0.2475333, 0.2463333, 0.2354, 0.2335, 0.2382, 
    0.2636667, 0.1829667, 0.2180333, 0.1703333, 0.2177333, 0.1932667, 
    0.2281, 0.1960667, 0.1975333, 0.1640333, 0.2021667, 0.2044333, 
    0.2124, 0.2267, 0.2202333, 0.1648667, 0.1898, 0.168, 0.2225, 
    0.1899667, 0.1966667, 0.183, 0.1678667, 0.2288333, 0.2006, 0.2389333, 
    0.2105, 0.2018667, 0.2457667, 0.2393333, 0.2286, 0.2280333, 0.2319, 
    0.2565333, 0.1838, 0.2189667, 0.1710667, 0.2184, 0.194, 0.2289333, 
    0.1968, 0.1984, 0.1646667, 0.2029667, 0.2053667, 0.2132333, 0.2274667, 
    0.2211, 0.1655333, 0.1907333, 0.1688333, 0.2234, 0.1908, 0.1975333, 
    0.1838333, 0.1686, 0.2297333, 0.2013667, 0.2397667, 0.2113333, 
    0.2027333, 0.2467333, 0.2402, 0.2295333, 0.2289333, 0.2328333, 
    0.2574333, 0.1795667), modvar = c(1.01575728698598, 0.978902741156023, 
    1.04056240429755, 0.972130196236979, 1.01160236751187, 0.964069530301364, 
    1.01114528024965, 1.00894310935747, 1.04924631438672, 0.997350768101313, 
    0.99681066471784, 0.984511938538037, 0.97096684869995, 0.983805678263226, 
    1.05036815386312, 1.01658832074033, 1.04209969832671, 0.978321129711924, 
    1.01941361113723, 1.0138875545253, 1.02244681578377, 1.04351246817399, 
    0.964152671071449, 1.00325089585421, 0.948280761510472, 0.985592269953813, 
    1.00233672132977, 0.95251882175466, 0.954014607723197, 0.967642838331369, 
    0.970011166114885, 0.964152671071449, 0.932408727300664, 1.03300033368478, 
    0.989290226814528, 1.04874771906387, 0.989664173306662, 1.0201615041215, 
    0.976742202973302, 1.01667133686158, 1.01484323711037, 1.05660059539869, 
    1.00906775818819, 1.00624246779128, 0.996312069394995, 0.978487286603262, 
    0.986547952538877, 1.05556177204354, 1.02448270513577, 1.05165615023086, 
    0.983722537493141, 1.02427491553498, 1.01592344387731, 1.03295882562415, 
    1.0518223071222, 0.975828153097695, 1.01102063141894, 0.963238621195842, 
    0.998680397178511, 1.00944170468032, 0.954720867998008, 0.962740025872996, 
    0.976118958819745, 0.976825343743386, 0.972005547406268, 0.941300426990633, 
    1.03196163497846, 0.988126754628668, 1.04783354453944, 0.988833139552309, 
    1.0192474542459, 0.975703504266984, 1.01575728698598, 1.01376290569459, 
    1.05581106970497, 1.00807056754249, 1.00507899560542, 0.995273370688676, 
    0.977531604018197, 0.985467621123101, 1.05473086293802, 1.02331935759875, 
    1.05061745152455, 0.982600698016739, 1.02323621682866, 1.01484323711037, 
    1.03192012691783, 1.0509082572466, 0.974706313621292, 1.01006494883388, 
    0.962199797840693, 0.997641698472193, 1.00836149791338, 0.953516012400351, 
    0.96165969445722, 0.974955611282715, 0.975703504266984, 0.970842199869238, 
    0.94017858751423, 1.03723839392897)), .Names = c("var", "modvar"
    ), row.names = c(NA, 100L), class = "data.frame")



# Discretise `var` into decile bins labelled 1, 2, ... in increasing order.
# `unique()` guards against tied quantiles, which would otherwise make `cut()`
# fail with "'breaks' are not unique"; with ties there are fewer than 10 bins.
decile_breaks <- unique(quantile(df$var, probs = (0:10) / 10))
df$var_cut <- cut(
  df$var,
  breaks = decile_breaks,
  include.lowest = TRUE,
  labels = seq_len(length(decile_breaks) - 1L)
)

# Per-group quantile, expanded back to one value per observation.
#   values: numeric vector to summarise
#   groups: factor of the same length assigning each value to a group
#   prob:   single probability in [0, 1] passed to `quantile()`
# Returns an unnamed numeric vector of length(values); observations whose
# group is NA get NA.
group_quantile <- function(values, groups, prob) {
  per_group <- vapply(
    split(values, groups),
    function(v) unname(quantile(v, probs = prob, na.rm = TRUE)),
    numeric(1)
  )
  unname(per_group[as.character(groups)])
}

# Lower and upper boundaries of modvar: the empirical 5% and 95% quantiles of
# the observations inside each bin (this describes the spread of the data in
# the bin; it is not a confidence interval for a mean).
df$lowervar <- group_quantile(df$modvar, df$var_cut, 0.05)
df$uppervar <- group_quantile(df$modvar, df$var_cut, 0.95)

我一直假设,通过使用分位数函数的 "probs" 参数,我可以获得 modvar 曲线的一个类似于置信区间的包络,其下限和上限分别对应于分位数函数中的 0.05 和 0.95 概率。

你认为这是一种合理的方法吗?那么用这种方式比较两条不同的曲线呢?例如,我想检查不同变量之间的重叠。我要做的是针对不同的数据库绘制相同的 modvar ~ var_cut,并检查相应曲线的下限和上限是否重叠。

我希望你能帮助我,提前谢谢你!

0 个答案:

没有答案