基于R的频率采样

时间:2015-02-07 22:38:21

标签: r random histogram sample

我想根据各个取值出现的频率,从一份相当大的数据中生成 20000 个样本,用来填充 NA 值。为此我使用了直方图(hist)的输出,但没有成功,并且报了一个错误。该如何避免这个错误?

# NOTE(review): snippet from the question, kept exactly as posted. It does not
# parse as written: the hist() call below is missing its closing parenthesis.
# breaks is passed as a single number, the count of distinct values in maindata.
y=hist(maindata,col="red",breaks=length(unique(maindata))
for(k in 1:20000){
data=maindata
# NOTE(review): maindata is reported further down as a plain vector, for which
# nrow() returns NULL; length(data) is presumably what was intended.
for(i in 1:nrow(data)){
# NOTE(review): `=` is not the comparison operator, and "Na" is a string rather
# than a missing value; is.na(data[i]) is presumably what was intended.
if (data[i]="Na"){
# y$breaks holds the bin edges and so has one more element than y$density
# (95 vs 94 in the dput output of y shown later). sample() requires prob to be
# as long as x, hence "incorrect number of probabilities". y$mids has the same
# length as y$density.
 data[i]=sample(y$breaks,size=1,replace=FALSE,prob=y$density)}}}

我收到此错误:

Error in sample.int(length(x), size, replace, prob) : 
  incorrect number of probabilities

我检查了 length(y$breaks) 和 length(y$density):length(y$breaks) 比 length(y$density) 多 1。我应该如何修复它?

提前谢谢

编辑

    structure(list(breaks = c(15, 16, 17, 18, 19, 20, 21, 22, 23, 
24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 
40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 
56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 
72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 
88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 
103, 104, 105, 106, 107, 108, 109), counts = c(27L, 17L, 31L, 
83L, 118L, 144L, 211L, 279L, 354L, 312L, 300L, 377L, 407L, 443L, 
481L, 351L, 302L, 236L, 248L, 178L, 141L, 101L, 77L, 80L, 63L, 
44L, 64L, 44L, 60L, 46L, 24L, 29L, 15L, 28L, 21L, 13L, 19L, 10L, 
30L, 11L, 12L, 12L, 7L, 12L, 12L, 11L, 11L, 7L, 7L, 4L, 4L, 4L, 
1L, 2L, 3L, 6L, 1L, 1L, 3L, 3L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 
0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L, 1L, 2L), density = c(0.00453172205438067, 
0.00285330647868412, 0.00520308828465928, 0.0139308492782813, 
0.0198053037932192, 0.0241691842900302, 0.035414568647197, 0.0468277945619335, 
0.0594159113796576, 0.0523665659617321, 0.0503524672708963, 0.0632762672037596, 
0.0683115139308493, 0.0743538100033568, 0.0807317891910037, 0.0589123867069486, 
0.0506881503860356, 0.0396106075864384, 0.0416247062772743, 0.0298757972473985, 
0.0236656596173212, 0.0169519973145351, 0.0129237999328634, 0.0134273246055723, 
0.0105740181268882, 0.00738502853306479, 0.0107418596844579, 
0.00738502853306479, 0.0100704934541793, 0.0077207116482041, 
0.0040281973816717, 0.00486740516951997, 0.00251762336354481, 
0.00469956361195032, 0.00352467270896274, 0.00218194024840551, 
0.00318898959382343, 0.00167841557569654, 0.00503524672708963, 
0.0018462571332662, 0.00201409869083585, 0.00201409869083585, 
0.00117489090298758, 0.00201409869083585, 0.00201409869083585, 
0.0018462571332662, 0.0018462571332662, 0.00117489090298758, 
0.00117489090298758, 0.000671366230278617, 0.000671366230278617, 
0.000671366230278617, 0.000167841557569654, 0.000335683115139308, 
0.000503524672708963, 0.00100704934541793, 0.000167841557569654, 
0.000167841557569654, 0.000503524672708963, 0.000503524672708963, 
0, 0, 0, 0.000167841557569654, 0.000167841557569654, 0, 0, 0, 
0.000167841557569654, 0, 0, 0.000167841557569654, 0, 0.000167841557569654, 
0, 0.000167841557569654, 0, 0.000167841557569654, 0.000167841557569654, 
0, 0, 0.000167841557569654, 0.000167841557569654, 0, 0, 0, 0, 
0, 0.000503524672708963, 0, 0, 0, 0.000167841557569654, 0.000335683115139308
), mids = c(15.5, 16.5, 17.5, 18.5, 19.5, 20.5, 21.5, 22.5, 23.5, 
24.5, 25.5, 26.5, 27.5, 28.5, 29.5, 30.5, 31.5, 32.5, 33.5, 34.5, 
35.5, 36.5, 37.5, 38.5, 39.5, 40.5, 41.5, 42.5, 43.5, 44.5, 45.5, 
46.5, 47.5, 48.5, 49.5, 50.5, 51.5, 52.5, 53.5, 54.5, 55.5, 56.5, 
57.5, 58.5, 59.5, 60.5, 61.5, 62.5, 63.5, 64.5, 65.5, 66.5, 67.5, 
68.5, 69.5, 70.5, 71.5, 72.5, 73.5, 74.5, 75.5, 76.5, 77.5, 78.5, 
79.5, 80.5, 81.5, 82.5, 83.5, 84.5, 85.5, 86.5, 87.5, 88.5, 89.5, 
90.5, 91.5, 92.5, 93.5, 94.5, 95.5, 96.5, 97.5, 98.5, 99.5, 100.5, 
101.5, 102.5, 103.5, 104.5, 105.5, 106.5, 107.5, 108.5), xname = "b", 
    equidist = TRUE), .Names = c("breaks", "counts", "density", 
"mids", "xname", "equidist"), class = "histogram")

数据信息

> head(maindata)
[1] 30 44 -1 32 30 34
> is.numeric(maindata)
[1] TRUE
> is.vector(maindata)
[1] TRUE
> length(maindata)
[1] 36203

1 个答案:

答案 0 :(得分:1)

您是否只是想从非缺失数据的分布中抽取 20,000 个样本?如果是这样,另一种方法是直接根据非缺失数据计算核密度估计,然后从该估计中进行采样。例如,使用模拟数据:

# Simulated data with some missing values.
set.seed(31)
n_obs <- 30000
n_missing <- 5000
dat <- rnorm(n_obs, mean = 20, sd = 10)
dat[sample(seq_len(n_obs), n_missing)] <- NA

# Kernel density estimate of the non-missing values.
# n is the number of grid points at which the density is evaluated. For
# n > 512, density() rounds n up to a power of 2 internally (it uses the FFT),
# so a power of 2 is the natural choice.
dat.dens <- density(dat[!is.na(dat)], n = 2^10)

# Draw values from the grid points, weighted by the estimated density.
# sample() normalises prob itself, so dat.dens$y does not need to sum to 1.
# Note: every draw is one of the grid points in dat.dens$x.
n_draws <- 2e4
sim.sample <- sample(dat.dens$x, n_draws, replace = TRUE, prob = dat.dens$y)

# Compare the estimate from the data (default colour) with the density of the
# simulated sample (red).
plot(dat.dens)
lines(density(sim.sample), col = "red")

enter image description here

如果我误解了你要做的事情,请告诉我。