样本在R中的组内具有相同数量的每个性别

时间:2012-12-07 17:08:06

标签: r statistics

首先,首先。以下是我的数据:

lat <- c(12, 12, 58, 58, 58, 58, 58, 45, 45, 45, 45, 45, 45, 64, 64, 64, 64, 64, 64, 64)
long <- c(-14, -14, 139, 139, 139, 139, 139, -68, -68, -68, -68, -68, 1, 1, 1, 1, 1, 1, 1, 1)
sex <- c("M", "M", "M", "M", "F", "M", "M", "F", "M", "M", "M", "F", "M", "F", "M", "F", "F", "F", "F", "M")
score <- c(2, 6, 3, 6, 5, 4, 3, 2, 3, 9, 9, 8, 6, 5, 6, 7, 5, 7, 5, 1)

data <- data.frame(lat, long, sex, score)

数据应如下所示:

   lat long sex score
1   12  -14   M     2
2   12  -14   M     6
3   58  139   M     3
4   58  139   M     6
5   58  139   F     5
6   58  139   M     4
7   58  139   M     3
8   45  -68   F     2
9   45  -68   M     3
10  45  -68   M     9
11  45  -68   M     9
12  45  -68   F     8
13  45    1   M     6
14  64    1   F     5
15  64    1   M     6
16  64    1   F     7
17  64    1   F     5
18  64    1   F     7
19  64    1   F     5
20  64    1   M     1

我最终试图弄明白这一点。变量是纬度,经度,性别和分数。我希望每个地点都有相同数量的男性和女性(即经度和纬度相同)。例如,第二个位置(第3行到第7行)只有一个女性。应保留这名女性,并保留其余个体的一名男性(也许是随机抽样)。有些地方只有一种性别的信息,例如第一个位置(第1行和第2行)只有男性数据。应删除此位置的行(因为没有女性)。所有按计划进行的最终数据集应如下所示:

   lat2 long2 sex2 score2
1    58   139    F      5
2    58   139    M      4
3    45   -68    F      2
4    45   -68    M      3
5    45   -68    M      9
6    45   -68    F      8
7    64     1    M      6
8    64     1    F      5
9    64     1    F      7
10   64     1    M      1

任何帮助都将不胜感激。

3 个答案:

答案 0 :(得分:5)

以下是lapply的解决方案:

data[unlist(lapply(with(data, split(seq.int(nrow(data)), paste(lat, long))),
        # 'split' splits the sequence of row numbers (indices) along the unique
        # combinations of 'lat' and 'long'
        # 'lapply' applies the following function to all sub-sequences
        function(x) {
          # which of the indices are for males:
          male <- which(data[x, "sex"] == "M")
          # which of the indices are for females:
          female <- which(data[x, "sex"] == "F")
          # sample from the indices of males:
          s_male <- sample(male, min(length(male), length(female)))
          # sample from the indices of females:
          s_female <- sample(female, min(length(male), length(female)))
          # combine both sampled indices:
          x[c(s_male, s_female)]                
        })), ]
# The function 'lappy' returns a list of indices which is transformed to a vector
# using 'unlist'. These indices are used to subset the original data frame.

结果:

   lat long sex score
9   45  -68   M     3
11  45  -68   M     9
12  45  -68   F     8
8   45  -68   F     2
7   58  139   M     3
5   58  139   F     5
20  64    1   M     1
15  64    1   M     6
19  64    1   F     5
16  64    1   F     7

答案 1 :(得分:2)

下面是一个快速的方法,包括创建一个lat-long组合的临时列。我们根据此列拆分DF,计算每个拆分中的M / F,适当地采样,然后重新组合。

# First, We call the dataframe something other than "data" ;) 
mydf <- data.frame(lat, long, sex, score)

# create a new data frame with a temporary column, which concatenates the lat & long. 
mydf.new <- data.frame(mydf, latlong=paste(mydf$lat, mydf$long, sep=","))

# Split the data frame according to the lat-long location
mydf.splat <- split(mydf.new, mydf.new$latlong)

# eg, taking a look at one of our tables:
mydf.splat[[4]]

sampled <- 
  lapply(mydf.splat, function(tabl) {
    Ms <- sum(tabl$sex=="M")
    Fs <- sum(tabl$sex=="F")

    if(Fs == 0 || Ms ==0)    # If either is zero, we drop that location
      return(NULL)

    if(Fs == Ms)   # If they are both equal, no need to sample. 
      return(tabl)

    # If number of Females less than Males, return all Females 
    #    and sample from males in ammount equal to Females
    if (Fs < Ms)   
      return(tabl[c(which(tabl$sex=="F"), sample(which(tabl$sex=="M"), Fs)),   ])

    if (Ms < Fs)  # same as previous, but for Males < Femals
      return(tabl[c(which(tabl$sex=="M"), sample(which(tabl$sex=="F"), Ms)),   ])

    stop("hmmm... something went wrong.")  ## We should never hit this line, but just in case. 
  })

# Flatten into a single table
mydf.new <- do.call(rbind, sampled)

# Clean up
row.names(mydf.new) <- NULL  # remove the row names that were added
mydf.new$latlong <- NULL     # remove the temporary column that we added

结果

mydf.new

#    lat long sex score
# 1   45  -68   F     2
# 2   45  -68   F     8
# 3   45  -68   M     9
# 4   45  -68   M     3
# 5   58  139   F     5
# 6   58  139   M     3
# 7   64    1   M     6
# 8   64    1   M     1
# 9   64    1   F     7
# 10  64    1   F     5

答案 2 :(得分:2)

这将返回值作为列表元素:

spl <- split(data, interaction(data$lat, data$long) ,drop=TRUE)
           # interaction creates all the two way pairs from those two vectors
           # drop is needed to eliminate the dataframes with no representation
res <- lapply(spl, function(x) { #First find the nuber of each gender to select
                        N=min(table(x$sex))  # then sample each sex separately
   rbind( x[ x$sex=="M" & row.names(x) %in% sample(row.names(x[x$sex=="M",] ), N) , ],
                 # One (or both) of these will be "sampling" all of that sex.
          x[ x$sex=="F" & row.names(x) %in% sample(row.names(x[x$sex=="F", ]), N) , ] )
                                } )
res
#------------
$`45.-68`  
   lat long sex score
9   45  -68   M     3
11  45  -68   M     9
8   45  -68   F     2
12  45  -68   F     8

$`12.-14` # So there were no women in this group and zero could be matched
[1] lat   long  sex   score
<0 rows> (or 0-length row.names)

$`45.1`
[1] lat   long  sex   score
<0 rows> (or 0-length row.names)

$`64.1`
   lat long sex score
15  64    1   M     6
20  64    1   M     1
16  64    1   F     7
17  64    1   F     5

$`58.139`
  lat long sex score
7  58  139   M     3
5  58  139   F     5

,,,,但如果您想将其作为数据框,则可以使用do.call(rbind, res)

> do.call(rbind, res)
          lat long sex score
45.-68.10  45  -68   M     9
45.-68.11  45  -68   M     9
45.-68.8   45  -68   F     2
45.-68.12  45  -68   F     8
64.1.15    64    1   M     6
64.1.20    64    1   M     1
64.1.17    64    1   F     5
64.1.18    64    1   F     7
58.139.6   58  139   M     4
58.139.5   58  139   F     5