Question

我目前有一个包含年龄和CPUE的数据集。我需要创建一个循环（或其他代码）

- 将数据帧按年龄拆分为各自独立的数据帧
- 对于每个不同的年龄，求出CPUE的百分位数值
- 将每个百分位CPUE值作为新列添加到数据集（无论是按年龄拆分后的数据集，还是用年龄列标识的整体数据集）
- 根据CPUE的阈值再添加一列，其作用类似于ifelse语句，用来表明该行是否达到CPUE阈值。

到目前为止我的代码是非循环的，如下所示：

    ##### Subset the data by age: one data frame per age class (1-7)
    yr1 <- ycsnew[which(ycsnew$age == 1), ]
    yr2 <- ycsnew[which(ycsnew$age == 2), ]
    yr3 <- ycsnew[which(ycsnew$age == 3), ]
    yr4 <- ycsnew[which(ycsnew$age == 4), ]
    yr5 <- ycsnew[which(ycsnew$age == 5), ]
    yr6 <- ycsnew[which(ycsnew$age == 6), ]
    yr7 <- ycsnew[which(ycsnew$age == 7), ]

    #### Percentile of logcpueplus1 within each age class, using the 80th
    #### percentile as an example (I would like this to be continuous from
    #### 0.1 to 0.9 by 0.01). [[1]] drops the "80%" name from the result.
    q1 <- quantile(yr1$logcpueplus1, 0.8)[[1]]
    q2 <- quantile(yr2$logcpueplus1, 0.8)[[1]]
    q3 <- quantile(yr3$logcpueplus1, 0.8)[[1]]
    q4 <- quantile(yr4$logcpueplus1, 0.8)[[1]]
    q5 <- quantile(yr5$logcpueplus1, 0.8)[[1]]
    q6 <- quantile(yr6$logcpueplus1, 0.8)[[1]]
    q7 <- quantile(yr7$logcpueplus1, 0.8)[[1]]

    #### Flag rows whose value is greater than or equal to the percentile
    #### value of their age class (pgtq8 = ProbabilityGreaterThanQ80).
    #### Every subset must get the column: rbind() on data frames stops
    #### with an error when the numbers of columns differ.
    yr1$pgtq8 <- ifelse(yr1$logcpueplus1 >= q1, 1, 0)
    yr2$pgtq8 <- ifelse(yr2$logcpueplus1 >= q2, 1, 0)
    yr3$pgtq8 <- ifelse(yr3$logcpueplus1 >= q3, 1, 0)
    yr4$pgtq8 <- ifelse(yr4$logcpueplus1 >= q4, 1, 0)
    yr5$pgtq8 <- ifelse(yr5$logcpueplus1 >= q5, 1, 0)
    yr6$pgtq8 <- ifelse(yr6$logcpueplus1 >= q6, 1, 0)
    yr7$pgtq8 <- ifelse(yr7$logcpueplus1 >= q7, 1, 0)

    ## Bind the per-age subsets back together into one data frame
    ycsnew2 <- rbind(yr1, yr2, yr3, yr4, yr5, yr6, yr7)

我开始循环自己的过程，并且已经走到了这一步：

    ###1 subset into different year datasets
    # split() returns a list holding one data frame per distinct value of age
    age_split=split(ycsnew,ycsnew$age)
    new_names <- c("one", "two", "three","four","five","six","seven")
    # Bind each per-age data frame to a variable named one, two, ..., seven.
    # NOTE(review): assumes age_split has at most seven elements and that
    # their order matches new_names -- confirm against the data.
    for (i in 1:length(age_split)) {
      assign(new_names[i], age_split[[i]])
    }

    ###for each age, loop through to get values for 0.1-0.9 by0.01
    # N is not used again in this snippet
    N=(seq(0.1,0.9,0.01))
    # The "puppy" placeholder makes this a character matrix, so numeric
    # values stored into it are coerced to strings
    one_percentiles=matrix(rep("puppy",81),nrow=81,ncol=1)
    sequence1=c(seq(0.1,0.9,0.01))
    # NOTE(review): the two loops are nested rather than paired. For every
    # row n the inner loop walks through all of sequence1 and overwrites the
    # same cell each time, so each row ends up holding the quantile for the
    # last probability in sequence1 (0.9). A single loop that indexes
    # sequence1 by n would pair row n with sequence1[n].
    for(n in unique(seq(1,81,1))){
      for(i in sequence1){
       # [[1]] strips the name from the value returned by quantile()
       ps=quantile(one$logcpueplus1,i)[[1]]
       one_percentiles[[n]]=ps

         }
    }

我现在卡住了，因为“i”值没有正确地循环通过“sequence1”对象，而我只是得到一个“one_percentiles”矩阵，其填充的分位数值仅为i = 0.1或i = 0.9。

我也很难通过循环把ifelse()语句的结果作为新列添加到数据集中，因此任何见解我都会非常感激。

我的数据集如下。

logcpueplus1 age
1     0.13353139   7
2     0.13353139   6
3     0.06899287   2
4     0.08004271   1
5     0.13353139   6
6     0.06899287   7
7     0.04879016   4
8     0.04879016   4
9     0.13353139   7
10    0.06899287   7
12    0.06899287   6
13    0.06899287   2
14    0.06899287   2
15    0.06899287   7
16    0.06899287   6
17    0.09531018   4
21    0.13353139   7
22    0.18232156   4
24    0.04879016   4
25    0.09531018   4
26    0.06899287   2
28    0.06899287   7
30    0.04879016   4
37    0.08004271   1
39    0.04879016   4
40    0.08004271   1
41    0.13353139   6
42    0.25131443   6
43    0.13353139   7
44    0.04879016   4
47    0.08004271   1
49    0.04879016   4
50    0.13353139   5
52    0.19415601   6
53    0.13353139   7
54    0.19415601   6
55    0.30538165   5
56    0.04879016   3
57    0.06899287   1
59    0.06899287   6
60    0.35667494   5
61    0.09531018   3
63    0.19415601   6
64    0.25131443   5
65    0.09531018   3
66    0.06899287   6
67    0.19415601   6
69    0.06899287   1
70    0.13976194   3
71    0.13353139   5
73    0.04879016   3
77    0.06899287   6
78    0.04879016   3
79    0.06899287   7

Answer 1

使用dplyr，其中df是您的数据框：

    library(dplyr)
    df <- df %>%
      group_by(age) %>%
      mutate(q = quantile(logcpueplus1, 0.8),
             pgtq8 = ifelse(logcpueplus1 >= q, 1, 0))
    df
    #Source: local data frame [54 x 4]
    #Groups: age [7]
    #   logcpueplus1   age          q pgtq8
    #          <dbl> <int>      <dbl> <dbl>
    #1    0.13353139     7 0.13353139     1
    #2    0.13353139     6 0.19415601     0
    #3    0.06899287     2 0.06899287     1
    #4    0.08004271     1 0.08004271     1
    #5    0.13353139     6 0.19415601     0
    #6    0.06899287     7 0.13353139     0
    #7    0.04879016     4 0.09531018     0
    #8    0.04879016     4 0.09531018     0
    #9    0.13353139     7 0.13353139     1
    #10   0.06899287     7 0.13353139     0
    # ... with 44 more rows

{{1}}

Answer 2

以下是使用data.table的选项：

library(data.table)

# Convert df1 to a data.table in place (by reference, no copy)
setDT(df1)

# Per-age 80th percentile of logcpueplus1, added as column q by reference
df1[, q := quantile(logcpueplus1, probs = 0.8), by = age]

# 1L when the row is at or above its age's percentile, 0L otherwise
df1[, pgtq8 := as.integer(logcpueplus1 >= q)]

# Trailing [] makes the updated table print
df1[]

Answer 3

这是我最终使用的代码，它满足了我的所有需求。

# Add, for every probability from 0.1 to 0.9 in steps of 0.01, two columns
# to a copy of ycsnew:
#   q<NN>    - the NN-th percentile of logcpueplus1 within the row's age
#   pgtq<NN> - 1 if the row's logcpueplus1 is >= that percentile, else 0
# Columns are addressed by name rather than by hard-coded position, so the
# code does not depend on how many columns ycsnew already has.
stopifnot(all(c("age", "logcpueplus1") %in% names(ycsnew)))

ycsB <- ycsnew
see2 <- seq(0.1, 0.9, by = 0.01)

for (p in see2) {
  # round() guards against floating-point drift in seq() (e.g. 0.29 * 100)
  suffix <- sprintf("%02d", as.integer(round(p * 100)))
  q_col <- paste0("q", suffix)
  flag_col <- paste0("pgtq", suffix)

  # ave() applies FUN within each age group and recycles the single
  # quantile value across all rows of that group
  ycsB[[q_col]] <- ave(
    ycsB$logcpueplus1,
    ycsB$age,
    FUN = function(x) quantile(x, probs = p, names = FALSE)
  )
  ycsB[[flag_col]] <- ifelse(ycsB$logcpueplus1 >= ycsB[[q_col]], 1, 0)
}

names(ycsB)

循环分位数以在数据集

3 个答案: