我的订阅数据框如下所示,其中大约有 100 万个唯一 ID。该表记录了订阅状态:当用户开始订阅服务时,Status 字段记为 "Sub";当用户取消订阅时,记为 "Usub"。
# Example data: two IDs (A and B), twelve months each, Status initially missing.
dat <- data.frame(
  ID = rep(c("A", "B"), each = 12),
  Year = "2014",
  Month = rep(1:12, times = 2),
  Status = NA
)
# Record the four known status changes; every other month stays NA.
dat$Status[c(4, 21)] <- "Sub"
dat$Status[c(8, 17)] <- "Usub"
ID Year Month Status
A 2014 1
A 2014 2
A 2014 3
A 2014 4 Sub
A 2014 5
A 2014 6
A 2014 7
A 2014 8 Usub
A 2014 9
A 2014 10
A 2014 11
A 2014 12
B 2014 1
B 2014 2
B 2014 3
B 2014 4
B 2014 5 Usub
B 2014 6
B 2014 7
B 2014 8
B 2014 9 Sub
B 2014 10
B 2014 11
B 2014 12
C 2014 1 .
. . . .
. . . .
我希望填补每个状态更新之间的差距。 所需的输出表如下所示:
ID Year Month Status
A 2014 1 Usub
A 2014 2 Usub
A 2014 3 Usub
A 2014 4 Sub
A 2014 5 Sub
A 2014 6 Sub
A 2014 7 Sub
A 2014 8 Usub
A 2014 9 Usub
A 2014 10 Usub
A 2014 11 Usub
A 2014 12 Usub
B 2014 1 Sub
B 2014 2 Sub
B 2014 3 Sub
B 2014 4 Sub
B 2014 5 Usub
B 2014 6 Usub
B 2014 7 Usub
B 2014 8 Usub
B 2014 9 Sub
B 2014 10 Sub
B 2014 11 Sub
B 2014 12 Sub
C 2014 1 .
. . . .
. . . .
每个 ID 至少有一条状态记录。如果某个 ID 的第一条状态记录是 "Usub",那么此前所有月份的状态都是 "Sub"(例如 ID B,其第一条记录在 2014 年 5 月);相反,如果第一条状态记录是 "Sub",那么此前所有月份的状态都是 "Usub"。
答案 0 :(得分:3)
您可以对 `(Status == "Usub") - (Status == "Sub")` 取累积和(`cumsum`),从而生成一个与所需状态列一致的交替序列:
所有应填充为 `Sub` 的位置,其值都低于应填充为 `Usub` 的位置。
library(dplyr)
# Fill the gaps between status updates within one ID.
#
# The running total of (+1 for each "Usub", -1 for each "Sub") alternates
# between two adjacent integers, the lower one marking subscribed rows:
#   first record "Sub"  -> totals 0 (before) and -1 (after) => levels -1, 0
#   first record "Usub" -> totals 0 (before) and  1 (after) => levels  0, 1
#
# @param status Character vector of one ID's statuses in chronological order;
#   "Sub"/"Usub" where recorded, NA or "" elsewhere.
# @return A factor with levels "Sub" and "Usub", the same length as `status`
#   (all NA when the ID has no record at all).
fill_status <- function(status) {
  # %in% never yields NA, so NA and "" both count as "no record".
  running <- cumsum((status %in% "Usub") - (status %in% "Sub"))
  # Fix the levels explicitly so an ID whose total never changes (single
  # record in its first row) still maps to the right label instead of erroring.
  lvls <- if (any(running < 0)) c(-1, 0) else c(0, 1)
  if (all(running == 0)) {
    running[] <- NA # no record for this ID: nothing can be inferred
  }
  factor(running, levels = lvls, labels = c("Sub", "Usub"))
}

df %>%
  group_by(ID) %>%
  mutate(Status = fill_status(Status)) %>%
  ungroup() # do not leak the grouping into later steps
# ID Year Month Status
# 1 A 2014 1 Usub
# 2 A 2014 2 Usub
# 3 A 2014 3 Usub
# 4 A 2014 4 Sub
# 5 A 2014 5 Sub
# 6 A 2014 6 Sub
# 7 A 2014 7 Sub
# 8 A 2014 8 Usub
# 9 A 2014 9 Usub
# 10 A 2014 10 Usub
# 11 A 2014 11 Usub
# 12 A 2014 12 Usub
# 13 B 2014 1 Sub
# 14 B 2014 2 Sub
# 15 B 2014 3 Sub
# 16 B 2014 4 Sub
# 17 B 2014 5 Usub
# 18 B 2014 6 Usub
# 19 B 2014 7 Usub
# 20 B 2014 8 Usub
# 21 B 2014 9 Sub
# 22 B 2014 10 Sub
# 23 B 2014 11 Sub
# 24 B 2014 12 Sub
然后利用因子的标签可以按水平顺序指定这一事实,将上面得到的整数序列转换为因子(如上方代码所示)。
相应的 `data.table` 写法是:
library(data.table)
# data.table version: fill the gaps between status updates by reference, per ID.
# The running total of (+1 per "Usub", -1 per "Sub") takes two adjacent values
# within an ID; the lower one marks subscribed rows.
setDT(df)[, Status := {
  # %in% never yields NA, so NA and "" both count as "no record".
  running <- cumsum((Status %in% "Usub") - (Status %in% "Sub"))
  # Explicit levels keep this working when an ID's total never changes
  # (single record in its first row).
  lvls <- if (any(running < 0)) c(-1, 0) else c(0, 1)
  if (all(running == 0)) {
    running[] <- NA # no record for this ID: nothing can be inferred
  }
  # := cannot change the type of the existing character column, so convert
  # the factor back to character.
  as.character(factor(running, levels = lvls, labels = c("Sub", "Usub")))
}, by = .(ID)]
(以上是 data.table 的写法。)
您必须将新因子转换回字符类型,因为 data.table 在用 `:=` 按引用更新该列时不允许更改其类型。
注意:这里使用的数据假设缺失的状态是空字符串,而不是 `NA`。
# Example data, assuming missing statuses are empty strings rather than NA.
# Assigned to `df`, the name the answers' code operates on.
df <- structure(list(ID = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B"), Year = c("2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014"), Month = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12", "1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12"), Status = c("", "", "", "Sub",
"", "", "", "Usub", "", "", "", "", "", "", "", "", "Usub", "",
"", "", "Sub", "", "", "")), .Names = c("ID", "Year", "Month",
"Status"), row.names = c(NA, 24L), class = "data.frame")
<!-- Filter controls: each link carries a data-filter value ("*" or a class selector). -->
<!-- NOTE(review): presumably read by a filtering script that is not shown here; confirm. -->
<div class="demos-filter">
<div class="filter-button-group">
<a data-filter="*">showAll</a>
<a data-filter=".apples">topmenu</a>
<a data-filter=".oranges">onepage</a>
</div>
</div>
<!-- Items: each .demo also has a class ("oranges" / "apples") matching one of the selectors above. -->
<div class="demos">
<div data-index="0" class="demo oranges" data-level="1">
//stuff here
</div>
<div data-index="1" class="demo apples" data-level="2">
//stuff here
</div>
</div>
答案 1 :(得分:0)
# Fill every gap in dat$Status, one ID at a time (base R only).
#
# Rules, per ID:
#   * a recorded "Sub"/"Usub" is carried forward until the next recorded value;
#   * rows before the first record get the opposite of that record (a first
#     "Sub" implies the user was unsubscribed before, and vice versa).
#
# Assumes the rows of each ID are already in chronological order.
# Works for any number of records per ID, and also fills the leading months of
# IDs whose first record is "Sub".
uniquevector <- unique(dat$ID)
for (i in uniquevector) {
  rows <- which(dat$ID == i)
  status <- dat$Status[rows]
  # %in% never yields NA, so NA and "" both count as "no record".
  known <- which(status %in% c("Sub", "Usub"))
  if (length(known) == 0L) {
    next # nothing recorded for this ID, so nothing to propagate
  }
  # Status implied for the rows preceding the first record.
  before_first <- if (status[known[1L]] == "Sub") "Usub" else "Sub"
  # For each row, the number of records seen so far (0 = before the first).
  seen <- cumsum(seq_along(status) %in% known)
  # Index 1 is the implied leading status, index k + 1 is the k-th record.
  dat$Status[rows] <- c(before_first, status[known])[seen + 1L]
}
答案 2 :(得分:0)
另一个选项是将空白 `""` 转换为 `NA`,并使用 `zoo` 包中的 `na.locf` 将 NA 替换为前一个非 NA 元素。由于这是按组进行的操作,我们也可以使用 base R 中的 `ave` 来完成分组。
library(zoo)
# Fill the gaps with zoo's na.locf(), applied per ID through base R's ave().
df$Status <- with(df, ave(
  replace(Status, !nzchar(Status), NA), # blanks become NA so they can be filled
  ID,
  FUN = function(x) {
    # Carry each record forward; rows before the first record stay NA.
    carried <- na.locf(x, na.rm = FALSE)
    first_known <- carried[!is.na(carried)][1]
    if (is.na(first_known)) {
      return(carried) # no record at all for this ID
    }
    # Leading rows get the opposite of the first record. Naming the opposite
    # explicitly (instead of looking it up among the values present) also
    # works when the ID contains only one distinct status.
    replace(carried, is.na(carried), if (first_known == "Sub") "Usub" else "Sub")
  }
))
df$Status
#[1] "Usub" "Usub" "Usub" "Sub" "Sub" "Sub" "Sub" "Usub" "Usub" "Usub" "Usub" "Usub" "Sub" "Sub" "Sub" "Sub" "Usub" "Usub" "Usub"
#[20] "Usub" "Sub" "Sub" "Sub" "Sub"