对于在不同时间点测量的序列,我感兴趣的是每个序列的起始时间点,如果有跳过则重置起始时间点。
dd <- data.frame(seq = letters[c(1:6,1,6:7,1:3,7:8,1)],
grp = rep(1:5, c(3,4,5,2,1)))
o2 <- c(1,1,1,2,2,2,1,2,3,1,3,3,3,4,5)
par(mar = c(5, 5, 2, 5), las = 1, bty = 'n', xpd = NA)
plot(as.numeric(dd$seq), dd$grp, col = o2, pch = 16,
cex = 3, xaxt = 'n', yaxt = 'n', xlab = 'seq', ylab = '')
axis(1, at = 1:8, letters[1:8], lwd = 0)
axis(2, at = 1:5, paste0('time ', 1:5))
axis(4, at = 1:5, palette()[1:5])
也许这会更好地说明:对于连续发生的每个序列,我想将该组分配到最低时间点并相应地对其进行着色。
因此第一组a在时间1开始并且不间断地持续到3,所以理论上这是在时间1发生的相同序列。由于存在另一组a,因此假设与另一组无关一组a和有色时间点5。
b和c有两个来源,因此它们会根据时间点单独着色。
我想要的结果就是这个向量,o2
# split(cbind(dd, desired = o2), dd$grp)
cbind(dd, desired = o2)
# seq grp desired
# 1 a 1 1
# 2 b 1 1
# 3 c 1 1
# 4 d 2 2
# 5 e 2 2
# 6 f 2 2
# 7 a 2 1
# 8 f 3 2
# 9 g 3 3
# 10 a 3 1
# 11 b 3 3
# 12 c 3 3
# 13 g 4 3
# 14 h 4 4
# 15 a 5 5
答案 0 :(得分:3)
这是使用dplyr
pd <- dd %>% arrange(seq,grp) %>%
group_by(seq) %>%
mutate(set=cumsum(grp-lag(grp, default=100)!=1)) %>%
group_by(seq,set) %>%
mutate(colgrp=min(grp))
你用
绘图par(mar = c(5, 5, 2, 5), las = 1, bty = 'n', xpd = NA)
plot(as.numeric(pd$seq), pd$grp, col = pd$colgrp, pch = 16,
cex = 3, xaxt = 'n', yaxt = 'n', xlab = 'seq', ylab = '')
axis(1, at = 1:8, letters[1:8], lwd = 0)
axis(2, at = 1:5, paste0('time ', 1:5))
axis(4, at = 1:5, palette()[1:5])
请注意奇怪的default=100
值。理想情况下,我想使用-1
或超出范围的内容,但感谢this bug,您无法输入负数。
答案 1 :(得分:2)
灵感来自我对rle-like function that catches runs of adjacent integers
的回答dd %>% group_by(seq) %>%
arrange(grp) %>%
mutate(origin_group = grp - 0:(n() - 1)) %>%
group_by(seq, origin_group) %>%
mutate(origin = min(grp))
这与MrFlick的回答非常相似,我只是采用了稍微不同的方法来进行第一次分组。
par(mar = c(5, 5, 2, 5), las = 1, bty = 'n', xpd = NA)
plot(as.numeric(dd2$seq), dd2$grp, col = dd2$origin, pch = 16,
cex = 3, xaxt = 'n', yaxt = 'n', xlab = 'seq', ylab = '')
axis(1, at = 1:8, letters[1:8], lwd = 0)
axis(2, at = 1:5, paste0('time ', 1:5))
axis(4, at = 1:5, palette()[1:5])
答案 2 :(得分:0)
以下是我目前使用的方法
#include <stdio.h>
#include <stdlib.h>
#define DATA 10;
#define NAME 10;
typedef struct{
int id;
char *givenname;
char *familyname;
} students;
int main()
{
int answer;
int incr = 0; // Index for students in the list
int datalen = DATA;
int namelen = NAME;
students *studentlist;
studentlist = malloc(datalen * sizeof(students)); // Allocate memory for first ten students
if(NULL == studentlist){
printf("Error: Couldn't allocate memory\n");
exit(0);
}
for(incr = 0; incr < datalen; incr ++){
printf("Add student to the list? Yes(1) No(2)\n");
scanf("%d", &answer);
if(answer != 1){
break;
}
studentlist[incr]->givenname = malloc(namelen * sizeof(char)); // Allocate memory for each name
studentlist[incr]->familyname = malloc(namelen * sizeof(char));
printf("Insert ID: ");
scanf("%d", &studentlist[incr].id);
printf("Insert given name: \n");
scanf("%s", studentlist[incr].givenname);
printf("Insert family name: \n");
scanf("%s", studentlist[incr].familyname);
}
free(studentlist);
free(studentlist.givename);
free(studentlist.familyname);
return 0;
}
测试更大的数据集
## two helper functions
cum_reset <- function(x, value = 0L, FUN = cummin) {
## reset a cum* fn if value is encountered
# x <- 1:10
# x[x %% 3 == 0] <- 0
# cum_reset(x)
# cum_reset(1:10, value = c(4,6))
idx <- c(0, head(cumsum(x %in% value), -1))
sp <- split(x, idx)
unname(unlist(lapply(sp, FUN)))
}
do_reset <- function(x, FUN = min) {
# a <- dd$grp[dd$seq == 'a']
# b <- dd$grp[dd$seq == 'b']
o <- rep(0, max(x))
o[x] <- x
o <- cum_reset(o)
o[o > 0]
}
o3 <- with(dd, ave(grp, seq, FUN = do_reset))
all(o2 == o3) # TRUE
cbind(dd, desired = o2, got = o3)
# seq grp desired got
# 1 a 1 1 1
# 2 b 1 1 1
# 3 c 1 1 1
# 4 d 2 2 2
# 5 e 2 2 2
# 6 f 2 2 2
# 7 a 2 1 1
# 8 f 3 2 2
# 9 g 3 3 3
# 10 a 3 1 1
# 11 b 3 3 3
# 12 c 3 3 3
# 13 g 4 3 3
# 14 h 4 4 4
# 15 a 5 5 5
选项2
使用matt和gregor的答案的组合,这是基础r中的另一种解决方案
EDIT添加额外的行来计算其他人将失败的重复项
dd2 <- rbind(dd, within(dd, grp <- grp + 5))
(o4 <- with(dd2, ave(grp, seq, FUN = do_reset)))
# [1] 1 1 1 2 2 2 1 2 3 1 3 3 3 4 5 5 6 6 7 7 7 5 7 8 5 8 8 8 9 10
par(mar = c(5, 5, 2, 5), las = 1, bty = 'n', xpd = NA)
plot(as.numeric(dd2$seq), dd2$grp, col = o4, pch = 16, cex = 3,
xaxt = 'n', yaxt = 'n', xlab = 'seq', ylab = '')
axis(1, at = 1:8, letters[1:8], lwd = 0)
axis(2, at = 1:10, paste0('time ', 1:10))
axis(4, at = 1:10, rep_len(palette(), 10))