Question

我有以下数据集：

# Example data: one row per person (id); t1..t3 hold that person's state at
# three successive time steps, with NA marking a missing state.
data <- data.frame(
  id = 1:7,
  t1 = c("AV1", "AV1", "AZ", "AV1", "AV1", "AV1", "AV2"),
  t2 = c("AV2", NA, "AV3", "AV2", "AV2", NA, "AV3"),
  t3 = c("AZ", "AV2", "AV4", "AZ", "AZ", "AV4", "AV4")
)

每一行代表一个人（"id"），各列（"t1"–"t3"）给出此人在几个不同时间步的状态（值）：

id t1  t2   t3
1 AV1  AV2  AZ
2 AV1  NA   AV2
3 AZ   AV3  AV4
4 AV1  AV2  AZ
5 AV1  AV2  AZ
6 AV1  NA   AV4
7 AV2  AV3  AV4

我想统计不同的转换，即"从"（from）某个时间步的值"到"（to）下一个时间步的值，并在整个数据集上汇总：

 from            to                 count 
  AV1            AV2                 4             
  AV2            AZ                  3              
  AZ             AV3                 1             
  AV3            AV4                 2           
  AV1            AV4                 1
  AV2            AV3                 1

因此，"count"（计数）表示特定转换发生的次数。例如，AV1到AV2发生4次，AV2到AZ发生3次。NA被排除在外。

非常感谢！

Answer 1

为避免对列进行硬编码，您可以将数据重新整形为长格式（melt）。对每个“id”（by = id），使用head和tail把相邻时间步的值对齐。然后计算每个唯一转换（by = .(from, to)）的行数（.N）。

library(data.table)

# Convert `data` to a data.table by reference.
setDT(data)

# Long format: one row per (id, time-step column); rows with NA values dropped.
d <- melt(data, id.vars = "id", na.rm = TRUE)

# Step 1: within each id, pair every value with the one that follows it.
# Step 2: count the rows for each distinct (from, to) pair.
d[
  , .(from = head(value, -1), to = tail(value, -1)), by = id
][
  , .N, by = .(from, to)
]
#   from  to N
# 1: AV1 AV2 4
# 2: AV2  AZ 3
# 3:  AZ AV3 1
# 4: AV3 AV4 2
# 5: AV1 AV4 1
# 6: AV2 AV3 1

一个类似的 base R 替代方案，不过其中每个转换被拼接成了单个字符串：

# Stack columns 2:4 into long format (reshape() stores the values in a column
# named "t1") and drop the rows that contain NA.
d <- na.omit(reshape(data, varying = list(2:4), direction = "long"))

# Per id, join each state to its successor as "from - to", then tabulate the
# resulting labels over all ids.
as.data.frame(table(unlist(
  lapply(split(d$t1, d$id), function(states) {
    paste(head(states, -1), tail(states, -1), sep = " - ")
  })
)))

#        Var1 Freq
# 1 AV1 - AV2    4
# 2 AV1 - AV4    1
# 3 AV2 - AV3    1
# 4  AV2 - AZ    3
# 5 AV3 - AV4    2
# 6  AZ - AV3    1

Answer 2

这是一个适用于任意数量列的通用方法。我们找到列的所有两两组合（按索引）。我们用这些索引从原始数据框中取出对应的列，并将它们放在列表中。把元素粘贴在一起，做一些清理（trimws(gsub('NA', '', do.call(paste, a[i1[,x]])))），然后使用table函数，我们得到您的预期结果。将其包含在as.data.frame中可以得出预期的输出结构。

# NOTE(review): `a` is not defined in this snippet; presumably it is the
# question's data frame (id, t1, t2, t3) -- confirm.
# Each column of i1 is one pair of column positions, skipping column 1.
i1 <- combn(seq_along(a[-1]) + 1, 2)

# For every column pair: paste the two columns row-wise, blank out the text
# "NA", trim the result, and keep only strings that still contain whitespace
# (i.e. both values were present). All kept strings are then tabulated.
# Caveat: the gsub() also removes a literal "NA" occurring inside a value.
final_d <- as.data.frame(
  table(
    unlist(
      lapply(seq_len(ncol(i1)), function(k) {
        pasted <- do.call(paste, a[i1[, k]])
        cleaned <- trimws(gsub('NA', '', pasted))
        grep('\\s', cleaned, value = TRUE)
      })
    )
  )
)

给出，

     Var1 Freq
1 AV1 AV2    4
2 AV1 AV4    1
3  AV1 AZ    3
4 AV2 AV3    1
5 AV2 AV4    1
6  AV2 AZ    3
7 AV3 AV4    2
8  AZ AV3    1
9  AZ AV4    1

或者要完全相同，

# Split each "from to" label of final_d on the single space, bind the pieces
# into a two-column matrix, attach the counts, and name the three columns.
# local() keeps the intermediate objects out of the workspace.
local({
  labels <- as.character(final_d$Var1)
  pieces <- do.call('rbind', strsplit(labels, ' ', fixed = TRUE))
  setNames(data.frame(pieces, final_d$Freq), c('from', 'to', 'freq.'))
})

  from  to freq.
1  AV1 AV2     4
2  AV1 AV4     1
3  AV1  AZ     3
4  AV2 AV3     1
5  AV2 AV4     1
6  AV2  AZ     3
7  AV3 AV4     2
8   AZ AV3     1
9   AZ AV4     1

Answer 3

其中一种方法可能是

library(dplyr)

# For each of the three column pairs, drop rows where either value is NA and
# count how often every (from, to) combination occurs.
d1 <- data %>%
  filter(!is.na(t1), !is.na(t2)) %>%
  count(t1, t2) %>%
  rename(from = t1, to = t2, weight = n) %>%
  as.data.frame()
d2 <- data %>%
  filter(!is.na(t2), !is.na(t3)) %>%
  count(t2, t3) %>%
  rename(from = t2, to = t3, weight = n) %>%
  as.data.frame()
d3 <- data %>%
  filter(!is.na(t1), !is.na(t3)) %>%
  count(t1, t3) %>%
  rename(from = t1, to = t3, weight = n) %>%
  as.data.frame()

# Final data: stack the three tables and add up the weights per (from, to).
df <- rbind(d1, d2, d3) %>%
  group_by(from, to) %>%
  summarise(weight = sum(weight)) %>%
  as.data.frame()

Answer 4

修改
一种避免对列进行硬编码的tidyverse方法，可以采用与@Henrik那个出色的已采纳答案类似的思路。在这里，我先用lead函数把相邻的值配对，再用count函数对结果计数。

library(tidyverse)

# Reshape to long format, drop NA values, pair each value with the next one
# within the same id, and count every distinct (from, to) pair.
data %>%
  gather(key, value, -id) %>%
  filter(!is.na(value)) %>%
  group_by(id) %>%
  # lead() yields NA for the last value of each id; those rows are dropped next.
  transmute(from = value, to = lead(value)) %>%
  filter(!is.na(to)) %>%
  ungroup() %>%
  count(from, to)
#> # A tibble: 6 x 3
#>   from  to        n
#>   <chr> <chr> <int>
#> 1 AV1   AV2       4
#> 2 AV1   AV4       1
#> 3 AV2   AV3       1
#> 4 AV2   AZ        3
#> 5 AV3   AV4       2
#> 6 AZ    AV3       1

原始解决方案
这样的事怎么样？它不是很优雅，但我认为它可以完成工作。

library(dplyr)

data <- tibble(
  id = 1:7,
  t1 = c("AV1", "AV1", "AZ", "AV1", "AV1", "AV1", "AV2"),
  t2 = c("AV2", NA, "AV3", "AV2", "AV2", NA, "AV3"),
  t3 = c("AZ", "AV2", "AV4", "AZ", "AZ", "AV4", "AV4")
)

# t1 -> t2, for rows where t2 is present.
cnt1 <- data %>%
  filter(!is.na(t2)) %>%
  count(t1, t2) %>%
  rename(from = t1, to = t2)

# t2 -> t3, for rows where t2 is present.
cnt2 <- data %>%
  filter(!is.na(t2)) %>%
  count(t2, t3) %>%
  rename(from = t2, to = t3)

# t1 -> t3, for rows where t2 is missing.
cnt3 <- data %>%
  filter(is.na(t2)) %>%
  count(t1, t3) %>%
  rename(from = t1, to = t3)

# Stack the three tables and add up the counts per (from, to) pair.
cnt1 %>%
  bind_rows(cnt2) %>%
  bind_rows(cnt3) %>%
  group_by(from, to) %>%
  summarise(weight = sum(n))
#> # A tibble: 6 x 3
#> # Groups:   from [?]
#>   from  to    weight
#>   <chr> <chr>  <int>
#> 1 AV1   AV2        4
#> 2 AV1   AV4        1
#> 3 AV2   AV3        1
#> 4 AV2   AZ         3
#> 5 AV3   AV4        2
#> 6 AZ    AV3        1

计算几个时间步长（列）中不同状态之间的转换

4 个答案: