我想为数据框(data frame)的行建立频率表。
我已经找到了一种方法,但它会区分列的顺序;我希望得到忽略列顺序(即把 A B 与 B A 视为同一组合)的频率。
例如:
0 A B
1 B A
2 C D
3 D C
4 C D
我希望获得:
A B 2
C D 3
谢谢。
答案 0 :(得分:2)
library("tidyverse")

# Example input: an index column (V1) followed by the two columns (V2, V3)
# whose values form one unordered pair per row.
x <- read.table(
  text = "0 A B
1 B A
2 C D
3 D C
4 C D",
  stringsAsFactors = FALSE)

# Build an order-independent key for one row: sort the values passed in and
# join them into a single space-separated string.
make_pair_key <- function(...) {
  ordered_values <- sort(c(...))
  paste(ordered_values, collapse = " ")
}

# The columns that make up a pair are listed explicitly (V2 and V3);
# pmap_chr() walks them in parallel and yields one key per row.
keyed <- mutate(x, pair = pmap_chr(list(V2, V3), make_pair_key))

# Frequency of each distinct key.
count(keyed, pair)
#> # A tibble: 2 x 2
#> pair n
#> <chr> <int>
#> 1 A B 2
#> 2 C D 3
由reprex package(v0.2.1)于2019-03-29创建
答案 1 :(得分:1)
首先按行对各行的值进行排序(`sort`),然后按所有列分组并计算每组的行数。
library(dplyr)

# Sort the values within each row so that (A, B) and (B, A) end up as the
# same row. The first column (the index) is dropped before sorting.
# as.matrix() spells out the coercion apply() would otherwise do implicitly.
# na.last = TRUE keeps NA values in place: sort() drops them by default,
# which would give rows of different lengths and make apply() return a
# ragged list instead of a matrix.
df1 <- data.frame(t(apply(as.matrix(df[-1]), 1, sort, na.last = TRUE)))

# Count identical (sorted) rows. ungroup() removes the grouping that
# summarise() would otherwise leave on the result.
df1 %>%
  group_by_all() %>%
  summarise(Freq = n()) %>%
  ungroup()
# X1 X2 Freq
# <fct> <fct> <int>
#1 A B 2
#2 C D 3
数据
# Example data: an integer index column V1 and two factor columns V2/V3 that
# share the levels "A" to "D"; each row holds one unordered pair.
df <- data.frame(
  V1 = 0:4,
  V2 = factor(c("A", "B", "C", "D", "C"), levels = LETTERS[1:4]),
  V3 = factor(c("B", "A", "D", "C", "D"), levels = LETTERS[1:4])
)
答案 2 :(得分:1)
我们可以使用 `pmin`/`pmax` 创建分组变量,这种方法是向量化的,应该更高效。
library(dplyr)

# pmin()/pmax() need values that can be compared with `<`/`>`. Unordered
# factors cannot be compared that way, so they are converted to character
# first. Every other type (character, numeric, ordered factor) is returned
# unchanged, so inputs that already worked behave exactly as before.
unfactor <- function(x) {
  if (is.factor(x) && !is.ordered(x)) {
    as.character(x)
  } else {
    x
  }
}

# For each row, pmin() gives the smaller and pmax() the larger of the two
# values, so (A, B) and (B, A) map to the same (V2N, V3N) combination;
# count() then tallies each combination.
df %>%
  mutate(V2 = unfactor(V2), V3 = unfactor(V3)) %>%
  count(V2N = pmin(V2, V3), V3N = pmax(V2, V3))
# A tibble: 2 x 3
# V2N V3N n
# <chr> <chr> <int>
#1 A B 2
#2 C D 3
# Benchmark setup: repeat every row index of df 1e6 times to obtain a large
# data frame that keeps the same pair structure.
df1 <- df[rep(seq_len(nrow(df)), 1e6),]
# Time the pmin()/pmax() approach: the sorted pair is computed column-wise
# and counted directly.
system.time({
df1 %>%
count(V2N = pmin(V2, V3), V3N = pmax(V2, V3))
})
# Timing output as recorded in the original answer (system.time() reports
# seconds).
#user system elapsed
# 1.164 0.043 1.203
# Time the row-wise alternative on the same data: sort each row with
# apply(), then group by all columns and count the rows per group.
system.time({
df2 <- data.frame(t(apply(df1[-1], 1, sort)))
df2 %>%
group_by_all() %>%
summarise(Freq = n())
})
# Timing output as recorded in the original answer.
# user system elapsed
# 160.357 1.227 161.544
# Example data: an integer index column V1 and two character columns V2/V3,
# one unordered pair per row.
df <- data.frame(
  V1 = 0:4,
  V2 = c("A", "B", "C", "D", "C"),
  V3 = c("B", "A", "D", "C", "D"),
  stringsAsFactors = FALSE
)