计算R中的重复列

时间:2015-02-14 06:37:05

标签: r

我的目标是计算我的 df 前两列中唯一值组合的数量。这是我的 df:

> head(dat)
       M1 M2 M3 M4
10_A9  H  H  H  H
10_B6  A  H  B  A
10_B8  H  H  H  B
10_B9  H  H  H  H
10_C1  A  B  B  A
10_C3  H  H  H  H

我尝试过:

> unique(dat[,1:2])
       M1 M2
10_A9  H  H
10_B6  A  H
10_C1  A  B
10_D8  H  A
11_B9  B  A
12_D9  B  H
12_E4     A

然后我打算用 nrow() 统计行数,但得到的结果不正确。我在这个简单的练习中哪里做错了吗?

编辑: 这是我的 dat 的前两列(部分行):

> a[,1:2]
   M1 M2
10_A9   H  H
10_B6   A  H
10_B8   H  H
10_B9   H  H
10_C1   A  B
10_C3   H  H
10_C4   H  H
10_D5   H  H
10_D7   H  H
10_D8   H  A
10_D11  A  H
10_E6   H  H
11_A3   A  B
11_A7   A  B
11_B1   H  H
11_B3   H  H
11_B7   H  A
11_B9   B  A
11_B11  H  H
11_C1   A  H
11_C8   A  H
11_C9   H  A
11_D3   A  B
11_D4   A  B

对于上面几行,我想要的输出是:

10_B6   A  H
10_C1   A  B
10_D8   H  A
10_D11  A  H
11_A3   A  B
11_A7   A  B
11_B7   H  A
11_B9   B  A
11_C1   A  H
11_C8   A  H
11_C9   H  A
11_D3   A  B
11_D4   A  B

1 个答案:

答案 0 :(得分:1)

你想要的是下面这样的结果吗?我不太确定。

# Example data: 24 samples with an ID column (row_n) and the first two marker
# columns (M1, M2), all stored as factors. Levels are spelled out explicitly so
# the factor codes do not depend on the locale's sort order.
DF <- data.frame(
  row_n = factor(
    c(
      "10_A9", "10_B6", "10_B8", "10_B9", "10_C1", "10_C3",
      "10_C4", "10_D5", "10_D7", "10_D8", "10_D11", "10_E6",
      "11_A3", "11_A7", "11_B1", "11_B3", "11_B7", "11_B9",
      "11_B11", "11_C1", "11_C8", "11_C9", "11_D3", "11_D4"
    ),
    levels = c(
      "10_A9", "10_B6", "10_B8", "10_B9", "10_C1", "10_C3",
      "10_C4", "10_D11", "10_D5", "10_D7", "10_D8", "10_E6",
      "11_A3", "11_A7", "11_B1", "11_B11", "11_B3", "11_B7",
      "11_B9", "11_C1", "11_C8", "11_C9", "11_D3", "11_D4"
    )
  ),
  M1 = factor(
    c(
      "H", "A", "H", "H", "A", "H", "H", "H", "H", "H", "A", "H",
      "A", "A", "H", "H", "H", "B", "H", "A", "A", "H", "A", "A"
    ),
    levels = c("A", "B", "H")
  ),
  M2 = factor(
    c(
      "H", "H", "H", "H", "B", "H", "H", "H", "H", "A", "H", "H",
      "B", "B", "H", "H", "A", "A", "H", "H", "H", "A", "B", "B"
    ),
    levels = c("A", "B", "H")
  )
)

# row_n is a made-up ID column so the result below can be compared with the
# desired output from the question.
# Keep the rows where M1 and M2 differ. The comparison is done on character
# values rather than on the factors themselves: `!=` on two factors whose
# level sets differ raises an error, while a character comparison works for
# any combination of levels.
mismatched <- subset(DF, as.character(M1) != as.character(M2))
mismatched
#     row_n M1 M2
# 2   10_B6  A  H
# 5   10_C1  A  B
# 10  10_D8  H  A
# 11 10_D11  A  H
# 13  11_A3  A  B
# 14  11_A7  A  B
# 17  11_B7  H  A
# 18  11_B9  B  A
# 20  11_C1  A  H
# 21  11_C8  A  H
# 22  11_C9  H  A
# 23  11_D3  A  B
# 24  11_D4  A  B

# Number of rows whose M1 and M2 values differ.
nrow(mismatched)
# [1] 13

注意

# Comparing two factor columns with `==` / `!=` fails when their level sets
# differ, so convert the columns to character first (this assumes the data set
# is all letters).
# A concise way is to convert every column at once; assigning into `DF[]`
# keeps DF a data frame instead of replacing it with a plain list.

DF[] <- lapply(DF, as.character)

# Or you could do as @Frank suggested.