我有以下数据框:
# Example data: a 36-row data frame (the literal looks like dput() output).
# - group1 / group2: factors stored as integer codes; both use the same nine
#   region labels in the same order. Code 1 ("NEW ENGLAND") never occurs in
#   group1 and code 9 ("PACIFIC") never occurs in group2.
# - Sample_Size_group1 / Sample_Size_group2: integer sample sizes; in this data
#   each region always carries the same value.
# - row.names are non-consecutive integers (1-8, 25-31, 46-51, ...), presumably
#   left over from subsetting a larger data frame.
df <- structure(list(group1 = structure(c(2L, 8L, 5L, 3L, 4L, 6L, 7L,
9L, 6L, 3L, 9L, 7L, 8L, 4L, 5L, 4L, 9L, 6L, 7L, 8L, 5L, 7L, 8L,
9L, 5L, 6L, 7L, 6L, 9L, 8L, 9L, 7L, 8L, 9L, 8L, 9L), .Label = c("NEW ENGLAND",
"MIDDLE ATLANTIC", "E. NOR. CENTRAL", "W. NOR. CENTRAL", "SOUTH ATLANTIC",
"E. SOU. CENTRAL", "W. SOU. CENTRAL", "MOUNTAIN", "PACIFIC"), class = "factor"),
Sample_Size_group1 = c(220L, 157L, 372L, 331L, 127L, 135L,
196L, 267L, 135L, 331L, 267L, 196L, 157L, 127L, 372L, 127L,
267L, 135L, 196L, 157L, 372L, 196L, 157L, 267L, 372L, 135L,
196L, 135L, 267L, 157L, 267L, 196L, 157L, 267L, 157L, 267L),
group2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 8L), .Label = c("NEW ENGLAND",
"MIDDLE ATLANTIC", "E. NOR. CENTRAL", "W. NOR. CENTRAL",
"SOUTH ATLANTIC", "E. SOU. CENTRAL", "W. SOU. CENTRAL", "MOUNTAIN",
"PACIFIC"), class = "factor"),
Sample_Size_group2 = c(120L, 120L, 120L, 120L, 120L, 120L, 120L, 120L, 220L, 220L, 220L,
220L, 220L, 220L, 220L, 331L, 331L, 331L, 331L, 331L, 331L,
127L, 127L, 127L, 127L, 127L, 372L, 372L, 372L, 372L, 135L,
135L, 135L, 196L, 196L, 157L)), .Names = c("group1", "Sample_Size_group1",
"group2", "Sample_Size_group2"), row.names = c(1L, 2L, 3L, 4L,
5L, 6L, 7L, 8L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 46L, 47L,
48L, 49L, 50L, 51L, 64L, 65L, 66L, 67L, 68L, 79L, 80L, 81L, 82L,
91L, 92L, 93L, 100L, 101L, 106L), class = "data.frame")
group1 和 group2 这两个变量具有相同的因子水平,且顺序相同(请注意,并非所有水平都在每个变量中出现):
levels(df$group1)
[1] "NEW ENGLAND" "MIDDLE ATLANTIC" "E. NOR. CENTRAL" "W. NOR. CENTRAL" "SOUTH ATLANTIC" "E. SOU. CENTRAL"
[7] "W. SOU. CENTRAL" "MOUNTAIN" "PACIFIC"
levels(df$group2)
[1] "NEW ENGLAND" "MIDDLE ATLANTIC" "E. NOR. CENTRAL" "W. NOR. CENTRAL" "SOUTH ATLANTIC" "E. SOU. CENTRAL"
[7] "W. SOU. CENTRAL" "MOUNTAIN" "PACIFIC"
我想把这两个因子的水平与各自的样本量组合起来,生成新的水平,如下所示:
# Label every row as "<region> (n=<sample size>)"; paste0() yields character vectors
df[["newgroup1"]] <- with(df, paste0(group1, " (n=", Sample_Size_group1, ")"))
df[["newgroup2"]] <- with(df, paste0(group2, " (n=", Sample_Size_group2, ")"))
这会创建字符向量,所以我想把它们变成因子:
# Convert both label columns to factors in one step. Because no `levels` are
# supplied, factor() orders the levels by sorting the unique labels.
df[c("newgroup1", "newgroup2")] <- lapply(df[c("newgroup1", "newgroup2")], factor)
但现在因子水平不再相同,也不是原始顺序。
levels(df$newgroup1)
[1] "E. NOR. CENTRAL (n=331)" "E. SOU. CENTRAL (n=135)" "MIDDLE ATLANTIC (n=220)" "MOUNTAIN (n=157)"
[5] "PACIFIC (n=267)" "SOUTH ATLANTIC (n=372)" "W. NOR. CENTRAL (n=127)" "W. SOU. CENTRAL (n=196)"
levels(df$newgroup2)
[1] "E. NOR. CENTRAL (n=331)" "E. SOU. CENTRAL (n=135)" "MIDDLE ATLANTIC (n=220)" "MOUNTAIN (n=157)"
[5] "NEW ENGLAND (n=120)" "SOUTH ATLANTIC (n=372)" "W. NOR. CENTRAL (n=127)" "W. SOU. CENTRAL (n=196)"
以下是我希望两个向量都具有的水平:
[1] "NEW ENGLAND (n=120)" "MIDDLE ATLANTIC (n=220)" "E. NOR. CENTRAL (n=331)" "W. NOR. CENTRAL (n=127)"
[5] "SOUTH ATLANTIC (n=372)" "E. SOU. CENTRAL (n=135)" "W. SOU. CENTRAL (n=196)" "MOUNTAIN (n=157)"
[9] "PACIFIC (n=267)"
我知道可以轻松地手动完成此操作,尤其是借助 forcats 这样的包,但如何在基础 R 中以编程方式实现呢?
答案 0 :(得分:1)
以下是使用 grep 将原始水平与新生成的唯一标签进行匹配的解决方案。之后,按匹配得到的索引对这些标签排序,以创建新的水平。
# Create newgroup1 and newgroup2 as "<region> (n=<sample size>)" labels
df$newgroup1 <- paste0(df$group1, " (n=", df$Sample_Size_group1, ")")
df$newgroup2 <- paste0(df$group2, " (n=", df$Sample_Size_group2, ")")
# Original level order that the new levels must follow
LEV <- levels(df$group1)
# Every distinct label that occurs in either column
New_element <- unique(c(df$newgroup1, df$newgroup2))
# For each original level (in order), find the position of the label(s)
# containing it.
# - fixed = TRUE: match the level name literally; names such as
#   "E. NOR. CENTRAL" contain ".", which is a wildcard in a regular expression.
# - unlist(lapply()) rather than sapply(): sapply() returns a list when some
#   level matches no label or several labels, and a list cannot be used as an
#   index below. Levels without a match are simply skipped.
# - unique(): a label is kept once even if more than one level name occurs in it.
index <- unique(unlist(lapply(LEV, grep, x = New_element, fixed = TRUE)))
# Use New_element[index] as the common level set for both new factors
df$newgroup1 <- factor(df$newgroup1, levels = New_element[index])
df$newgroup2 <- factor(df$newgroup2, levels = New_element[index])
# Check the results: both factors now share the same levels in the original order
levels(df$newgroup1)
levels(df$newgroup2)
答案 1 :(得分:0)
使用 unique 得到原始水平与新标签的一一对应表,按原始因子排序,然后用该顺序来定义新因子:
# Lookup tables pairing each original level with its label, one per column,
# sorted by the original factor order.
group1_levels <- unique(df[, c("group1", "newgroup1")])
group1_levels <- group1_levels[order(group1_levels$group1), ]
group2_levels <- unique(df[, c("group2", "newgroup2")])
group2_levels <- group2_levels[order(group2_levels$group2), ]
# Pool the two tables so that both new factors get the same level set.
# Building the levels per column would drop any region that is missing from
# that column (here: NEW ENGLAND from newgroup1, PACIFIC from newgroup2).
all_levels <- unique(rbind(
  setNames(group1_levels, c("group", "label")),
  setNames(group2_levels, c("group", "label"))
))
all_levels <- all_levels[order(all_levels$group), ]
# as.character(): the labels may already be factors if they were converted earlier
new_levels <- unique(as.character(all_levels$label))
df$newgroup1 <- factor(df$newgroup1, levels = new_levels)
df$newgroup2 <- factor(df$newgroup2, levels = new_levels)