我正在尝试清理数据集,以便更方便地对其进行分析。
我有一个看起来像这样的数据集:
z a b c d a_1 b_2 c_3 d_4 ab_1 ab_2
Participant1 A 1 3 4 3 2 3 6 7 2 1
Participant2 B 3 2 4 5 6 4 2 1 2 1
Participant3 C 1 3 5 4 2 2 1 2 3 4
Participant4 D 1 3 8 1 3 4 5 4 5 2
我想为每组测试变量各添加一列平均得分。为此,我使用了以下代码:
# Group the rows by ResponseId, then add one mean column per set of item columns.
# Inside each mutate(), c(...) concatenates the listed columns (as seen within the
# current group) into a single vector before mean() is taken.
# NOTE(review): as.numeric() applied to factor data yields the integer level codes,
# not the numbers shown as labels — confirm the column types before trusting the means.
CleanData <- CleanData%>%
group_by(ResponseId) %>%
# MeanA: mean over columns a, b, c, d
mutate(MeanA = mean(as.numeric(c(a, b, c, d)))) %>%
# MeanB: mean over columns a_1, b_2, c_3, d_4
mutate(MeanB = mean(as.numeric(c(a_1, b_2, c_3, d_4)))) %>%
# MeanC: mean over columns ab_1, ab_2
mutate(MeanC = mean(as.numeric(c(ab_1, ab_2))))
使用上述代码,我能够在现有数据帧中添加三个新列(MeanA,MeanB和MeanC),即:
z a b c d a_1 b_2 c_3 d_4 ab_1 ab_2 MeanA MeanB MeanC
Participant1 A 1 3 4 3 2 3 6 7 2 1 2.75 4.5 1.5
Participant2 B 3 2 4 5 6 4 2 1 2 1 x x x
Participant3 C 1 3 5 4 2 2 1 2 3 4 x x x
Participant4 D 1 3 8 1 3 4 5 4 5 2 x x x
但是,均值列未显示正确的数字。例如,值ab_1:3和ab_2:4的平均值应返回3.5,但该列显示为5。有人可以帮我理解为什么会这样吗?
谢谢
更新: 我发现 a_1、b_2、c_3、d_4、ab_1、ab_2 这些列属于因子(factor)类型,因此我尝试通过添加例如下面的代码将其转换为数字:
as.numeric(as.character(mean(c(ab_1, ab_2))))
但是它仍然不能返回正确的值!
更新2: dput 返回以下内容:
structure(list(ResponseId = structure(c(32L, 40L, 113L, 43L), .Label = c("R_10UwSkTd0ZY0I6s",
"R_12z514ODiHL80r6", "R_1Bx5NSYrdqhaIuJ", "R_1C8bt9iRS6L1817",
"R_1C91Jv4yYTt8VsS", "R_1Cj5l50lu2VKcxg", "R_1DD2zbJPiueDijH",
"R_1FA36tDxZEUqBJM", "R_1FLH4KQaKlxMn77", "R_1FeUYLmPoDpKnOb",
"R_1GAXJ8YmAJflif4", "R_1GE4YeIfEuZuUYX", "R_1GyaiQcync3sHMJ",
"R_1IRUh7uRVEI3Op1", "R_1KjzzPz48WkuCFk", "R_1LcddBAuXLiBDgo",
"R_1M65lFIT9h0TrpQ", "R_1NCw01UV09WSGSX", "R_1QlN6pk1IH5xXqn",
"R_1XMp4A95WZLJmWl", "R_1dBn0Bb9ZPprg3R", "R_1dcu8FxLC7E2tq8",
"R_1dgBt0IGuRjDD4W", "R_1eRGDucOhbsOkgI", "R_1eXovzU5BJzWdt9",
"R_1g5Unx7VkpjcHEe", "R_1gtnPQhBie1xBLq", "R_1guMjae0AF4k9yh",
"R_1j6xFfGDt6QCgc3", "R_1jOf096UI4ManbG", "R_1l3smK23l8TPmso",
"R_1ohW9lO1Qw7D2Nt", "R_1pDO3wCW2J7X9Mw", "R_1qU6JvhOWUPen4Y",
"R_23e2rcSv4MzoilL", "R_27W3bSNMVuVHxLl", "R_2Bqy2lJu3vKgTkC",
"R_2PiG0aapXGpeAhi", "R_2PjUnRQM4nSaYuO", "R_2QnhIx6nPDMKt4J",
"R_2S1saV61VMsVhzA", "R_2SjpkptLr59Pi5W", "R_2VQn7jIGZe7rSiJ",
"R_2VgwMZugDIzh2gc", "R_2Y3ylZL7Os57FQg", "R_2YtPLlRDm3W0IiF",
"R_2cjhW6bLhpKoWeD", "R_2czLWk4DA7AtUWB", "R_2dhg48SvjsPF7xi",
"R_2e3xcJj4B4sVxpu", "R_2fcI8PF5YakOibH", "R_2qqi75IiKtupT3k",
"R_2s705LLk7dRpBtu", "R_2tKAG3kO0uTAp3W", "R_2tglcBV7ZIr5fRL",
"R_2uWFRjzrewJ9lZp", "R_2w6pFIySy6z417N", "R_2wRKqbJfulfII8L",
"R_2zTr8igVCFw8pBZ", "R_2zbYrfdxoSMQDZw", "R_2zqoRmevtHcELGs",
"R_32XQhbZdKS1gZqP", "R_33C0jdizQOZOCam", "R_3CWxNxWuLPFvTzi",
"R_3DbBdhc5Z5gYa8p", "R_3ESZKQIuMZpxjWb", "R_3HhHnbzqwkjBTyr",
"R_3L76M1sxyZT8LZF", "R_3LecRRD5DH4kAHZ", "R_3NwVV3CjA62MD4l",
"R_3Oj2rtphCHA2eid", "R_3OjKwwD8cFQtLqs", "R_3e3rPFQew2oBcZB",
"R_3gT9FFFMBZ0iVhF", "R_3h6MPewodtUKKEi", "R_3lAoiHlF9KgfbFB",
"R_3lFbGqMCvw5n4bo", "R_3nAFNfIRZmI8wzE", "R_3nTXeACPBHRGkZI",
"R_3q7MZwtck0MNgI0", "R_3qwfz0yMLuWw81M", "R_3suKETbTurYGy0m",
"R_3yleFr647Wb8EQV", "R_55a2UXtA2wqJeXD", "R_5w2RvbtXFaQpLMd",
"R_6Gs5YBPSE2abloR", "R_6LFgmHhwol4jAEp", "R_9WRZaEXexVKfN4t",
"R_AMNieClr3z90yuB", "R_BtXxoHRS0zenny1", "R_Cg1tqfa6iBeV6Ct",
"R_DiBGGojPdv9dDjz", "R_DjKGN63D9nlkRoJ", "R_DzUYyku3vvPVHgd",
"R_OlJDf3crqx5ixC9", "R_T6pA7HhF1QdSffz", "R_UlL0oXl78e1DAWJ",
"R_Wk3USlMJv0smEXD", "R_XRO3jxTdfN1ATyV", "R_XyLTjQJ24ZC2WnT",
"R_Z3o0VAV64k7sUmt", "R_ZltqETejsEhSOE9", "R_b4xwkSOfYOui3nP",
"R_bPCma1oTtGdWKpr", "R_cuAayrJroDlIlep", "R_dclKNvKcXlAfHs5",
"R_dgr9ooE75LVOpWh", "R_e2oMPa2WBgFmW9H", "R_eF0zW2RqPXuWtah",
"R_eKfj3XyRPgRlEsh", "R_oXC0xDrC6uq8uEp", "R_qPowYCxUMjc2Bz3",
"R_sMtw9IU4BTmSBMt", "R_tVxiAfQAU1TpQ8p", "R_vq3xb7ppDytKW4x",
"R_x8TnBj3YBj2R345", "R_xitfzNOT9yXK1vr", "R_yJSFhJqUec5dHfH",
"Response ID", "{\"ImportId\":\"_recordId\"}"), class = "factor"),
FL_8_DO = structure(c(6L, 6L, 1L, 1L), .Label = c("", "FL_27",
"FL_28", "FL_8 - Block Randomizer - Display Order", "F",
"H", "{\"ImportId\":\"FL_8_DO\"}"), class = "factor"),
Gender = structure(c(2L, 2L, 1L, 2L), .Label = c("", "1",
"2", "what is your gender?", "{\"ImportId\":\"QID14\"}"), class = "factor"),
Age = structure(c(2L, 10L, 1L, 7L), .Label = c("", "10",
"12", "14", "20", "3", "4", "5", "6", "7", "8", "9", "how old are you?",
"{\"ImportId\":\"QID27\"}"), class = "factor"), Per_1 = c(4,
2, NA, 1), Per_2 = c(3, 2, NA, 1), A_1 = c(NA_real_,
NA_real_, NA_real_, NA_real_), A_2 = c(NA_real_, NA_real_,
NA_real_, NA_real_), A_3 = c(NA_real_, NA_real_, NA_real_,
NA_real_), A_4 = c(NA_real_, NA_real_, NA_real_, NA_real_
), A_5 = c(NA_real_, NA_real_, NA_real_, NA_real_), A_6 = c(NA_real_,
NA_real_, NA_real_, NA_real_), A_7 = c(NA_real_, NA_real_,
NA_real_, NA_real_), A_8 = c(NA_real_, NA_real_, NA_real_,
NA_real_), A_9 = c(NA_real_, NA_real_, NA_real_, NA_real_
), Mo_1 = c(NA_real_, NA_real_, NA_real_, NA_real_),
Mo_2 = c(NA_real_, NA_real_, NA_real_, NA_real_), Mo_3 = c(NA_real_,
NA_real_, NA_real_, NA_real_), Mo_4 = c(NA_real_, NA_real_,
NA_real_, NA_real_), MeanA = c(NA_real_, NA_real_, NA_real_,
NA_real_)), .Names = c("ResponseId", "FL_8_DO", "Gender",
"Age", "Per_1", "Per_2", "A_1", "A_2", "A_3",
"A_4", "A_5", "A_6", "A_7", "A_8", "A_9", "Mo_1",
"Mo_2", "Mo_3", "Mo_4", "MeanA"), row.names = c(NA,
4L), class = "data.frame")
答案 0 :(得分:0)
你好,问题在于 mutate 是对整列进行操作的。
考虑这个小例子:
# Minimal example: a tibble with two integer columns and three rows.
df <- tibble(a = 1:3, b = 4:6)
# Without rowwise(), c(a, b) is all of column a followed by all of column b,
# so mean() returns one value that is repeated on every row.
df %>% mutate(x = mean(c(a, b)))
# A tibble: 3 x 3
a b x
<int> <int> <dbl>
1 1 4 3.5
2 2 5 3.5
3 3 6 3.5
这里 mutate 所做的是把 a 和 b 两列首尾拼接在一起,得到向量 1,2,3,4,5,6,然后计算出其平均值 3.5。
要使 mutate 按行操作,需要使用 rowwise() 函数:
# rowwise() makes mutate() evaluate mean(c(a, b)) once per row instead of once
# over the whole columns. ungroup() then drops the row-wise grouping so that any
# later verbs operate on whole columns again.
df %>%
  rowwise() %>%
  mutate(x = mean(c(a, b))) %>%
  ungroup()
# A tibble: 3 x 3
a b x
<int> <int> <dbl>
1 1 4 2.5
2 2 5 3.5
3 3 6 4.5
更新:
如果您的列属于 factor(因子)类,则应首先将其转换为数字。相关问题中提出了几种转换方法。
您可以使用
# Convert the factor columns a, b, c and d to the numbers shown by their labels.
# as.numeric(levels(.x))[.x] converts each level label to a number and then indexes
# that vector by the factor's integer codes; plain as.numeric(.x) would return the
# codes themselves. across() replaces the superseded mutate_at()/vars() form.
df %>%
  mutate(across(c(a, b, c, d), ~ as.numeric(levels(.x))[.x]))
将 a、b、c 和 d 列从因子(factor)转换为数字。