有人为我提供了一段用于检验 df 中各 SPECIES 在不同 SEX 之间差异的代码。我给出的子集在 SEX 列中包含 M、F 和 U,但我只想检验 M 与 F 之间的差异。另外,有些物种只有一种性别,因此其 SEX 只有一个水平(level)。
以下代码在下面这个数据子集上可以正常运行;该子集中每个 SPECIES 都同时有 M 和 F,其中一些还带有 U:
# Chi-squared test of SEX (M vs. F only) against every measurement column,
# run separately for each SPECIES x variable combination.
#
# Expects `subset` to hold the columns RING, SPECIES, SEX, AGE, FAT plus the
# measurement columns. Requires dplyr, tidyr, purrr and broom to be attached.
#
# Returns one row per SPECIES x variable combination that can actually be
# tested, carrying the columns produced by broom::tidy() for the test.
# Combinations that cannot be tested are dropped rather than aborting the
# whole pipeline.
subset %>%
  select(-RING, -AGE, -FAT) %>%
  gather(variable, value, -SPECIES, -SEX) %>%
  # Only the M-vs-F contrast is of interest, so rows with any other SEX code
  # (e.g. "U") are removed. Missing measurements are removed here as well:
  # chisq.test() discards incomplete pairs anyway, and dropping them first
  # lets the level counts below reflect what the test will really see.
  filter(SEX %in% c("M", "F"), !is.na(value)) %>%
  group_by(SPECIES, variable) %>%
  # chisq.test(x, y) stops with "'x' and 'y' must have at least 2 levels"
  # when either vector has a single distinct value, so keep only the groups
  # where both sexes and at least two distinct values are present.
  filter(n_distinct(SEX) >= 2, n_distinct(value) >= 2) %>%
  nest() %>%
  mutate(
    chi_sq_results = map(data, ~ chisq.test(.x$SEX, .x$value)),
    tidied = map(chi_sq_results, tidy)
  ) %>%
  # Drop the remaining list-columns explicitly instead of relying on the
  # deprecated `.drop` argument of unnest().
  select(-data, -chi_sq_results) %>%
  unnest(tidied) %>%
  ungroup()
> dput(subset)
structure(list(RING = c("H8309", "K617", "A264905", "A358705",
"A432721", "O59461", "O92094", "O92095"), SPECIES = c("ACCIPITER NISUS",
"ACCIPITER NISUS", "ACROCEPHALUS SCIRPACEUS", "ACROCEPHALUS SCIRPACEUS",
"ACROCEPHALUS SCIRPACEUS", "AEGITHALOS CAUDATUS", "AEGITHALOS CAUDATUS",
"AEGITHALOS CAUDATUS"), SEX = c("M", "F", "F", "M", "U", "M",
"F", "U"), AGE = c(5L, 4L, 4L, 3L, 4L, 4L, 2L, 2L), FAT = c(0L,
0L, 2L, 2L, 6L, 0L, 0L, 0L), WEIGHT = c(141, 2885, 118, 11, 145,
64, 68, 7), WING = c(199, 232, 645, 66, 63, 57, 56, 58), WINGPRI = c(117L,
167L, NA, 50L, 48L, 42L, 38L, 44L), BEAK = c(192, 204, NA, 182,
16, 82, 796, 878), TARSUS = c(52, 622, NA, 22, 219, 138, 1654,
1785)), .Names = c("RING", "SPECIES", "SEX", "AGE", "FAT", "WEIGHT",
"WING", "WINGPRI", "BEAK", "TARSUS"), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), spec = structure(list(cols = structure(list(
RING = structure(list(), class = c("collector_character",
"collector")), SPECIES = structure(list(), class = c("collector_character",
"collector")), SEX = structure(list(), class = c("collector_character",
"collector")), AGE = structure(list(), class = c("collector_integer",
"collector")), FAT = structure(list(), class = c("collector_integer",
"collector")), WEIGHT = structure(list(), class = c("collector_number",
"collector")), WING = structure(list(), class = c("collector_number",
"collector")), WINGPRI = structure(list(), class = c("collector_integer",
"collector")), BEAK = structure(list(), class = c("collector_number",
"collector")), TARSUS = structure(list(), class = c("collector_number",
"collector"))), .Names = c("RING", "SPECIES", "SEX", "AGE",
"FAT", "WEIGHT", "WING", "WINGPRI", "BEAK", "TARSUS")), default = structure(list(), class = c("collector_guess",
"collector"))), .Names = c("cols", "default"), class = "col_spec"))
但是,当我把同样的代码应用到另一个不含 U 的子集时,它无法运行;在完整的 df 上也同样无法运行。我认为它在完整 df 上失败的原因是:某些物种的 SEX 列只有一个水平,例如 SPECIES x 只有 F(雌性)。
我收到此错误:
Error in mutate_impl(.data, dots) :
Evaluation error: 'x' and 'y' must have at least 2 levels.
不包含U的子集:
> dput(subsetU)
structure(list(RING = c("H8309", "K617", "A264905", "A358705",
"O59461", "O92094"), SPECIES = c("ACCIPITER NISUS", "ACCIPITER NISUS",
"ACROCEPHALUS SCIRPACEUS", "ACROCEPHALUS SCIRPACEUS", "AEGITHALOS CAUDATUS",
"AEGITHALOS CAUDATUS"), SEX = c("M", "F", "F", "M", "M", "F"),
AGE = c(5L, 4L, 4L, 3L, 4L, 2L), FAT = c(0L, 0L, 2L, 2L,
0L, 0L), WEIGHT = c(141, 2885, 118, 11, 64, 68), WING = c(199,
232, 645, 66, 57, 56), WINGPRI = c(117L, 167L, NA, 50L, 42L,
38L), BEAK = c(192, 204, NA, 182, 82, 796), TARSUS = c(52,
622, NA, 22, 138, 1654)), .Names = c("RING", "SPECIES", "SEX",
"AGE", "FAT", "WEIGHT", "WING", "WINGPRI", "BEAK", "TARSUS"), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
希望数据是可重现的,这是我第一次这样做
谢谢
答案 0 :(得分:0)
对于连续型(度量)因变量,如果要比较两组,我会使用 t 检验(如果不满足 t 检验的假设,则使用 Wilcoxon 检验)。如果要比较两个以上的组(例如您提到的雄性、雌性和 U 三类),我会使用单因素方差分析(如果不满足方差分析的假设,则使用 Kruskal–Wallis 检验)。
set.seed(123)

# Simulated data ----
# `test` has two columns:
#   group - the grouping variable (1 or 2, alternating, 50 rows each)
#   value - a continuous measurement drawn from a normal distribution:
#           group 1 ~ N(mean = 10, sd = 5), group 2 ~ N(mean = 30, sd = 5)
# The two groups are therefore expected to differ clearly.
test <- data.frame(group = rep(c(1, 2), times = 50))
test$value <- NA
test$value[test$group == 1] <- rnorm(50, mean = 10, sd = 5)
test$value[test$group == 2] <- rnorm(50, mean = 30, sd = 5)

# Visual check of the simulated values per group.
plot(test$group, test$value)

# t-test ----
# Compare the mean of `value` between the two groups.
t.test(value ~ group, data = test)
# Recorded output:
# > t.test(formula = value ~ group, data = test)
#
# Welch Two Sample t-test
#
# data: value by group
# t = -22.452, df = 97.951, p-value < 0.00000000000000022
# alternative hypothesis: true difference in means is not equal to 0
# 95 percent confidence interval:
# -22.37724 -18.74280
# sample estimates:
# mean in group 1 mean in group 2
# 10.17202 30.73204
#
# The sample means are about 10 and 30, matching the simulation settings.
# The p-value is below 2.2e-16, so a difference this large would be
# extremely unlikely if the two groups had the same mean.

# Chi-squared test on the raw continuous values ----
chisq.test(test$group, test$value)
# Recorded output:
# > chisq.test(test$group, test$value)
#
# Pearson's Chi-squared test
#
# data: test$group and test$value
# X-squared = 100, df = 99, p-value = 0.453
#
# Warning message:
# In chisq.test(test$group, test$value) :
# Chi-Quadrat-Approximation kann inkorrekt sein
#
# The (German-locale) warning reads "Chi-squared approximation may be
# incorrect". Nearly every value occurs exactly once and in one group only,
# so the contingency table is extremely sparse; the usual rule of thumb asks
# for expected counts of at least 5 in each group x value cell.
# The reported p-value suggests no significant difference, but given the
# warning it should not be trusted.

# The contingency table makes the sparsity visible: each value appears in
# just one of the two groups.
table(test$value, test$group)

# Chi-squared test on dichotomised values ----
# Recode `value` as 1 (at or below the overall mean) or 2 (above it) and
# repeat the test on the resulting 2 x 2 table.
test$value2 <- ifelse(test$value > mean(test$value), 2, 1)
chisq.test(test$group, test$value2)
table(test$value2, test$group)
# With adequately filled cells the test now works and reports a significant
# difference between the groups in how the values fall below vs. above the
# mean.