我真的在 R Markdown 的编码部分苦苦挣扎,但没有人可以问......
我正在处理的数据是 dput(survey) 的输出:
structure(list(Time = structure(c(5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L,
25L, 26L, 27L, 28L, 1L, 2L, 3L, 4L, 35L, 42L, 46L, 30L, 31L,
33L, 34L, 29L, 36L, 37L, 38L, 39L, 40L, 41L, 43L, 44L, 45L, 47L,
48L, 32L, 54L, 55L, 50L, 49L, 51L, 52L, 53L, 57L, 59L, 56L, 60L,
61L, 58L, 62L), .Label = c("2017/08/06 10:25:01 PM GMT+10", "2017/08/06 10:26:54 PM GMT+10",
"2017/08/06 10:38:13 PM GMT+10", "2017/08/06 10:51:58 PM GMT+10",
"2017/08/06 4:53:07 PM GMT+10", "2017/08/06 4:58:44 PM GMT+10",
"2017/08/06 5:01:05 PM GMT+10", "2017/08/06 5:03:25 PM GMT+10",
"2017/08/06 5:04:50 PM GMT+10", "2017/08/06 5:06:51 PM GMT+10",
"2017/08/06 5:06:54 PM GMT+10", "2017/08/06 5:10:57 PM GMT+10",
"2017/08/06 5:11:16 PM GMT+10", "2017/08/06 5:18:21 PM GMT+10",
"2017/08/06 5:23:46 PM GMT+10", "2017/08/06 5:34:02 PM GMT+10",
"2017/08/06 5:43:10 PM GMT+10", "2017/08/06 5:54:52 PM GMT+10",
"2017/08/06 6:04:06 PM GMT+10", "2017/08/06 7:11:00 PM GMT+10",
"2017/08/06 7:13:21 PM GMT+10", "2017/08/06 7:32:45 PM GMT+10",
"2017/08/06 7:33:58 PM GMT+10", "2017/08/06 7:50:31 PM GMT+10",
"2017/08/06 8:02:07 PM GMT+10", "2017/08/06 8:28:39 PM GMT+10",
"2017/08/06 8:36:46 PM GMT+10", "2017/08/06 9:14:14 PM GMT+10",
"2017/08/07 1:59:14 PM GMT+10", "2017/08/07 10:28:13 AM GMT+10",
"2017/08/07 11:05:40 AM GMT+10", "2017/08/07 11:44:09 PM GMT+10",
"2017/08/07 12:18:04 PM GMT+10", "2017/08/07 12:49:27 PM GMT+10",
"2017/08/07 12:55:41 AM GMT+10", "2017/08/07 2:04:49 PM GMT+10",
"2017/08/07 2:14:56 PM GMT+10", "2017/08/07 2:17:10 PM GMT+10",
"2017/08/07 4:47:38 PM GMT+10", "2017/08/07 4:57:15 PM GMT+10",
"2017/08/07 7:08:44 PM GMT+10", "2017/08/07 9:12:16 AM GMT+10",
"2017/08/07 9:18:11 PM GMT+10", "2017/08/07 9:22:59 PM GMT+10",
"2017/08/07 9:23:43 PM GMT+10", "2017/08/07 9:32:10 AM GMT+10",
"2017/08/07 9:46:41 PM GMT+10", "2017/08/07 9:55:01 PM GMT+10",
"2017/08/08 1:36:16 PM GMT+10", "2017/08/08 10:27:59 AM GMT+10",
"2017/08/08 3:36:15 PM GMT+10", "2017/08/08 4:15:12 PM GMT+10",
"2017/08/08 6:39:28 PM GMT+10", "2017/08/08 8:44:38 AM GMT+10",
"2017/08/08 9:03:07 AM GMT+10", "2017/08/09 1:00:16 PM GMT+10",
"2017/08/09 10:17:55 AM GMT+10", "2017/08/09 10:26:28 PM GMT+10",
"2017/08/09 11:50:50 AM GMT+10", "2017/08/09 3:02:39 PM GMT+10",
"2017/08/09 9:48:19 PM GMT+10", "2017/08/10 7:32:00 AM GMT+10"
), class = "factor"), ID = structure(c(48L, 57L, 38L, 9L, 8L,
42L, 41L, 58L, 31L, 27L, 60L, 34L, 13L, 37L, 40L, 29L, 53L, 28L,
16L, 20L, 47L, 18L, 51L, 3L, 36L, 10L, 32L, 11L, 54L, 22L, 61L,
15L, 35L, 2L, 25L, 55L, 17L, 5L, 14L, 21L, 49L, 45L, 6L, 30L,
26L, 4L, 19L, 50L, 44L, 56L, 43L, 59L, 24L, 12L, 52L, 23L, 1L,
39L, 7L, 62L, 46L, 33L), .Label = c("1907", "3456", "450181964",
"460061490", "A", "ABCABCABC", "adsad", "affordance", "alexxx",
"AliceJ", "blueberry11", "Bob", "byue7515", "Cameron Nichols",
"Coelacanth", "crocophile", "Donald trump ", "DS2012-LB-S", "Gir",
"goly", "Grace", "greyshirt", "grob6576", "hahahahaha", "Harry",
"Insidestella", "ja150", "jane", "Jiashu Wu", "jmc", "Joohee0214",
"kakinna", "Kimbo Slice", "lhar7524", "lizebin", "Lucy", "Magician1213",
"Matchey", "md123", "mia", "MP", "N52981227", "Nattt", "Pete",
"rcon", "Ryan_eats_p-values", "S123", "Salmon ", "smarcon", "smile",
"snail", "sonja kay", "Thelimitdoesnotexist", "Toflin", "Tony Stark ",
"UriLover420", "valerie", "Whatzup", "Winky", "xwn19960829",
"zilu2637", "ZXFAARON"), class = "factor"), Gender = structure(c(3L,
2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 2L,
2L, 1L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L,
2L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L,
4L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("crocodilian",
"Female", "Male", "Poisson"), class = "factor"), Postcode =structure(c(12L,
30L, 20L, 35L, 28L, 33L, 13L, 22L, 12L, 2L, 3L, 38L, 25L, 13L,
4L, 23L, 19L, 23L, 29L, 32L, 26L, 4L, 14L, 4L, 36L, 12L, 3L,
41L, 28L, 40L, 24L, 9L, 37L, 4L, 3L, 17L, 32L, 27L, 15L, 36L,
12L, 11L, 3L, 7L, 4L, 10L, 39L, 24L, 42L, 8L, 12L, 13L, 5L, 6L,
31L, 20L, 1L, 34L, 18L, 13L, 21L, 16L), .Label = c("14052", "2000",
"2007", "2008", "2020", "2021", "2022", "2026", "2031", "2037",
"2041", "2042", "2050", "2066", "2069", "2074", "2097", "2112",
"2117", "2131", "2134", "2136", "2137", "2138", "2140", "2144",
"2154", "2165", "2166", "2171", "2193", "2200", "2205", "2209",
"2216", "2220", "2228", "2756", "2762", "2765", "2780", "sydney"
), class = "factor"), StatsCourse = structure(c(4L, 4L, 4L, 4L,
4L, 4L, 1L, 4L, 4L, 4L, 3L, 4L, 4L, 5L, 4L, 4L, 5L, 6L, 4L, 4L,
4L, 4L, 5L, 4L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 4L, 2L, 5L, 4L, 5L, 6L, 4L, 2L,
2L, 4L, 4L, 5L, 7L, 5L, 1L, 4L, 4L, 4L), .Label = c("", "BUSS1020",
"MATH1001,MATH1002", "MATH1005", "MATH1015", "MATH1905", "none"
), class = "factor"), Clubs = structure(c(1L, 1L, 4L, 5L, 4L,
2L, 4L, 4L, 2L, 4L, 7L, 2L, 4L, 4L, 1L, 4L, 1L, 4L, 1L, 1L, 6L,
1L, 4L, 1L, 11L, 4L, 5L, 10L, 3L, 5L, 2L, 4L, 1L, 1L, 2L, 1L,
4L, 4L, 4L, 6L, 2L, 2L, 4L, 4L, 9L, 4L, 1L, 8L, 2L, 4L, 2L, 6L,
4L, 4L, 11L, 5L, 1L, 1L, 1L, 4L, 4L, 1L), .Label = c("0", "1",
"10+", "2", "3", "4", "5", "6", "7", "none", "None"), class = "factor"),
StudyTime = structure(c(24L, 3L, 26L, 27L, 17L, 2L, 10L,
14L, 23L, 7L, 19L, 3L, 17L, 29L, 23L, 22L, 10L, 10L, 28L,
23L, 6L, 14L, 20L, 7L, 17L, 28L, 5L, 16L, 20L, 3L, 21L, 3L,
23L, 7L, 17L, 10L, 1L, 18L, 10L, 17L, 10L, 7L, 13L, 5L, 15L,
3L, 8L, 17L, 19L, 17L, 3L, 30L, 31L, 1L, 4L, 3L, 20L, 9L,
14L, 11L, 12L, 25L), .Label = c("0", "05-Jun", "10", "11",
"12", "14", "15", "17", "2", "20", "20-24", "20-25?", "24",
"25", "28", "28 hours", "30", "31", "35", "4", "40", "49",
"5", "50", "6", "7", "70", "8", "8hr", "didn't start uni maybe 6h",
"not sure"), class = "factor"), StudyLoad = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L,
3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("", "full-time", "part-time"), class = "factor"),
SocialMedia = structure(c(1L, 5L, 1L, 1L, 1L, 7L, 1L, 1L,
7L, 7L, 2L, 1L, 2L, 1L, 1L, 8L, 6L, 2L, 1L, 7L, 1L, 4L, 1L,
8L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 7L, 2L, 1L, 2L, 1L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 5L, 5L, 1L, 1L, 2L, 2L,
1L, 3L, 1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("Facebook",
"Instragram", "none! (really)", "reddit", "Snapchat", "Tumblr",
"Twitter", "WeChat"), class = "factor"), Siblings = structure(c(2L,
4L, 4L, 1L, 4L, 1L, 2L, 4L, 5L, 2L, 1L, 2L, 2L, 1L, 4L, 1L,
1L, 4L, 2L, 2L, 8L, 2L, 2L, 3L, 1L, 1L, 2L, 5L, 2L, 7L, 1L,
4L, 2L, 6L, 1L, 6L, 2L, 5L, 1L, 1L, 4L, 4L, 2L, 2L, 1L, 2L,
1L, 1L, 4L, 4L, 2L, 9L, 1L, 2L, 10L, 2L, 4L, 2L, 2L, 1L,
2L, 2L), .Label = c("0", "1", "165", "2", "3", "4", "5",
"6", "none", "one"), class = "factor"), FBFriends = structure(c(49L,
43L, 6L, 3L, 28L, 2L, 9L, 13L, 21L, 19L, 30L, 40L, 37L, 20L,
35L, 32L, 53L, 47L, 30L, 22L, 8L, 45L, 14L, 15L, 38L, 16L,
45L, 31L, 35L, 43L, 34L, 23L, 52L, 18L, 34L, 27L, 33L, 11L,
42L, 24L, 51L, 26L, 17L, 50L, 39L, 19L, 10L, 12L, 4L, 44L,
46L, 29L, 45L, 36L, 54L, 20L, 7L, 5L, 41L, 25L, 1L, 48L), .Label = c("~300",
"10", "100", "1000", "1127", "115", "1192", "12", "120",
"121", "130", "148", "150", "1583", "165", "170", "174",
"190", "200", "213", "228", "229", "235", "240", "242", "256",
"259", "263", "27", "300", "308", "31", "382", "40", "400",
"431", "470", "5", "540", "548", "57", "572", "600", "664",
"700", "724", "800", "850", "90", "936", "978", "do not know",
"Don't have FB", "none (not in facebook)"), class = "factor"),
Grade = structure(c(18L, 19L, 11L, 31L, 33L, 14L, 22L, 18L,
6L, 9L, 19L, 18L, 22L, 23L, 24L, 30L, 28L, 16L, 2L, 14L,
3L, 12L, 21L, 2L, 12L, 12L, 6L, 29L, 12L, 27L, 17L, 6L, 12L,
17L, 17L, 15L, 24L, 20L, 7L, 14L, 12L, 10L, 22L, 34L, 24L,
17L, 16L, 12L, 24L, 32L, 26L, 25L, 26L, 13L, 4L, 12L, 1L,
5L, 12L, 8L, 24L, 35L), .Label = c("2.8", "50", "50-60",
"54", "6.25", "60", "61", "61.5", "62", "63", "64", "65",
"65.9", "66", "68", "69", "70", "72", "73", "73.2", "73.4",
"74", "74.6", "75", "8.7", "80", "82", "82.4", "83.2", "87",
"90", "90.1", "90.5", "91", "D"), class = "factor"), Pet = structure(c(3L,
2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 3L, 2L, 3L, 3L,
2L, 3L, 3L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L,
2L, 3L, 2L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 2L,
3L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 2L,
3L), .Label = c("", "No", "Yes"), class = "factor"), Home = structure(c(2L,
3L, 3L, 1L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L,
2L, 2L, 3L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 3L,
3L), .Label = c("", "No", "Yes"), class = "factor"), ExerciseTime = structure(c(10L,
12L, 7L, 1L, 4L, 7L, 7L, 5L, 7L, 12L, 13L, 5L, 10L, 7L, 15L,
15L, 10L, 10L, 5L, 14L, 2L, 9L, 4L, 5L, 7L, 4L, 14L, 8L,
10L, 13L, 1L, 13L, 1L, 13L, 13L, 5L, 7L, 16L, 16L, 14L, 10L,
14L, 7L, 6L, 12L, 10L, 10L, 13L, 13L, 14L, 7L, 11L, 2L, 2L,
17L, 16L, 7L, 7L, 2L, 3L, 13L, 15L), .Label = c("", "0",
"05-Jun", "1", "10", "12", "2", "2 hours", "20", "3", "3.5",
"4", "5", "6", "7", "8", "none"), class = "factor"), Eyecolor = structure(c(9L,
7L, 5L, 1L, 8L, 2L, 8L, 3L, 3L, 8L, 3L, 7L, 7L, 7L, 7L, 7L,
3L, 4L, 7L, 3L, 11L, 8L, 11L, 2L, 8L, 2L, 2L, 2L, 8L, 7L,
1L, 7L, 2L, 7L, 3L, 4L, 10L, 7L, 8L, 7L, 7L, 6L, 7L, 3L,
8L, 2L, 8L, 7L, 4L, 8L, 9L, 3L, 7L, 5L, 7L, 8L, 12L, 7L,
7L, 8L, 3L, 8L), .Label = c("", "black", "Black", "blue",
"Blue", "Blue/Green", "brown", "Brown", "Brown ", "Brown/black",
"dark brown", "grey"), class = "factor"), Working = structure(c(2L,
8L, 2L, 1L, 4L, 2L, 2L, 8L, 2L, 24L, 2L, 13L, 5L, 3L, 26L,
2L, 8L, 13L, 24L, 2L, 12L, 2L, 9L, 8L, 2L, 2L, 2L, 11L, 2L,
10L, 1L, 4L, 21L, 2L, 2L, 15L, 14L, 21L, 26L, 18L, 4L, 2L,
7L, 27L, 12L, 2L, 20L, 2L, 19L, 25L, 8L, 2L, 2L, 17L, 23L,
16L, 2L, 6L, 2L, 13L, 13L, 22L), .Label = c("", "0", "1.5",
"10", "11", "12", "14", "15", "17", "18", "18 hours", "2",
"20", "24", "25", "26", "3", "3.5", "30", "38", "4", "40",
"44", "5", "6", "7", "8"), class = "factor"), Season = structure(c(2L,
3L, 2L, 1L, 5L, 2L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 3L, 4L, 3L,
3L, 3L, 3L, 5L, 3L, 3L, 2L, 5L, 5L, 4L, 2L, 2L, 5L, 2L, 3L,
2L, 2L, 3L, 2L, 4L, 2L, 3L, 5L, 3L, 4L, 5L, 3L, 4L, 4L, 4L,
3L, 4L, 4L, 4L, 3L, 2L, 2L, 2L, 3L, 4L, 4L, 3L, 2L, 4L, 4L,
3L), .Label = c("", "Autumn", "Spring", "Summer", "Winter"
), class = "factor")), .Names = c("Time", "ID", "Gender",
"Postcode", "StatsCourse", "Clubs", "StudyTime", "StudyLoad",
"SocialMedia", "Siblings", "FBFriends", "Grade", "Pet", "Home",
"ExerciseTime", "Eyecolor", "Working", "Season"), class = "data.frame", row.names = c(NA,
-62L))
到目前为止我做的是,
library(dplyr)
library(ggplot2)
library(tidyr)
library(knitr)
# Load the raw survey responses (path is relative to the working directory).
survey <- read.csv("STAT2012Survey.csv")

# Show the original column headers before they are replaced, and keep a copy.
colnames(survey)
oldname <- colnames(survey)

# Short replacement names for the 18 survey columns, in file order.
newname <- c(
  "Time", "ID", "Gender", "Postcode", "StatsCourse", "Clubs", "StudyTime",
  "StudyLoad", "SocialMedia", "Siblings", "FBFriends", "Grade", "Pet", "Home",
  "ExerciseTime", "Eyecolor", "Working", "Season"
)

# Fail loudly if the CSV layout differs from what the names assume, instead of
# silently mislabelling columns.
stopifnot(length(newname) == ncol(survey))
colnames(survey) <- newname
我想要实现的是,针对下面这个问题进行假设检验:
“有没有证据表明男性和女性的运动时间存在差异?”
要做到这一点,我需要得到两组的均值和标准差,以便进行双样本 t 检验,但我不知道该如何着手。
另外,为了用图表可视化数据,我试过了,
# Boxplot of ExerciseTime by Gender, filled by Gender, drawn from the raw
# survey data.
# NOTE(review): in the dput() output ExerciseTime is a factor, so the y axis
# is presumably treated as discrete here rather than numeric.
ggplot(
  data = survey,
  mapping = aes(
    x = Gender,
    y = ExerciseTime,
    fill = Gender
  )
) +
  geom_boxplot()
然而,它只显示了一些奇怪的图表。我认为这是因为“运动时间”(ExerciseTime)变量不是数值型,但我也卡在了这里,因为ggplot2不处理numeric类的数据......
有人能帮帮我吗......!我想对多个问题进行更多的假设检验,但我仍然卡在第一个问题上......如果我知道如何做第一个问题,我或许就可以实现目标了!谢谢。
答案 0 :(得分:1)
在制作箱形图之前,您需要将 ExerciseTime 转换为数值变量。您会遇到的问题是,有些回答不容易直接变为数字(例如 2 hours 应该是 2,但需要额外的步骤才能删除文本)。
首先,让我们做一个最简单的事情,即采取任何不是自然数的东西,让它变为缺失值。
# stringr provides str_replace() and str_extract(); dplyr does not export them,
# so it has to be attached before the pipeline below can run.
library(stringr)

# Copy of `survey` in which ExerciseTime is a numeric column; responses that
# contain no usable number become NA.
survey2 <-
  survey %>%
  mutate(
    # Go through character first: as.numeric() on a factor would return the
    # internal level codes, not the values the respondents typed.
    ExerciseTime = as.character(ExerciseTime),
    # Blank out entries shaped like "05-Jun" (presumably a range such as 5-6
    # that a spreadsheet auto-converted to a date), so they end up as NA.
    ExerciseTime = str_replace(ExerciseTime, "\\d{2}-\\w{3}", ""),
    # Keep the first number in the response, including any decimal part:
    # "2 hours" -> "2", "3.5" -> "3.5"; text with no digits -> NA.
    ExerciseTime = str_extract(ExerciseTime, "\\d+(\\.\\d+)?"),
    ExerciseTime = as.numeric(ExerciseTime)
  )
# Boxplot of exercise time by gender.
# Plot `survey2` (the copy with numeric ExerciseTime), not the raw `survey`,
# whose ExerciseTime column has not been converted.
ggplot(
  data = survey2,
  mapping = aes(
    x = Gender,
    y = ExerciseTime,
    fill = Gender
  )
) +
  geom_boxplot()