R代码用于Rmarkdown中的假设检验

时间:2017-08-31 10:22:12

标签: r ggplot2 r-markdown

我真的在 R Markdown 的编码部分苦苦挣扎,但没有人可以请教......

我正在处理的数据是 dput(survey) 的输出:

structure(list(Time = structure(c(5L, 6L, 7L, 8L, 9L, 10L, 11L, 
 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 
 25L, 26L, 27L, 28L, 1L, 2L, 3L, 4L, 35L, 42L, 46L, 30L, 31L, 
 33L, 34L, 29L, 36L, 37L, 38L, 39L, 40L, 41L, 43L, 44L, 45L, 47L, 
 48L, 32L, 54L, 55L, 50L, 49L, 51L, 52L, 53L, 57L, 59L, 56L, 60L, 
 61L, 58L, 62L), .Label = c("2017/08/06 10:25:01 PM GMT+10", "2017/08/06 10:26:54 PM GMT+10", 
 "2017/08/06 10:38:13 PM GMT+10", "2017/08/06 10:51:58 PM GMT+10", 
 "2017/08/06 4:53:07 PM GMT+10", "2017/08/06 4:58:44 PM GMT+10", 
 "2017/08/06 5:01:05 PM GMT+10", "2017/08/06 5:03:25 PM GMT+10", 
 "2017/08/06 5:04:50 PM GMT+10", "2017/08/06 5:06:51 PM GMT+10", 
 "2017/08/06 5:06:54 PM GMT+10", "2017/08/06 5:10:57 PM GMT+10", 
 "2017/08/06 5:11:16 PM GMT+10", "2017/08/06 5:18:21 PM GMT+10", 
 "2017/08/06 5:23:46 PM GMT+10", "2017/08/06 5:34:02 PM GMT+10", 
 "2017/08/06 5:43:10 PM GMT+10", "2017/08/06 5:54:52 PM GMT+10", 
 "2017/08/06 6:04:06 PM GMT+10", "2017/08/06 7:11:00 PM GMT+10", 
 "2017/08/06 7:13:21 PM GMT+10", "2017/08/06 7:32:45 PM GMT+10", 
 "2017/08/06 7:33:58 PM GMT+10", "2017/08/06 7:50:31 PM GMT+10", 
 "2017/08/06 8:02:07 PM GMT+10", "2017/08/06 8:28:39 PM GMT+10", 
 "2017/08/06 8:36:46 PM GMT+10", "2017/08/06 9:14:14 PM GMT+10", 
 "2017/08/07 1:59:14 PM GMT+10", "2017/08/07 10:28:13 AM GMT+10", 
 "2017/08/07 11:05:40 AM GMT+10", "2017/08/07 11:44:09 PM GMT+10", 
 "2017/08/07 12:18:04 PM GMT+10", "2017/08/07 12:49:27 PM GMT+10", 
 "2017/08/07 12:55:41 AM GMT+10", "2017/08/07 2:04:49 PM GMT+10", 
 "2017/08/07 2:14:56 PM GMT+10", "2017/08/07 2:17:10 PM GMT+10", 
 "2017/08/07 4:47:38 PM GMT+10", "2017/08/07 4:57:15 PM GMT+10", 
 "2017/08/07 7:08:44 PM GMT+10", "2017/08/07 9:12:16 AM GMT+10", 
 "2017/08/07 9:18:11 PM GMT+10", "2017/08/07 9:22:59 PM GMT+10", 
 "2017/08/07 9:23:43 PM GMT+10", "2017/08/07 9:32:10 AM GMT+10", 
 "2017/08/07 9:46:41 PM GMT+10", "2017/08/07 9:55:01 PM GMT+10", 
 "2017/08/08 1:36:16 PM GMT+10", "2017/08/08 10:27:59 AM GMT+10", 
 "2017/08/08 3:36:15 PM GMT+10", "2017/08/08 4:15:12 PM GMT+10", 
 "2017/08/08 6:39:28 PM GMT+10", "2017/08/08 8:44:38 AM GMT+10", 
 "2017/08/08 9:03:07 AM GMT+10", "2017/08/09 1:00:16 PM GMT+10", 
 "2017/08/09 10:17:55 AM GMT+10", "2017/08/09 10:26:28 PM GMT+10", 
 "2017/08/09 11:50:50 AM GMT+10", "2017/08/09 3:02:39 PM GMT+10", 
 "2017/08/09 9:48:19 PM GMT+10", "2017/08/10 7:32:00 AM GMT+10"
 ), class = "factor"), ID = structure(c(48L, 57L, 38L, 9L, 8L, 
 42L, 41L, 58L, 31L, 27L, 60L, 34L, 13L, 37L, 40L, 29L, 53L, 28L, 
 16L, 20L, 47L, 18L, 51L, 3L, 36L, 10L, 32L, 11L, 54L, 22L, 61L, 
 15L, 35L, 2L, 25L, 55L, 17L, 5L, 14L, 21L, 49L, 45L, 6L, 30L, 
 26L, 4L, 19L, 50L, 44L, 56L, 43L, 59L, 24L, 12L, 52L, 23L, 1L, 
 39L, 7L, 62L, 46L, 33L), .Label = c("1907", "3456", "450181964", 
 "460061490", "A", "ABCABCABC", "adsad", "affordance", "alexxx", 
 "AliceJ", "blueberry11", "Bob", "byue7515", "Cameron Nichols", 
 "Coelacanth", "crocophile", "Donald trump ", "DS2012-LB-S", "Gir", 
 "goly", "Grace", "greyshirt", "grob6576", "hahahahaha", "Harry", 
 "Insidestella", "ja150", "jane", "Jiashu Wu", "jmc", "Joohee0214", 
 "kakinna", "Kimbo Slice", "lhar7524", "lizebin", "Lucy", "Magician1213", 
 "Matchey", "md123", "mia", "MP", "N52981227", "Nattt", "Pete", 
 "rcon", "Ryan_eats_p-values", "S123", "Salmon ", "smarcon", "smile", 
 "snail", "sonja kay", "Thelimitdoesnotexist", "Toflin", "Tony Stark ", 
 "UriLover420", "valerie", "Whatzup", "Winky", "xwn19960829", 
 "zilu2637", "ZXFAARON"), class = "factor"), Gender = structure(c(3L, 
 2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 3L, 2L, 
 2L, 1L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 
 2L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 
 4L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("crocodilian", 
 "Female", "Male", "Poisson"), class = "factor"), Postcode =structure(c(12L, 
 30L, 20L, 35L, 28L, 33L, 13L, 22L, 12L, 2L, 3L, 38L, 25L, 13L, 
 4L, 23L, 19L, 23L, 29L, 32L, 26L, 4L, 14L, 4L, 36L, 12L, 3L, 
 41L, 28L, 40L, 24L, 9L, 37L, 4L, 3L, 17L, 32L, 27L, 15L, 36L, 
 12L, 11L, 3L, 7L, 4L, 10L, 39L, 24L, 42L, 8L, 12L, 13L, 5L, 6L, 
 31L, 20L, 1L, 34L, 18L, 13L, 21L, 16L), .Label = c("14052", "2000", 
 "2007", "2008", "2020", "2021", "2022", "2026", "2031", "2037", 
 "2041", "2042", "2050", "2066", "2069", "2074", "2097", "2112", 
 "2117", "2131", "2134", "2136", "2137", "2138", "2140", "2144", 
 "2154", "2165", "2166", "2171", "2193", "2200", "2205", "2209", 
 "2216", "2220", "2228", "2756", "2762", "2765", "2780", "sydney"
 ), class = "factor"), StatsCourse = structure(c(4L, 4L, 4L, 4L, 
 4L, 4L, 1L, 4L, 4L, 4L, 3L, 4L, 4L, 5L, 4L, 4L, 5L, 6L, 4L, 4L, 
 4L, 4L, 5L, 4L, 5L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
 4L, 4L, 4L, 4L, 4L, 4L, 4L, 6L, 4L, 2L, 5L, 4L, 5L, 6L, 4L, 2L, 
 2L, 4L, 4L, 5L, 7L, 5L, 1L, 4L, 4L, 4L), .Label = c("", "BUSS1020", 
 "MATH1001,MATH1002", "MATH1005", "MATH1015", "MATH1905", "none"
 ), class = "factor"), Clubs = structure(c(1L, 1L, 4L, 5L, 4L, 
 2L, 4L, 4L, 2L, 4L, 7L, 2L, 4L, 4L, 1L, 4L, 1L, 4L, 1L, 1L, 6L, 
 1L, 4L, 1L, 11L, 4L, 5L, 10L, 3L, 5L, 2L, 4L, 1L, 1L, 2L, 1L, 
 4L, 4L, 4L, 6L, 2L, 2L, 4L, 4L, 9L, 4L, 1L, 8L, 2L, 4L, 2L, 6L, 
 4L, 4L, 11L, 5L, 1L, 1L, 1L, 4L, 4L, 1L), .Label = c("0", "1", 
 "10+", "2", "3", "4", "5", "6", "7", "none", "None"), class = "factor"), 
     StudyTime = structure(c(24L, 3L, 26L, 27L, 17L, 2L, 10L, 
     14L, 23L, 7L, 19L, 3L, 17L, 29L, 23L, 22L, 10L, 10L, 28L, 
     23L, 6L, 14L, 20L, 7L, 17L, 28L, 5L, 16L, 20L, 3L, 21L, 3L, 
     23L, 7L, 17L, 10L, 1L, 18L, 10L, 17L, 10L, 7L, 13L, 5L, 15L, 
     3L, 8L, 17L, 19L, 17L, 3L, 30L, 31L, 1L, 4L, 3L, 20L, 9L, 
     14L, 11L, 12L, 25L), .Label = c("0", "05-Jun", "10", "11", 
     "12", "14", "15", "17", "2", "20", "20-24", "20-25?", "24", 
     "25", "28", "28 hours", "30", "31", "35", "4", "40", "49", 
     "5", "50", "6", "7", "70", "8", "8hr", "didn't start uni maybe 6h", 
     "not sure"), class = "factor"), StudyLoad = structure(c(2L, 
     2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
     2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 
     3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 
     3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 
     2L), .Label = c("", "full-time", "part-time"), class = "factor"), 
     SocialMedia = structure(c(1L, 5L, 1L, 1L, 1L, 7L, 1L, 1L, 
     7L, 7L, 2L, 1L, 2L, 1L, 1L, 8L, 6L, 2L, 1L, 7L, 1L, 4L, 1L, 
     8L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 7L, 2L, 1L, 2L, 1L, 1L, 2L, 
     1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 5L, 5L, 1L, 1L, 2L, 2L, 
     1L, 3L, 1L, 2L, 2L, 1L, 2L, 1L, 1L), .Label = c("Facebook", 
     "Instragram", "none! (really)", "reddit", "Snapchat", "Tumblr", 
     "Twitter", "WeChat"), class = "factor"), Siblings = structure(c(2L, 
     4L, 4L, 1L, 4L, 1L, 2L, 4L, 5L, 2L, 1L, 2L, 2L, 1L, 4L, 1L, 
     1L, 4L, 2L, 2L, 8L, 2L, 2L, 3L, 1L, 1L, 2L, 5L, 2L, 7L, 1L, 
     4L, 2L, 6L, 1L, 6L, 2L, 5L, 1L, 1L, 4L, 4L, 2L, 2L, 1L, 2L, 
     1L, 1L, 4L, 4L, 2L, 9L, 1L, 2L, 10L, 2L, 4L, 2L, 2L, 1L, 
     2L, 2L), .Label = c("0", "1", "165", "2", "3", "4", "5", 
     "6", "none", "one"), class = "factor"), FBFriends = structure(c(49L, 
     43L, 6L, 3L, 28L, 2L, 9L, 13L, 21L, 19L, 30L, 40L, 37L, 20L, 
     35L, 32L, 53L, 47L, 30L, 22L, 8L, 45L, 14L, 15L, 38L, 16L, 
     45L, 31L, 35L, 43L, 34L, 23L, 52L, 18L, 34L, 27L, 33L, 11L, 
     42L, 24L, 51L, 26L, 17L, 50L, 39L, 19L, 10L, 12L, 4L, 44L, 
     46L, 29L, 45L, 36L, 54L, 20L, 7L, 5L, 41L, 25L, 1L, 48L), .Label = c("~300", 
     "10", "100", "1000", "1127", "115", "1192", "12", "120", 
     "121", "130", "148", "150", "1583", "165", "170", "174", 
     "190", "200", "213", "228", "229", "235", "240", "242", "256", 
     "259", "263", "27", "300", "308", "31", "382", "40", "400", 
     "431", "470", "5", "540", "548", "57", "572", "600", "664", 
     "700", "724", "800", "850", "90", "936", "978", "do not know", 
     "Don't have FB", "none (not in facebook)"), class = "factor"), 
     Grade = structure(c(18L, 19L, 11L, 31L, 33L, 14L, 22L, 18L, 
     6L, 9L, 19L, 18L, 22L, 23L, 24L, 30L, 28L, 16L, 2L, 14L, 
     3L, 12L, 21L, 2L, 12L, 12L, 6L, 29L, 12L, 27L, 17L, 6L, 12L, 
     17L, 17L, 15L, 24L, 20L, 7L, 14L, 12L, 10L, 22L, 34L, 24L, 
     17L, 16L, 12L, 24L, 32L, 26L, 25L, 26L, 13L, 4L, 12L, 1L, 
     5L, 12L, 8L, 24L, 35L), .Label = c("2.8", "50", "50-60", 
     "54", "6.25", "60", "61", "61.5", "62", "63", "64", "65", 
     "65.9", "66", "68", "69", "70", "72", "73", "73.2", "73.4", 
     "74", "74.6", "75", "8.7", "80", "82", "82.4", "83.2", "87", 
     "90", "90.1", "90.5", "91", "D"), class = "factor"), Pet = structure(c(3L, 
     2L, 3L, 1L, 3L, 2L, 2L, 2L, 2L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 
     2L, 3L, 3L, 2L, 2L, 3L, 2L, 2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 
     2L, 3L, 2L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L, 3L, 3L, 2L, 
     3L, 2L, 3L, 3L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 2L, 
     3L), .Label = c("", "No", "Yes"), class = "factor"), Home = structure(c(2L, 
     3L, 3L, 1L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 2L, 3L, 
     3L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
     2L, 2L, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 2L, 2L, 
     2L, 2L, 3L, 3L, 2L, 2L, 2L, 3L, 2L, 3L, 2L, 3L, 3L, 2L, 3L, 
     3L), .Label = c("", "No", "Yes"), class = "factor"), ExerciseTime = structure(c(10L, 
     12L, 7L, 1L, 4L, 7L, 7L, 5L, 7L, 12L, 13L, 5L, 10L, 7L, 15L, 
     15L, 10L, 10L, 5L, 14L, 2L, 9L, 4L, 5L, 7L, 4L, 14L, 8L, 
     10L, 13L, 1L, 13L, 1L, 13L, 13L, 5L, 7L, 16L, 16L, 14L, 10L, 
     14L, 7L, 6L, 12L, 10L, 10L, 13L, 13L, 14L, 7L, 11L, 2L, 2L, 
     17L, 16L, 7L, 7L, 2L, 3L, 13L, 15L), .Label = c("", "0", 
     "05-Jun", "1", "10", "12", "2", "2 hours", "20", "3", "3.5", 
     "4", "5", "6", "7", "8", "none"), class = "factor"), Eyecolor = structure(c(9L, 
     7L, 5L, 1L, 8L, 2L, 8L, 3L, 3L, 8L, 3L, 7L, 7L, 7L, 7L, 7L, 
     3L, 4L, 7L, 3L, 11L, 8L, 11L, 2L, 8L, 2L, 2L, 2L, 8L, 7L, 
     1L, 7L, 2L, 7L, 3L, 4L, 10L, 7L, 8L, 7L, 7L, 6L, 7L, 3L, 
     8L, 2L, 8L, 7L, 4L, 8L, 9L, 3L, 7L, 5L, 7L, 8L, 12L, 7L, 
     7L, 8L, 3L, 8L), .Label = c("", "black", "Black", "blue", 
     "Blue", "Blue/Green", "brown", "Brown", "Brown ", "Brown/black", 
     "dark brown", "grey"), class = "factor"), Working = structure(c(2L, 
     8L, 2L, 1L, 4L, 2L, 2L, 8L, 2L, 24L, 2L, 13L, 5L, 3L, 26L, 
     2L, 8L, 13L, 24L, 2L, 12L, 2L, 9L, 8L, 2L, 2L, 2L, 11L, 2L, 
     10L, 1L, 4L, 21L, 2L, 2L, 15L, 14L, 21L, 26L, 18L, 4L, 2L, 
     7L, 27L, 12L, 2L, 20L, 2L, 19L, 25L, 8L, 2L, 2L, 17L, 23L, 
     16L, 2L, 6L, 2L, 13L, 13L, 22L), .Label = c("", "0", "1.5", 
     "10", "11", "12", "14", "15", "17", "18", "18 hours", "2", 
     "20", "24", "25", "26", "3", "3.5", "30", "38", "4", "40", 
     "44", "5", "6", "7", "8"), class = "factor"), Season =     structure(c(2L, 
     3L, 2L, 1L, 5L, 2L, 3L, 4L, 4L, 3L, 4L, 4L, 3L, 3L, 4L, 3L, 
     3L, 3L, 3L, 5L, 3L, 3L, 2L, 5L, 5L, 4L, 2L, 2L, 5L, 2L, 3L, 
     2L, 2L, 3L, 2L, 4L, 2L, 3L, 5L, 3L, 4L, 5L, 3L, 4L, 4L, 4L, 
     3L, 4L, 4L, 4L, 3L, 2L, 2L, 2L, 3L, 4L, 4L, 3L, 2L, 4L, 4L, 
     3L), .Label = c("", "Autumn", "Spring", "Summer", "Winter"
     ), class = "factor")), .Names = c("Time", "ID", "Gender", 
 "Postcode", "StatsCourse", "Clubs", "StudyTime", "StudyLoad", 
 "SocialMedia", "Siblings", "FBFriends", "Grade", "Pet", "Home", 
 "ExerciseTime", "Eyecolor", "Working", "Season"), class = "data.frame", row.names = c(NA, 
 -62L))

到目前为止我做的是,

library(dplyr)
library(ggplot2)
library(tidyr)
library(knitr)

# Read the survey responses from the CSV file in the working directory.
survey <- read.csv("STAT2012Survey.csv")

# Print the column names as imported and keep a copy of them.
colnames(survey)
oldname <- colnames(survey)

# Replacement column names. They are assigned by position, so their order
# must match the order of the columns in the CSV.
newname <- c(
  "Time", "ID", "Gender", "Postcode", "StatsCourse", "Clubs",
  "StudyTime", "StudyLoad", "SocialMedia", "Siblings", "FBFriends",
  "Grade", "Pet", "Home", "ExerciseTime", "Eyecolor", "Working", "Season"
)

# Fail loudly if the file does not have one column per new name; assigning
# too few names would otherwise leave NA column names behind.
stopifnot(length(newname) == ncol(survey))
colnames(survey) <- newname

我想要实现的是,针对以下问题进行假设检验:

"有没有证据表明男性和女性的运动时间存在差异?"

要做到这一点,我需要得到这两组数据的均值和标准差,以便进行双样本 t 检验,但我不知道该如何处理。

另外,为了用图表可视化数据,我试过了,

# Box plot of ExerciseTime for each Gender level, with boxes filled by Gender.
ggplot(
  data = survey,
  mapping = aes(x = Gender, y = ExerciseTime, fill = Gender)
) +
  geom_boxplot()
然而,它只显示了一些奇怪的图表。我认为这是因为 "ExerciseTime" 变量不是数字,但我也卡在了这里,因为 ggplot2 不处理 numeric 类的数据......

有人请帮帮我......!我想对多个问题进行更多的假设检验,但我仍然卡在第一个问题上......如果我知道如何做第一个问题,我或许就可以实现目标!感谢。

1 个答案:

答案 0 :(得分:1)

在制作箱形图之前,您需要将 ExerciseTime 转换为数值变量。您将遇到的问题是,某些回答虽然本质上是数字,但需要额外的步骤才能转换(例如,"2 hours" 应该是 2,但需要先删除其中的文本)。

首先,让我们做一个最简单的处理:从每个回答中提取数字部分,任何提取不出数字的回答都变为缺失值。

# stringr provides str_replace() and str_extract(), which are used below.
library(stringr)

# Build a copy of the survey in which ExerciseTime is numeric.
survey2 <-
  survey %>%
  mutate(
    # Work on the response text itself, not on the factor's integer codes.
    ExerciseTime = as.character(ExerciseTime),
    # Blank out entries shaped like "05-Jun" (two digits, a dash, three word
    # characters); presumably ranges that a spreadsheet turned into dates, so
    # no single number can be recovered from them.
    ExerciseTime = str_replace(ExerciseTime, "\\d{2}-\\w{3}", ""),
    # Keep the first number in the response, including a decimal part, so
    # "2 hours" gives "2" and "3.5" stays "3.5". Responses that contain no
    # digits (e.g. "none" or an empty string) become NA.
    ExerciseTime = str_extract(ExerciseTime, "\\d+(\\.\\d+)?"),
    ExerciseTime = as.numeric(ExerciseTime)
  )

# Plot the cleaned data (survey2), not the original survey, so the y axis is
# the numeric ExerciseTime produced above.
ggplot(
  data = survey2,
  mapping = aes(
    x = Gender,
    y = ExerciseTime,
    fill = Gender
  )
) +
  geom_boxplot()