以下部分代码对我来说很好。 assessments_jobSmart是导入的csv文件。我试图重塑多列数据,但我无法通过使用tapply来实现。所以我试图通过匹配user_id绑定矩阵,并添加id已经存在的行id。
individual_assess <- filter(assessments_jobSmart, !is.na(assessments_jobSmart$submitted))
quiz_i = filter(individual_assess, assessment_type == 'quiz')
checkin_i = filter(individual_assess, assessment_type == 'checkin')
groupQuiz_i <- group_by(quiz_i, user_id, program_id, name)
summaryQuiz_i <- summarize(groupQuiz_i, maxQuiz = max(moderated_score))
q1 <- with(summaryQuiz_i, tapply(maxQuiz, list(user_id, name) , I))
q2=matrix(NA, nrow = nrow(q1), ncol = ncol(q1)+1)
colnames(q2) = c("user_id", paste("assess_q", colnames(q1)))
q2[1:nrow(q2),1] = rownames(q1)
q2[1:nrow(q2),2:ncol(q2)] = q1[1:nrow(q1),1:ncol(q1)]
lla = merge(lla, q2, by = 'user_id', all=TRUE)
但是对于下一部分,它在我最后一行的末尾给出了错误。我提到了许多链接,但仍然无法弄清楚原因。 c1和c2都有相同的行数。
Error in (function (..., row.names = NULL, check.rows = FALSE, check.names =
TRUE, : arguments imply differing number of rows: 1, 2
错误的代码:
groupCheckin_i <- group_by(checkin_i, user_id, program_id, name)
summaryCheckin_i <- summarize(groupCheckin_i, countCheckin = n())
c1 <- with(summaryCheckin_i, tapply(countCheckin, list(user_id, name) , I))
c1[c1=="NULL"]=NA
c2=matrix(NA, nrow = nrow(c1), ncol = ncol(c1)+1)
colnames(c2) = c("user_id", paste("assess_c", colnames(c1)))
c2[1:nrow(c2),1] = rownames(c1)
c2[1:nrow(c2),2:ncol(c2)] = c1[1:nrow(c1),1:ncol(c1)]
lla = merge(lla, c2, by = 'user_id', all=TRUE)
可重复的例子,不确定我是否正确地复制它。我将从导入,分组和汇总的数据框开始。
install.packages("dplyr")
install.packages("reshape2")
install.packages('ggplot2', dep = TRUE)
library("dplyr")
library(reshape2)
library(ggplot2)
head = c("user_id", "program", "assessment", "type", "marks")
content = c("111", "program A", "quiz 1", "quiz", "1", "112", "program A", "quiz 1", "quiz", "0.5", "112", "program A", "quiz 2", "quiz", "0.75", "113", "program B", "quiz 2", "quiz", "0.8", "110", "program B", "survey 1", "survey", "1", "113", "program B", "survey 1", "survey", "1")
M = as.dataframe(matrix(content, nrow=5, ncol=5)) #kinda replicate my imported csv file.
s = filter(M, type == 'survey')
q = filter(M, type == 'quiz')
groupS = group_by(s, user_id, program, assessment)
groupQ = group_by(q, user_id, program, assessment)
summaryS <- summarize(groupS, maxMarks = max(marks)) # take only maximum marks if there are duplicate entries
s1 <- with(summaryS, tapply(maxMarks, list(user_id, assessment) , I))
s2 = matrix(NA, nrow=nrow(s1), ncol=ncol(s1)+1)
colnames(s2) = c("user_id", colnames(s1))
s2[q:nrow(s2), 1] = rownames(s1) # everything works alright till here
s2[1:nrow(s2),2:ncol(s2)] = s1[1:nrow(s1),1:nrow(s1)]
summaryQ <- summarize(groupQ, count = n()) # it makes more sense to count survey done
q1 <- with(summaryQ, tapply(count, list(user_id, assessment) , I))
q2 = matrix(NA, nrow=nrow(q1), ncol=ncol(q1)+1)
colnames(q2) = c("user_id", colnames(q1))
q2[q:nrow(q2), 1] = rownames(q1) # everything works alright till here
q2[1:nrow(q2),2:ncol(q2)] = q1[1:nrow(q1),1:nrow(q1)] #q2 becomes a list :(
b = merge(b, a2, by = 'user_id', all+TRUE)
# | program | assessment | marks
# ---------------------------------------
# 111 | program A | quiz 1 | 1
# 112 | program A | quiz 1 | 0.5
# 112 | program A | quiz 2 | 0.75
# 113 | program B | quiz 2 | 0.8
# Then I used tapply to reshape data to get something like:
# | assessment 1 | assessment 2
# -------------------------------------
# 111 | 1 | NA
# 112 | 0.5 | 0.75
# 113 | NA | 0.8
这些表有很多,因为结果的提取方式不同,所以我想在最后合并它们以组合所有结果。我不希望user_id出现两次,每次都是单独的表。我想将结果与user_id进行比较,但由于user_id被视为rownames,因此缺少该列的标题。所以我创建了一个更大的矩阵来复制所有内容并包含user_id列名:
# Let's say a1 is the matrix after tapply, a2 is the new dataframe I want to create, b is the successful new dataframe created using the exact same method on same csv file exported.
我想在合并后得到这样的东西:
user_id | survey 1 | survey 2 | assessment 1 | assessment 2
-------------------------------------------------------------
110 | 1 | NA | NA | NA
111 | NA | NA | 1 | NA
112 | NA | NA | 0.5 | 0.75
113 | 1 | NA | NA | 0.8
答案 0 :(得分:0)
我仍然很难理解你的起点和想要的内容。您可重复的示例的某些部分不起作用。我试图填补空白,所以这是我尝试回答的问题。
您可能希望尝试tidyr
函数来重塑数据帧(gather
/ spread
和unite
/ separate
),而不是所有应用函数。
library(dplyr)
library(tidyr)
library(stringr)
df1 <- data_frame(
V1 = c("111", "program A", "quiz 1", "quiz", "1"),
V2 = c("112", "program A", "quiz 1", "quiz", "0.5"),
V3 = c("112", "program A", "quiz 2", "quiz", "0.75"),
V4 = c("113", "program B", "quiz 2", "quiz", "0.8"),
V5 = c("110", "program B", "survey 1", "survey", "1")
)
df1
#> # A tibble: 5 x 5
#> V1 V2 V3 V4 V5
#> <chr> <chr> <chr> <chr> <chr>
#> 1 111 112 112 113 110
#> 2 program A program A program A program B program B
#> 3 quiz 1 quiz 1 quiz 2 quiz 2 survey 1
#> 4 quiz quiz quiz quiz survey
#> 5 1 0.5 0.75 0.8 1
col_names <- c("user_id", "program", "assessment", "type", "marks")
df2 <- df1 %>%
summarise_all(paste, collapse = ",") %>%
gather("key", "val") %>%
separate(val, col_names, ",", convert = TRUE) %>%
mutate(assessment = assessment %>% str_extract("\\d") %>% as.integer(),
program = str_extract(program, ".{1}$")) %>%
select(-key)
df2
#> # A tibble: 5 x 5
#> user_id program assessment type marks
#> <int> <chr> <int> <chr> <dbl>
#> 1 111 A 1 quiz 1.00
#> 2 112 A 1 quiz 0.50
#> 3 112 A 2 quiz 0.75
#> 4 113 B 2 quiz 0.80
#> 5 110 B 1 survey 1.00
df3 <- df2 %>%
group_by(user_id, program, type, assessment) %>%
summarise(marks = max(marks)) %>%
unite(key, program, type, assessment) %>%
spread(key, marks)
df3
#> Source: local data frame [4 x 5]
#> Groups: user_id [4]
#>
#> # A tibble: 4 x 5
#> user_id A_quiz_1 A_quiz_2 B_quiz_2 B_survey_1
#> * <int> <dbl> <dbl> <dbl> <dbl>
#> 1 110 NA NA NA 1
#> 2 111 1.0 NA NA NA
#> 3 112 0.5 0.75 NA NA
#> 4 113 NA NA 0.8 NA
你还提到了一些问题,即user_id
被困为某些数据的rownames。 tibble::rownames_to_column()
对此很有帮助。
library(tibble)
head(mtcars)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
#> Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
mtcars %>%
rownames_to_column("model") %>%
head()
#> model mpg cyl disp hp drat wt qsec vs am gear carb
#> 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
#> 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
#> 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
#> 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
#> 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
#> 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
此外,由于您使用了大量tidyverse
函数,因此您可能需要查看dplyr::*_join
函数而不是合并。