Question

我有一些学生在正式考试之前参加若干次预测试（模拟测试）的分数数据。

# Column headers: six pre-test dates followed by the real exam date column
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)

# One vector per student: six pre-test scores (NA = test not taken),
# then that student's actual exam date
b <- c(300, 230, 400, NA, NA, NA, "2013-04-30")
c <- c(NA, 260, 410, 420, NA, NA, "2013-05-30")
d <- c(300, 230, 400, NA, 370, 390, "2013-08-30")

# Stack the students as rows, then label rows and columns in one step
df <- as.data.frame(rbind(b, c, d))
dimnames(df) <- list(paste("student", seq_len(3L)), a)

实际的数据表要大得多。由于各个学生的日期差异很大，而预测试与正式考试之间的时间间隔相对接近，我更希望把实际日期转换为距离考试的天数，并用这些天数（而不是日期）作为新的列名。我知道这样做会使某些列合并，这没有问题。我该如何实现？

Answer 1

这是解决这个问题的一种方法，我相信还有很多其他方法。我在代码中加了注释，解释每一步做了什么：

# Load the packages used below: tidyr reshapes the data, dplyr transforms it,
# and ggplot2 draws the plot
library(tidyr)
library(dplyr)
library(ggplot2)

# Construct data frame you provided. Each student vector mixes scores with a
# date string, so the values end up stored as text (or factors on older R
# versions); the scores are converted back to numbers after reshaping.
a <- c(
  "2013-02-25", "2013-03-13", "2013-04-24",
  "2013-05-12", "2013-07-12", "2013-08-11",
  "actual_exam_date"
)
b <- c(300, 230, 400, NA, NA, NA, "2013-04-30")
c <- c(NA, 260, 410, 420, NA, NA, "2013-05-30")
d <- c(300, 230, 400, NA, 370, 390, "2013-08-30")
df <- as.data.frame(rbind(b, c, d))
colnames(df) <- a

# Add student IDs as a column instead of row names and move them to first position
df$StudentID <- row.names(df)
row.names(df) <- NULL
df <- select(df, StudentID, everything())

# Reshape to long form: one row per student and pre-test date, with the former
# column name in 'Date' and the cell value in 'Score'
newdf <- df %>%
  pivot_longer(
    cols = -c(StudentID, actual_exam_date),
    names_to = "Date",
    values_to = "Score"
  ) %>%
  arrange(StudentID)

# Convert the text/factor columns to proper types, compute the number of days
# between each pre-test and that student's exam, and drop pre-tests the
# student did not take. as.character() first so factor input is converted by
# label rather than by internal level code. The date difference is computed
# row by row, so no grouping is needed.
newdf <- newdf %>%
  mutate(
    actual_exam_date = as.Date(as.character(actual_exam_date)),
    Date = as.Date(as.character(Date)),
    Score = as.numeric(as.character(Score)),
    daysBeforeExam = as.integer(actual_exam_date - Date)
  ) %>%
  filter(!is.na(Score))

# Plot the trends using ggplot: one line per student, score against days
# remaining until the exam
ggplot(
  newdf,
  aes(x = daysBeforeExam, y = Score, col = StudentID, group = StudentID)
) +
  geom_line(size = 1) +
  geom_point(size = 2)

Answer 2

这是 reshape2 的另一个很好的应用场景，因为绘图需要长格式的数据。例如：

# Packages: reshape2 provides melt(), ggplot2 provides the plotting functions
library("reshape2")
library("ggplot2")

# The student identifier lives in the row names; expose it as a regular column
df[["student_id"]] <- rownames(df)

# Wide -> long: keep the student and exam date as identifiers, and turn every
# remaining column into a (pretest_date, pretest_score) pair
id_cols <- c("student_id", "actual_exam_date")
df2 <- melt(
  df,
  id.vars = id_cols,
  variable.name = "pretest_date",
  value.name = "pretest_score"
)

# Discard pre-tests that have no recorded score
df2 <- subset(df2, !is.na(pretest_score))

# Give each column its proper type: both date columns become Date objects and
# the score becomes numeric
df2 <- transform(
  df2,
  actual_exam_date = as.Date(actual_exam_date),
  pretest_date = as.Date(pretest_date),
  pretest_score = as.numeric(pretest_score)
)

# Whole days between each pre-test and the actual exam
df2$days_before_exam <- as.integer(df2$actual_exam_date - df2$pretest_date)

# Plot score against days remaining; the x axis is reversed so time runs
# left to right toward the exam, which is marked by the dashed line at 0
ggplot(df2, aes(x = days_before_exam, y = pretest_score, col = student_id)) +
  geom_line(lwd = 1) +
  scale_x_reverse() +
  geom_vline(xintercept = 0, linetype = "dashed", lwd = 1) +
  labs(
    title = "Pretest Performance",
    x = "Days Before Exam",
    y = "Pretest Score"
  )

用时间序列中的天数替换日期

2 个答案: