我一直在尝试学习R一段时间,但还没有把我的知识提升到一个不错的水平。请帮我解决这个问题。
我有一个包含5000行的csv数据文件,其中包含以下数据字段:名称,频道(内部或外部),调查发送日期&调查收到日期。
基础数据看起来像这样
我希望将其置于以下格式
我试过这个
library("reshape2")
# Cross-tabulate recruiters against channel. `w`, `Recruiter` and `channel`
# are the asker's own data frame and column names. No aggregation function is
# given, so dcast falls back to its default when several rows share a cell.
dcast(w, Recruiter ~ channel)
这段代码可以正常运行,但我不知道如何再添加“调查发送”、“调查收到”以及“调查发送 - 调查收到”这几列。
答案 0 :(得分:5)
一个使用 dplyr 的解决方案……
> head(data)
Name Channel Sent Recd
1 A Internal 2014-07-10 2014-07-12
2 A Internal 2014-07-16 <NA>
3 A External 2014-08-04 2014-08-10
4 A Internal 2014-08-16 2014-08-18
5 A Internal 2014-07-29 <NA>
6 A External 2014-08-05 2014-08-14
然后:
# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the pipeline fail later with a less helpful message.
library(dplyr)

# Build one summary row per Name: number of surveys on each channel, total
# number of rows, and how many rows carry a sent / received date.
data %>%
  group_by(Name) %>%
  summarise(
    External = sum(Channel == "External"),
    Internal = sum(Channel == "Internal"),
    Total = n(),
    # Count the non-missing dates; the summary columns reuse the names of the
    # date columns they are computed from.
    Sent = sum(!is.na(Sent)),
    Recd = sum(!is.na(Recd))
  ) %>%
  # Pending = surveys sent for which no received date is recorded.
  mutate(Pending = Sent - Recd)
给出:
Name External Internal Total Sent Recd Pending
1 A 6 4 10 10 8 2
2 B 2 7 9 9 6 3
3 C 4 5 9 9 4 5
注意:我使用了真正的 Date 对象来表示日期,并用 NA 表示缺失的数据。
上述示例数据由以下代码生成:
# Example survey data: 28 rows, one per survey, for three names (A, B, C).
# Sent and Recd are Date columns stored as days since 1970-01-01; an NA in
# Recd marks a survey with no received date.
data <- data.frame(
  Name = factor(rep(c("A", "B", "C"), times = c(10L, 9L, 9L))),
  Channel = c(
    "Internal", "Internal", "External", "Internal", "Internal",
    "External", "External", "External", "External", "External",
    "Internal", "External", "Internal", "Internal", "Internal",
    "External", "Internal", "Internal", "Internal", "Internal",
    "Internal", "External", "Internal", "External", "External",
    "External", "Internal", "Internal"
  ),
  Sent = structure(
    c(
      16261, 16267, 16286, 16298, 16280, 16287, 16294,
      16292, 16291, 16282, 16304, 16297, 16262, 16274,
      16264, 16270, 16252, 16276, 16279, 16275, 16277,
      16293, 16253, 16272, 16288, 16283, 16281, 16296
    ),
    class = "Date"
  ),
  Recd = structure(
    c(
      16263.5024573486, NA, 16292.4899729695, 16300.3446546271,
      NA, 16296.9054549634, 16301.318120582, 16301.4672047794,
      16295.238142278, 16286.8117301762, NA, 16306.6499495078,
      NA, 16282.0412430186, 16272.4275530744, 16273.9005153924,
      16255.7532094959, NA, 16284.9287535194, NA,
      16279.182732366, 16302.4864703286, NA, NA,
      16296.6838856321, NA, 16290.3657759354, NA
    ),
    class = "Date"
  ),
  stringsAsFactors = FALSE
)
答案 1 :(得分:2)
或者使用 data.table(使用 @Spacedman 的数据):
library(data.table)

# Turn `data` into a data.table in place, then add a logical helper column
# that flags the external-channel rows. Both steps modify `data` itself.
setDT(data)
data[, indx := Channel == "External"]

# Summarise per Name, then derive Pending from the two date counts.
DT1 <- data[, .(
  External = sum(indx),
  Internal = sum(!indx),
  Total = .N,
  Sent = sum(!is.na(Sent)),
  Recd = sum(!is.na(Recd))
), by = Name][, Pending := Sent - Recd]

DT1
# Name External Internal Total Sent Recd Pending
#1: A 6 4 10 10 8 2
#2: B 2 7 9 9 6 3
#3: C 4 5 9 9 4 5
答案 2 :(得分:0)
尝试使用以下简单代码:
library(reshape2)

# Count surveys per name and channel. value.var is named explicitly so dcast
# does not have to guess the value column; only the cell lengths are used.
outdf <- dcast(ddf, name ~ channel, fun.aggregate = length,
               value.var = "survey_sent")
outdf$total_channel <- outdf$external + outdf$internal

# Look the per-name counts up by name rather than by position, so every row of
# outdf gets the count that belongs to it.
name_key <- as.character(outdf$name)
outdf$survey_sent <- as.vector(table(ddf$name)[name_key])

# A survey counts as received when survey_rcd is neither empty nor NA. Summing
# the flag per name yields 0 for a name with nothing received; filtering the
# rows first and tabulating would drop such a name and shift the counts.
was_received <- !is.na(ddf$survey_rcd) & ddf$survey_rcd != ""
outdf$survey_rcd <- as.vector(tapply(was_received, ddf$name, sum)[name_key])

outdf$survey_pending <- outdf$survey_sent - outdf$survey_rcd
outdf
# name external internal total_channel survey_sent survey_rcd survey_pending
#1 a 0 4 4 4 2 2
#2 b 4 1 5 5 2 3
#3 c 2 2 4 4 3 1
示例数据:
# Example data: 13 surveys for three names. All four columns are character;
# dates are dd/mm/yy strings and an empty survey_rcd means nothing was
# received for that survey.
ddf <- data.frame(
  name = rep(c("a", "b", "c"), times = c(4L, 5L, 4L)),
  channel = rep(
    c("internal", "external", "internal", "external"),
    times = c(4L, 4L, 3L, 2L)
  ),
  survey_sent = c(
    "15/02/13", "16/02/13", "17/02/13", "18/02/13", "19/02/13",
    "20/02/13", "21/02/13", "22/02/13", "23/02/13", "24/02/13",
    "25/02/13", "26/02/13", "27/02/13"
  ),
  survey_rcd = c(
    "26/03/14", "", "", "29/03/14", "30/03/14", "", "", "",
    "03/04/14", "04/04/14", "", "06/04/14", "07/04/14"
  ),
  stringsAsFactors = FALSE
)
ddf
name channel survey_sent survey_rcd
1 a internal 15/02/13 26/03/14
2 a internal 16/02/13
3 a internal 17/02/13
4 a internal 18/02/13 29/03/14
5 b external 19/02/13 30/03/14
6 b external 20/02/13
7 b external 21/02/13
8 b external 22/02/13
9 b internal 23/02/13 03/04/14
10 c internal 24/02/13 04/04/14
11 c internal 25/02/13
12 c external 26/02/13 06/04/14
13 c external 27/02/13 07/04/14