如何在R中透视表

时间:2014-08-26 10:33:07

标签: r reshape

我一直在尝试学习R一段时间,但还没有把我的知识提升到一个不错的水平。请帮我解决这个问题。

我有一个包含5000行的csv数据文件,其中包含以下数据字段:名称、频道(内部或外部)、调查发送日期和调查收到日期。

基础数据看起来像这样

enter image description here

我希望将其置于以下格式

enter image description here

我试过这个

# Count the rows for each Recruiter/channel combination (wide format).
# `w` is the data frame read from the CSV file.
library(reshape2)
# The aggregation function is spelled out; without it dcast() falls back to
# length (with a message) whenever a cell holds more than one row.
dcast(w, Recruiter ~ channel, fun.aggregate = length)

这段代码运行正常,但我不知道如何添加“调查发送”、“调查收到”和“调查发送 - 调查收到”这几列。

3 个答案:

答案 0 :(得分:5)

dplyr解决方案......

> head(data)
  Name  Channel       Sent       Recd
1    A Internal 2014-07-10 2014-07-12
2    A Internal 2014-07-16       <NA>
3    A External 2014-08-04 2014-08-10
4    A Internal 2014-08-16 2014-08-18
5    A Internal 2014-07-29       <NA>
6    A External 2014-08-05 2014-08-14

然后:

# Summarise the survey records per Name: rows per channel, total rows,
# surveys sent, surveys received, and surveys still pending.
# library() stops with an error if dplyr is missing; require() would only
# warn and let the pipeline fail later with a less helpful message.
library(dplyr)

data %>%
  group_by(Name) %>%
  summarise(
    # na.rm = TRUE keeps a missing Channel value from turning the count into NA
    External = sum(Channel == "External", na.rm = TRUE),
    Internal = sum(Channel == "Internal", na.rm = TRUE),
    Total = n(),
    # Sent and Recd are replaced by the number of non-missing dates per Name
    Sent = sum(!is.na(Sent)),
    Recd = sum(!is.na(Recd))
  ) %>%
  mutate(Pending = Sent - Recd)

给出:

  Name External Internal Total Sent Recd Pending
1    A        6        4    10   10    8       2
2    B        2        7     9    9    6       3
3    C        4        5     9    9    4       5

注意:我用真正的Date对象来表示日期,并用NA表示缺失的数据。

生成上述结果所用的数据:

# Example survey data, one row per survey (28 rows).
#   Name    - factor with levels "A", "B", "C"
#   Channel - character, "Internal" or "External"
#   Sent    - Date the survey was sent
#   Recd    - Date the survey was received; NA when nothing was received
# Dates are given as day counts since 1970-01-01 and tagged with class "Date".
data <- data.frame(
  Name = factor(
    rep(c("A", "B", "C"), times = c(10L, 9L, 9L)),
    levels = c("A", "B", "C")
  ),
  Channel = c(
    # Name A
    "Internal", "Internal", "External", "Internal", "Internal",
    "External", "External", "External", "External", "External",
    # Name B
    "Internal", "External", "Internal", "Internal", "Internal",
    "External", "Internal", "Internal", "Internal",
    # Name C
    "Internal", "Internal", "External", "Internal", "External",
    "External", "External", "Internal", "Internal"
  ),
  Sent = structure(
    c(
      # Name A
      16261, 16267, 16286, 16298, 16280, 16287, 16294, 16292, 16291, 16282,
      # Name B
      16304, 16297, 16262, 16274, 16264, 16270, 16252, 16276, 16279,
      # Name C
      16275, 16277, 16293, 16253, 16272, 16288, 16283, 16281, 16296
    ),
    class = "Date"
  ),
  Recd = structure(
    c(
      # Name A
      16263.5024573486, NA, 16292.4899729695, 16300.3446546271, NA,
      16296.9054549634, 16301.318120582, 16301.4672047794, 16295.238142278,
      16286.8117301762,
      # Name B
      NA, 16306.6499495078, NA, 16282.0412430186, 16272.4275530744,
      16273.9005153924, 16255.7532094959, NA, 16284.9287535194,
      # Name C
      NA, 16279.182732366, 16302.4864703286, NA, NA, 16296.6838856321,
      NA, 16290.3657759354, NA
    ),
    class = "Date"
  ),
  stringsAsFactors = FALSE
)

答案 1 :(得分:2)

或者使用data.table(使用@Spacedman的数据)

 # Per-Name summary with data.table: rows per channel, total rows, surveys
 # sent and received, and the number still pending.
 library(data.table)
 # as.data.table() works on a copy, so `data` is neither converted to a
 # data.table in place nor left with a helper column, both of which the
 # earlier setDT(data)[, indx := ...] form did as side effects.
 DT1 <- as.data.table(data)[, list(
   External = sum(Channel == "External"),
   # every row that is not "External" counts as Internal
   Internal = sum(Channel != "External"),
   Total = .N,
   Sent = sum(!is.na(Sent)),
   Recd = sum(!is.na(Recd))
 ), by = Name][, Pending := Sent - Recd]

 DT1
#   Name External Internal Total Sent Recd Pending
#1:    A        6        4    10   10    8       2
#2:    B        2        7     9    9    6       3
#3:    C        4        5     9    9    4       5

答案 2 :(得分:0)

尝试使用以下简单代码:

# Build a per-name summary of ddf: rows per channel, surveys sent,
# surveys received, and surveys still pending.
library(reshape2)  # provides dcast()

# value.var is named explicitly so dcast() does not have to guess it; with
# length as the aggregator only the number of rows per cell matters.
outdf <- dcast(
  ddf, name ~ channel,
  fun.aggregate = length, value.var = "survey_rcd"
)
outdf$total_channel <- outdf$external + outdf$internal

# Counts are looked up by name so they stay aligned with the rows of outdf
# instead of relying on two tables happening to share the same order.
key <- as.character(outdf$name)
outdf$survey_sent <- as.integer(table(ddf$name)[key])

# A survey counts as received when survey_rcd is neither NA nor "".
# Summing the flag per name keeps names with zero received surveys; calling
# table() on the filtered rows dropped them and broke the assignment.
received <- !is.na(ddf$survey_rcd) & ddf$survey_rcd != ""
outdf$survey_rcd <- as.integer(tapply(received, ddf$name, sum)[key])

outdf$survey_pending <- outdf$survey_sent - outdf$survey_rcd
outdf
# name external internal total_channel survey_sent survey_rcd survey_pending
#1    a        0        4             4           4          2              2
#2    b        4        1             5           5          2              3
#3    c        2        2             4           4          3              1

示例数据:

# Sample data for the dcast example: 13 surveys, all columns character.
#   name        - person the survey belongs to ("a", "b", "c")
#   channel     - "internal" or "external"
#   survey_sent - sent date as a dd/mm/yy string
#   survey_rcd  - received date as a dd/mm/yy string, "" when not received
ddf <- data.frame(
  name = rep(c("a", "b", "c"), times = c(4L, 5L, 4L)),
  channel = rep(
    c("internal", "external", "internal", "external"),
    times = c(4L, 4L, 3L, 2L)
  ),
  # consecutive days 15/02/13 .. 27/02/13
  survey_sent = sprintf("%02d/02/13", 15:27),
  survey_rcd = c(
    "26/03/14", "", "", "29/03/14",
    "30/03/14", "", "", "", "03/04/14",
    "04/04/14", "", "06/04/14", "07/04/14"
  ),
  stringsAsFactors = FALSE
)

 ddf
   name  channel survey_sent survey_rcd
1     a internal    15/02/13   26/03/14
2     a internal    16/02/13           
3     a internal    17/02/13           
4     a internal    18/02/13   29/03/14
5     b external    19/02/13   30/03/14
6     b external    20/02/13           
7     b external    21/02/13           
8     b external    22/02/13           
9     b internal    23/02/13   03/04/14
10    c internal    24/02/13   04/04/14
11    c internal    25/02/13           
12    c external    26/02/13   06/04/14
13    c external    27/02/13   07/04/14