我有一个数据集,其中包含加州公立学校的开办日期和关闭日期。数据可以从here获取,问题底部也附有dput()输出
。数据还列出了学校类型和所在地。我正在尝试创建一个累计总数(running total)列,该列需要同时考虑学校关闭情况和学校类型。
这是我想出的解决方案,基本上需要我使用ifelse
按不同条件编码出许多取值为1和0的列:
# Add four 1/0 indicator columns, one per combination of school type
# (Charter "Y" = charter, "N" = public) and status (ClosedDate missing =
# still open, ClosedDate present = closed).
pubschls <- mutate(
  pubschls,
  open_chart   = ifelse(Charter == "Y" & is.na(ClosedDate), 1, 0),
  open_pub     = ifelse(Charter == "N" & is.na(ClosedDate), 1, 0),
  closed_chart = ifelse(Charter == "Y" & !is.na(ClosedDate), 1, 0),
  closed_pub   = ifelse(Charter == "N" & !is.na(ClosedDate), 1, 0)
)
# Keep only the schools of district 0622710
lausd <- filter(pubschls, NCESDist == "0622710")
# count number open during each year
然后我将各列相减得到每年的净值,再累加得到总计。
# Count, per opening year, the schools in each open/closed x charter/public
# group. The grouping element is named `year` so the result carries a `year`
# column (an unnamed `by` element would be called `Group.1`).
# NOTE(review): grouping is by OpenDate only, so a closed school is subtracted
# in the year it opened rather than the year it closed - confirm this is wanted.
la_schools_count <- aggregate(
  lausd[c("open_chart", "closed_chart", "open_pub", "closed_pub")],
  by = list(year = year(lausd$OpenDate)),
  FUN = sum
)
# Net additions per year: still-open minus closed, for each school type
la_schools_count$net_chart <-
  la_schools_count$open_chart - la_schools_count$closed_chart
la_schools_count$net_pub <-
  la_schools_count$open_pub - la_schools_count$closed_pub
# Running totals of the yearly net additions
la_schools_count$cum_chart <- cumsum(la_schools_count$net_chart)
la_schools_count$cum_pub <- cumsum(la_schools_count$net_pub)
# Combined running total across both school types
la_schools_count$total <- la_schools_count$cum_chart + la_schools_count$cum_pub
# Charter share of the running total, in percent (as in the printed output)
la_schools_count$pen_rate <-
  100 * la_schools_count$cum_chart / la_schools_count$total
我的输出如下:
# Keep only the reporting columns, in display order
la_schools_count <- la_schools_count[
  c("year", "cum_chart", "cum_pub", "pen_rate", "total")
]
year cum_chart cum_pub pen_rate total
1 1952 1 0 100.00000 1
2 1956 1 1 50.00000 2
3 1969 1 2 33.33333 3
4 1980 55 469 10.49618 524
5 1989 55 470 10.47619 525
6 1990 55 470 10.47619 525
7 1991 55 473 10.41667 528
8 1992 55 476 10.35782 531
9 1993 55 477 10.33835 532
10 1994 56 478 10.48689 534
11 1995 57 478 10.65421 535
12 1996 57 479 10.63433 536
13 1997 58 481 10.76067 539
14 1998 59 480 10.94620 539
15 1999 61 480 11.27542 541
16 2000 61 481 11.25461 542
17 2001 62 482 11.39706 544
18 2002 64 484 11.67883 548
19 2003 73 485 13.08244 558
20 2004 83 496 14.33506 579
21 2005 90 524 14.65798 614
22 2006 96 532 15.28662 628
23 2007 90 534 14.42308 624
24 2008 97 539 15.25157 636
25 2009 108 546 16.51376 654
26 2010 124 566 17.97101 690
27 2011 140 580 19.44444 720
28 2012 144 605 19.22563 749
29 2013 162 609 21.01167 771
30 2014 179 611 22.65823 790
31 2015 195 611 24.19355 806
32 2016 203 614 24.84700 817
33 2017 211 619 25.42169 830
我只是想知道有没有更好的方法来实现这一点,比如用一条apply
语句按条件处理所有行?
dput:
# Six-row sample of the data as emitted by dput(): a tibble with the columns
# CDSCode (character), OpenDate and ClosedDate (Date, stored as day counts;
# NA in ClosedDate = no closing date recorded) and Charter (character).
structure(list(CDSCode = c("19647330100289", "19647330100297",
"19647330100669", "19647330100677", "19647330100743", "19647330100750"
), OpenDate = structure(c(12324, 12297, 12240, 12299, 12634,
12310), class = "Date"), ClosedDate = structure(c(NA, 15176,
NA, NA, NA, NA), class = "Date"), Charter = c("Y", "Y", "Y",
"Y", "Y", "Y")), .Names = c("CDSCode", "OpenDate", "ClosedDate",
"Charter"), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
答案 0 :(得分:0)
我按照你的代码了解了你在做什么,除了pen_rate
。似乎pen_rate
的计算方法是将cum_chart
除以total
。我下载了原始数据集并执行了以下操作。我把数据集命名为foo
。为了创建各个分组(open_chart … closed_pub),我合并了Charter
和ClosedDate
。我检查了ClosedDate是否为NA,并将逻辑结果转换为数字(1 = 仍在开办,0 = 已关闭)。这就是我创建四个组的方式(即open_chart、closed_chart、open_pub和closed_pub)。我想这样可以让你少打一些字。由于日期是字符型(character),我使用substr()
提取年份。如果您的日期是Date对象,则需要改用其他方法(例如format(OpenDate, "%Y"))。有了年份,就可以按年份对数据分组,并使用count()
计算每种类型学校的学校数量。此部分相当于您的aggregate()
代码。然后,使用spread()
将输出转换为宽格式数据,并完成您在代码中演示的其余计算。最终输出似乎与您在问题中的结果不同,但我的结果与我通过运行您的代码得到的结果相同。我希望这会对你有所帮助。
library(dplyr)
library(tidyr)
library(readxl)
# Read the school directory, keep district 0622710 rows that have a
# non-missing Charter flag, and retain only the columns used below
foo <- read_xls("pubschls.xls") %>%
  filter(NCESDist == "0622710", !is.na(Charter)) %>%
  select(NCESDist, CDSCode, OpenDate, ClosedDate, Charter)
# Per-year running totals of charter and public schools.
# `group` encodes type and status as "<Charter>_<flag>", where flag is 1 when
# ClosedDate is NA (still open) and 0 otherwise, giving the columns
# Y_1 / Y_0 (open / closed charter) and N_1 / N_0 (open / closed public).
# `year` is taken as the last four characters of OpenDate, i.e. OpenDate is
# handled as text here.
# NOTE(review): schools are counted by opening year only, so a closed school
# is subtracted in the year it opened, not the year it closed - confirm.
foo %>%
  mutate(
    group = paste(Charter, as.numeric(is.na(ClosedDate)), sep = "_"),
    # Argument names are spelled out in full (no partial matching)
    year = substr(OpenDate, start = nchar(OpenDate) - 3, stop = nchar(OpenDate))
  ) %>%
  count(year, group) %>%
  # One column per group; year/group combinations with no schools become 0
  spread(key = group, value = n, fill = 0) %>%
  mutate(
    net_chart = Y_1 - Y_0,
    net_pub = N_1 - N_0,
    cum_chart = cumsum(net_chart),
    cum_pub = cumsum(net_pub),
    total = cum_chart + cum_pub,
    # Charter share of the running total, as a fraction
    pen_rate = cum_chart / total
  )
# A part of the outcome
# year N_0 N_1 Y_0 Y_1 net_chart net_pub cum_chart cum_pub total pen_rate
#1 1866 0 1 0 0 0 1 0 1 1 0.00000000
#2 1873 0 1 0 0 0 1 0 2 2 0.00000000
#3 1878 0 1 0 0 0 1 0 3 3 0.00000000
#4 1881 0 1 0 0 0 1 0 4 4 0.00000000
#5 1882 0 2 0 0 0 2 0 6 6 0.00000000
#110 2007 0 2 15 9 -6 2 87 393 480 0.18125000
#111 2008 2 8 9 15 6 6 93 399 492 0.18902439
#112 2009 1 9 4 15 11 8 104 407 511 0.20352250
#113 2010 5 26 5 21 16 21 120 428 548 0.21897810
#114 2011 2 16 2 18 16 14 136 442 578 0.23529412
#115 2012 2 27 3 7 4 25 140 467 607 0.23064250
#116 2013 1 5 1 19 18 4 158 471 629 0.25119237
#117 2014 1 3 1 18 17 2 175 473 648 0.27006173
#118 2015 0 0 2 18 16 0 191 473 664 0.28765060
#119 2016 0 3 0 8 8 3 199 476 675 0.29481481
#120 2017 0 5 0 9 9 5 208 481 689 0.30188679