如何填写每月时间序列数据的缺失行?

时间:2019-03-20 21:16:12

标签: r data.table time-series tidyverse lubridate

这是我的数据的副本:

structure(list(date = structure(c(8596, 8631, 8659, 8687, 8733, 
8743, 8796, 8806, 8853, 8880, 8908, 8932, 8971, 8999, 9027, 9069, 
9097, 9111, 9160, 9188, 9212, 9230, 9279, 9309, 9328, 9363, 9391, 
9434, 9449, 9482, 9519, 9541, 9580, 9610, 9643, 9672, 9708, 9736, 
9764, 9799, 9827, 9850, 9890, 9920, 9947, 9975, 10007, 10038, 
10072, 10100, 10122, 10163, 10191, 10213, 10254, 10282, 10310, 
10345, 10354, 10385, 10418, 10469, 10497, 10528, 10556, 10570, 
10612, 10641, 10668, 10710, 10742, 10759, 10802, 10830, 10858, 
10893, 10914, 10947, 10984, 11010, 11038, 11066, 11096, 11135, 
11164, 11193, 11229, 11257, 11285, 11313, 11346, 11374, 11411, 
11435, 11467, 11502, 11514, 11565, 11592, 11621, 11649, 11677, 
11718, 11746, 11776, 11797, 11838, 11867, 11894, 11923, 11951, 
11979, 12021, 12035, 12077, 12105, 12133, 12160, 12189, 12231, 
12259, 12273, 12315, 12356, 12385, 12399, 12441, 12472, 12497, 
12538, 12553, 12591, 12630, 12658, 12686, 12714, 12742, 12770, 
12804, 12832, 12860, 12903, 12917, 12938, 12986, 13015, 13056, 
13085, 13116, 13139, 13169, 13204, 13232, 13260, 13288, 13301, 
13357, 13385, 13414, 13442, 13470, 13498, 13533, 13561, 13603, 
13631, 13658, 13694, 13722, 13750, 13778, 13805, 13846, 13862, 
13896, 13925, 13967, 13995, 14009, 14050, 14078, 14121, 14149, 
14177, 14205, 14233, 14268, 14296, 14323, 14352, 14380, 14449, 
14474, 14506, 14548, 14575, 14590, 14618, 14661, 14688, 14729, 
14758, 14761, 14821, 14849, 14877, 14905, 14933, 14961, 14995, 
15024, 15038, 15093, 15121, 15135, 15185, 15212, 15241, 15269, 
15297, 15325, 15360, 15387, 15430, 15458, 15485, 15513, 15542, 
15583, 15611, 15639, 15667, 15696, 15731, 15745, 15786, 15815, 
15842, 15917, 15945, 15966, 16001, 16030, 16076, 16129, 16143, 
16184, 16276, 16303, 16343, 16374, 16400, 16417, 16455, 16482, 
16525, 16553, 16585, 16612, 16646, 16678, 16706, 16729, 16752, 
16777, 16819, 16860, 16891, 16916, 16925, 16976, 17002, 17042, 
17072, 17100, 17120, 17141, 17178, 17224, 17245, 17261, 17304, 
17330, 17373, 17401, 17459, 17488, 17512, 17548, 17581, 17598, 
17631), tzone = "UTC", tclass = "Date", class = "Date"), AverageTemp = c(16.5027083333333, 
17.325, 17.1888888888889, 15.8277777777778, 16.6583333333333, 
17.3333333333333, 16.64375, 17.1133333333333, 17.895119047619, 
18.5694444444444, 18.8222222222222, 17.4305555555556, 17.6555555555556, 
17.025, 17.3222222222222, 17.2770833333333, 17.4805555555556, 
16.9708333333333, 17.9666666666667, 17.1222222222222, 18.0166666666667, 
17.25, 18.1875, 17.6577777777778, 16.6541666666667, 17.1083333333333, 
16.4666666666667, 17.5972756410256, 17.2, 17.4444444444444, 16.95, 
17.7, 17.9222222222222, 18.4875, 17.8229166666667, 16.9166666666667, 
16.7083333333333, 17.1666666666667, 17.3111111111111, 18.2333333333333, 
16.6277777777778, 17.5875, 17.3833333333333, 17.4638888888889, 
17.725, 18.1388888888889, 17.7001111111111, 17.7222222222222, 
17.2041666666667, 17.8255952380952, 17.1833333333333, 17.8103070175439, 
17.8194444444444, 17.952, 18.158412414966, 18.4910714285714, 
18.3488562091503, 19.1341830065359, 18.45, 18.9107142857143, 
17.2275, 19.0828761904762, 18.1599701591512, 18.965739220457, 
18.6720606060606, 18.8786057692308, 18.602656449553, 18.6327347883598, 
19.2925198412698, 20.1952463624339, 18.8900384227765, 18.0934444444444, 
18.0554871794872, 17.8405270655271, 17.5540598290598, 17.454122110648, 
17.5764155982906, 16.9989942528736, 16.4252032967033, 16.5388571428571, 
17.0108695652174, 17.7725308641975, 18.4252564102564, 17.2278899240856, 
17.3102091315453, 17.3627204585538, 17.280641025641, 17.3746616809117, 
17.3014601139601, 17.2238271604938, 16.379012345679, 16.6044444444444, 
17.624415954416, 18.4023148148148, 18.0341435185185, 17.3016666666667, 
17.8204861111111, 17.827264957265, 17.2772467320261, 17.8786954365079, 
17.84375, 17.1732638888889, 16.9219907407407, 17.3826388888889, 
17.7413333333333, 18.4948412698413, 18.2363425925926, 17.3282057823129, 
17.5083333333333, 17.414898989899, 16.9453125, 17.4988095238095, 
17.6704012345679, 18.1333333333333, 18.11875, 17.4805555555556, 
17.4271367521368, 17.9006944444444, 17.9818181818182, 17.3125, 
16.73625, 17.2666666666667, 17.4279340277778, 17.8584444444444, 
17.2966666666667, 17.1, 18.3420833333333, 18.5814285714286, 17.6430555555556, 
18.2307122507123, 18.0830687830688, 16.7563492063492, 16.9055555555556, 
17.0090277777778, 17.3863095238095, 16.9139880952381, 16.7479166666667, 
17.0888888888889, 17.7648148148148, 18.2277777777778, 19.3694444444444, 
17.7064021164021, 18.7371527777778, 17.94375, 17.9416666666667, 
17.8736111111111, 18.5354166666667, 18.1919444444444, 18.2555555555556, 
17.7704365079365, 17.3509259259259, 17.3931216931217, 18.3355923202614, 
17.9180555555556, 18.2104166666667, 18.0171121593291, 17.6840277777778, 
17.5509259259259, 16.9631313131313, 17.4478070175439, 17.6916666666667, 
17.6143376068376, 18.7415656565657, 19.0048611111111, 18.285462962963, 
18.3816964285714, 18.2041310541311, 17.2343518518519, 17.2149382716049, 
17.3684027777778, 17.5229861111111, 16.8517857142857, 19.0929141414141, 
19.300404040404, 18.735, 17.9280277777778, 18.4470274170274, 
19.0686597406425, 18.325, 18.5, 18.4388888888889, 18.7291666666667, 
18.3708333333333, 18.0234918630752, 19.4925980392157, 19.2101488095238, 
19.3890625, 18.5150793650794, 19.1944444444444, 19.0815277777778, 
19.5192658730159, 17.2212418300654, 17.8081168831169, 18.2517361111111, 
17.7775555555556, 18.012962962963, 17.0347222222222, 16.5888888888889, 
18.8123101604278, 18.9187091503268, 19.0161111111111, 19.2625, 
20.875, 18.8092592592593, 18.6526515151515, 18.9083333333333, 
18.9835227272727, 18.1829292929293, 17.9060606060606, 17.7835227272727, 
17.8237719298246, 19.7386363636364, 18.4961051693405, 18.5332727272727, 
18.3787878787879, 18.5134199134199, 17.8098930481283, 18.4179292929293, 
17.230303030303, 18.9035064935065, 17.8935897435897, 17.6211966604824, 
17.9238095238095, 18.8382886904762, 19.42625, 18.6395833333333, 
18.0652777777778, 19.3354166666667, 18.75359375, 17.951123043623, 
17.6063068181818, 17.828022875817, 17.5528846153846, 18.5647727272727, 
19.0318181818182, 19.1659090909091, 18.8997564935065, 19.1301136363636, 
18.1705882352941, 17.1361570247934, 18.6090909090909, 18.1429951690821, 
17.8829545454545, 18.3387983091787, 18.41875, 19.7, 20.2508333333333, 
17.6387426900585, 18.1770897832817, 17.5400297619048, 17.7547246376812, 
17.246412037037, 17.0846153846154, 17.7060185185185, 18.325, 
18.5408333333333, 19.4251587301587, 18.3706018518519, 17.917, 
17.91, 18.6451388888889, 18.29375, 17.2316666666667, 18.7189393939394, 
18.1669193548387, 18.367297979798, 17.7043055555556, 18.1879520697168, 
19.12, 20.425, 18.6663888888889, 17.5108796296296, 18.1883333333333, 
18.3060049019608, 18.32625, 18.2861111111111, 18.0375, 17.3445175438596, 
18.6451058201058, 18.97875, 19.4583333333333, 18.2597222222222, 
19.9197222222222, 18.2342307692308, 18.7666666666667, 19.8277777777778, 
17.6464285714286, 18.690873015873, 18.4520833333333, 19.8696428571429, 
19.9833333333333, 18.2416666666667)), class = "data.frame", row.names = c(NA, 
-292L))

我的数据是月度数据,日期采用 YYYY-MM-DD 格式。目前有几个月的数据缺失(例如 2017-09、2014-05、2014-06、2013-12),这些月份在数据框中根本没有对应的行。如何为整个数据集中所有缺失的月份补上新行?我的数据集只有两列,因此在新增的缺失月份行中,日期列之外的另一列应为 NA。我正在寻找 tidyverse、lubridate 或 data.table 的解决方案。

1 个答案:

答案 0 :(得分:1)

您可以使用 tidyr::complete 来解决这个问题,但这里多了一个小麻烦:每个月的观测日期落在不同的日子上。因此,首先需要创建一个按月标识的列,这可以用 lubridate 中的 day(x) <- 赋值函数把每个日期都设为当月 1 日来完成。

下面是一个示例,为简洁起见,所用数据只截取了 2014 年。请注意,您应该使用 seq.Date 来指定希望出现在 month 列中的完整日期序列;补出的行在 date 列中也会是 NA。(如果需要,您可以把这些 NA 替换为当月的第一天。)

library(tidyverse)
library(lubridate)

# Example data: the ten 2014 observations from the question, kept as a plain
# data.frame with a Date column (stored as days since 1970-01-01) and the
# matching AverageTemp values.
tbl <- structure(
  list(
    date = structure(
      c(
        16076, 16129, 16143, 16184, 16276,
        16303, 16343, 16374, 16400, 16417
      ),
      tzone = "UTC",
      tclass = "Date",
      class = "Date"
    ),
    AverageTemp = c(
      18.3387983091787, 18.41875, 19.7, 20.2508333333333,
      17.6387426900585, 18.1770897832817, 17.5400297619048,
      17.7547246376812, 17.246412037037, 17.0846153846154
    )
  ),
  row.names = c(NA, -10L),
  class = "data.frame"
)

# Key every observation by the first day of its month (lubridate's `day<-`
# setter), then let complete() add one row for each month in the observed
# range that has no observation. `date` and `AverageTemp` are NA in the rows
# that complete() adds.
tbl %>%
  mutate(month = `day<-`(date, value = 1)) %>%
  complete(
    month = seq.Date(from = min(month), to = max(month), by = "month")
  )
#> # A tibble: 12 x 3
#>    month      date       AverageTemp
#>    <date>     <date>           <dbl>
#>  1 2014-01-01 2014-01-06        18.3
#>  2 2014-02-01 2014-02-28        18.4
#>  3 2014-03-01 2014-03-14        19.7
#>  4 2014-04-01 2014-04-24        20.3
#>  5 2014-05-01 NA                NA  
#>  6 2014-06-01 NA                NA  
#>  7 2014-07-01 2014-07-25        17.6
#>  8 2014-08-01 2014-08-21        18.2
#>  9 2014-09-01 2014-09-30        17.5
#> 10 2014-10-01 2014-10-31        17.8
#> 11 2014-11-01 2014-11-26        17.2
#> 12 2014-12-01 2014-12-13        17.1

作为替代,您可以只获取年和月组成部分,并在两者的组合上使用complete

# Alternative: split each date into year and month components and complete the
# year x month grid.
#
# complete() on its own pads *every* year to 12 months, so for data that does
# not start in January or end in December it would invent NA rows before the
# first observation and after the last one. The filter() trims the grid back
# to the observed window (first observed month through last observed month),
# which makes the result match the seq.Date-based approach.
tbl %>%
  mutate(year = year(date), month = month(date)) %>%
  complete(year = min(year):max(year), month = 1:12) %>%
  filter(
    # make_date() gives the first day of each (year, month) cell.
    make_date(year, month) >= floor_date(min(date, na.rm = TRUE), "month"),
    make_date(year, month) <= max(date, na.rm = TRUE)
  )
#> # A tibble: 12 x 4
#>     year month date       AverageTemp
#>    <dbl> <dbl> <date>           <dbl>
#>  1  2014     1 2014-01-06        18.3
#>  2  2014     2 2014-02-28        18.4
#>  3  2014     3 2014-03-14        19.7
#>  4  2014     4 2014-04-24        20.3
#>  5  2014     5 NA                NA  
#>  6  2014     6 NA                NA  
#>  7  2014     7 2014-07-25        17.6
#>  8  2014     8 2014-08-21        18.2
#>  9  2014     9 2014-09-30        17.5
#> 10  2014    10 2014-10-31        17.8
#> 11  2014    11 2014-11-26        17.2
#> 12  2014    12 2014-12-13        17.1

由 reprex 包(v0.2.1)于 2019-03-20 创建