从宽数据创建时间序列对象,按列值分组

时间:2018-11-28 20:22:00

标签: r time-series linear-regression

下面是我的数据框(my_df)。我想创建一个时间序列对象来预测2020年的数值,但不知道该如何转换这种格式的数据。

我尝试用下面的代码将其转换为时间序列,但位置(Location)列得到的是NA,而且结果也不是我想要的时间序列格式

我的尝试

# Asker's original (non-working) attempt: drops columns 2-3 and hands the wide
# table straight to ts(), so each row (one location) is read as one time point.
ts(my_df[, -(2:3)], start = 2009, end = 2014, frequency = 1)

下面的数据框my_df

structure(list(`Geogrphical Location` = c("United States", "Northeast", 
"Midwest", "South", "West", ".Alabama", ".Alaska", ".Arizona", 
".Arkansas", ".California", ".Colorado", ".Connecticut", ".Delaware", 
".District of Columbia", ".Florida", ".Georgia", ".Hawaii", ".Idaho", 
".Illinois", ".Indiana", ".Iowa", ".Kansas", ".Kentucky", ".Louisiana", 
".Maine", ".Maryland", ".Massachusetts", ".Michigan", ".Minnesota", 
".Mississippi", ".Missouri", ".Montana", ".Nebraska", ".Nevada", 
".New Hampshire", ".New Jersey", ".New Mexico", ".New York", 
".North Carolina", ".North Dakota", ".Ohio", ".Oklahoma", ".Oregon", 
".Pennsylvania", ".Rhode Island", ".South Carolina", ".South Dakota", 
".Tennessee", ".Texas", ".Utah", ".Vermont", ".Virginia", ".Washington", 
".West Virginia", ".Wisconsin", ".Wyoming", "Puerto Rico"), Census = c(308745538, 
55317240, 66927001, 114555744, 71945553, 4779736, 710231, 6392017, 
2915918, 37253956, 5029196, 3574097, 897934, 601723, 18801310, 
9687653, 1360301, 1567582, 12830632, 6483802, 3046355, 2853118, 
4339367, 4533372, 1328361, 5773552, 6547629, 9883640, 5303925, 
2967297, 5988927, 989415, 1826341, 2700551, 1316470, 8791894, 
2059179, 19378102, 9535483, 672591, 11536504, 3751351, 3831074, 
12702379, 1052567, 4625364, 814180, 6346105, 25145561, 2763885, 
625741, 8001024, 6724540, 1852994, 5686986, 563626, 3725789), 
    `Estimates Base` = c(308758105, 55318353, 66929825, 114563005, 
    71946922, 4780131, 710249, 6392301, 2916025, 37254522, 5029324, 
    3574114, 897936, 601766, 18804592, 9688680, 1360301, 1567650, 
    12831574, 6484136, 3046869, 2853129, 4339344, 4533479, 1328364, 
    5773786, 6547813, 9884129, 5303924, 2968103, 5988928, 989414, 
    1826334, 2700691, 1316461, 8791953, 2059198, 19378110, 9535688, 
    672591, 11536727, 3751615, 3831072, 12702857, 1052940, 4625410, 
    814195, 6346298, 25146100, 2763888, 625741, 8001041, 6724545, 
    1853011, 5687289, 563767, 3726157), `2010` = c(309348193, 
    55388056, 66978602, 114863114, 72118421, 4785492, 714031, 
    6408312, 2921995, 37332685, 5048644, 3579899, 899816, 605183, 
    18849098, 9713521, 1363945, 1571010, 12841578, 6490528, 3050738, 
    2858850, 4348662, 4544996, 1327730, 5788584, 6565524, 9877495, 
    5311147, 2970322, 5996118, 990641, 1830051, 2703284, 1316872, 
    8803729, 2064756, 19402640, 9558915, 674526, 11540983, 3759603, 
    3838048, 12712343, 1053337, 4635943, 816325, 6356671, 25244310, 
    2775326, 625982, 8025773, 6743226, 1854230, 5690263, 564513, 
    3721525), `2011` = c(311663358, 55632766, 67153331, 116061801, 
    72815460, 4799918, 722713, 6467163, 2939493, 37676861, 5118360, 
    3589893, 907924, 620477, 19096952, 9811610, 1377864, 1584143, 
    12860012, 6516480, 3065223, 2869503, 4369354, 4575404, 1328231, 
    5843603, 6611923, 9876213, 5348562, 2978162, 6010717, 997821, 
    1842283, 2718379, 1318473, 8841243, 2077756, 19519529, 9650963, 
    685476, 11544824, 3786274, 3868031, 12744293, 1052451, 4672637, 
    824398, 6397634, 25646389, 2816124, 626730, 8110035, 6822520, 
    1854972, 5709640, 567725, 3678732), `2012` = c(313998379, 
    55829059, 67332320, 117299171, 73537829, 4815960, 731089, 
    6549634, 2950685, 38011074, 5189867, 3593795, 916993, 635327, 
    19344156, 9914668, 1391820, 1595911, 12870798, 6537743, 3076310, 
    2885262, 4384799, 4603429, 1328895, 5889651, 6658008, 9887238, 
    5380285, 2984945, 6025415, 1005196, 1855725, 2752565, 1321182, 
    8873211, 2083784, 19602769, 9746175, 702087, 11550839, 3817054, 
    3899116, 12771854, 1052901, 4720760, 834441, 6454306, 26071655, 
    2855782, 626444, 8192048, 6895226, 1856560, 5726177, 576765, 
    3634488), `2013` = c(316204908, 55988771, 67543948, 118424320, 
    74247869, 4829479, 736879, 6624617, 2958663, 38335203, 5267603, 
    3596003, 925395, 649165, 19582022, 9984938, 1406481, 1612011, 
    12879505, 6569102, 3091930, 2892821, 4400477, 4626402, 1329076, 
    5931129, 6706786, 9898982, 5418521, 2990482, 6042711, 1014314, 
    1868559, 2786464, 1322687, 8899162, 2085193, 19673546, 9841590, 
    724019, 11570022, 3852415, 3925751, 12781338, 1053033, 4767894, 
    844922, 6494821, 26473525, 2902663, 627140, 8262692, 6968006, 
    1853231, 5742854, 582684, 3593077), `2014` = c(318563456, 
    56116791, 67726368, 119696311, 75023986, 4843214, 736705, 
    6719993, 2966912, 38680810, 5349648, 3591873, 934948, 659005, 
    19888741, 10087231, 1416349, 1633532, 12867544, 6595233, 
    3108030, 2899360, 4413057, 4647880, 1330719, 5967295, 6749911, 
    9915767, 5453109, 2992400, 6060930, 1022867, 1881145, 2833013, 
    1328743, 8925001, 2083024, 19718515, 9934399, 739904, 11594408, 
    3877499, 3968371, 12790565, 1054480, 4828430, 852561, 6544663, 
    26944751, 2941836, 626984, 8317372, 7054196, 1848514, 5758377, 
    583642, 3534874), `2015` = c(320896618, 56184737, 67838387, 
    121039206, 75834288, 4853875, 737709, 6817565, 2977853, 38993940, 
    5448819, 3584730, 944076, 670377, 20244914, 10199398, 1425157, 
    1652828, 12839047, 6612768, 3121997, 2906721, 4424611, 4668960, 
    1329453, 5994983, 6784240, 9917715, 5482435, 2989390, 6076204, 
    1032073, 1893765, 2883758, 1330111, 8935421, 2080328, 19747183, 
    10035186, 756835, 11605090, 3907414, 4024634, 12791904, 1055607, 
    4894834, 857919, 6595056, 27429639, 2990632, 626088, 8367587, 
    7160290, 1841053, 5767891, 586555, 3473181), `2016` = c(323127513, 
    56209510, 67941429, 122319574, 76657000, 4863300, 741894, 
    6931071, 2988248, 39250017, 5540545, 3576452, 952065, 681170, 
    20612439, 10310371, 1428557, 1683140, 12801539, 6633053, 
    3134693, 2907289, 4436974, 4681666, 1331479, 6016447, 6811779, 
    9928300, 5519952, 2988726, 6093000, 1042520, 1907116, 2940058, 
    1334795, 8944469, 2081015, 19745289, 10146788, 757952, 11614373, 
    3923561, 4093465, 12784227, 1056426, 4961119, 865454, 6651194, 
    27862596, 3051217, 624594, 8411808, 7288000, 1831102, 5778708, 
    585501, 3411307)), row.names = c(NA, -57L), class = c("tbl_df", 
"tbl", "data.frame"))

请帮我建立一个时间序列对象,以便先拟合线性回归,再对第1列中列出的任意一个州(地区)预测2020年的数值

2 个答案:

答案 0 :(得分:1)

tsibble软件包旨在简化此过程。

library(tidyverse)
library(tsibble)
# Reshape the wide yearly columns (2010-2016) into long form, declare the
# result as a tsibble keyed by location, and convert it to a multivariate
# `ts` object with one column per location.
my_ts <- my_df %>%
  rename(Location = "Geogrphical Location") %>%
  select(Location, `2010`:`2016`) %>%
  # pivot_longer() is the current replacement for the superseded gather().
  pivot_longer(`2010`:`2016`, names_to = "Year", values_to = "value") %>%
  mutate(
    # Column names arrive as character; the index must be numeric years.
    Year = as.numeric(Year),
    # Strip only the leading "." that prefixes state-level rows.
    Location = sub("^\\.", "", Location)
  ) %>%
  # Pass the key column directly; `key = id(...)` is the old tsibble syntax.
  as_tsibble(index = Year, key = Location) %>%
  as.ts()

其中大部分只是tidyverse代码,用于把数据转换为长格式,并清理变量名和位置取值。as_tsibble()这一行完成了把数据设置为多元时间序列的主要工作,随后as.ts()将其转换为ts对象。

my_ts
#> Time Series:
#> Start = 2010 
#> End = 2016 
#> Frequency = 1 
#>      Alabama Alaska Arizona Arkansas California Colorado Connecticut
#> 2010 4785492 714031 6408312  2921995   37332685  5048644     3579899
#> 2011 4799918 722713 6467163  2939493   37676861  5118360     3589893
#> 2012 4815960 731089 6549634  2950685   38011074  5189867     3593795
#> 2013 4829479 736879 6624617  2958663   38335203  5267603     3596003
#> 2014 4843214 736705 6719993  2966912   38680810  5349648     3591873
#> 2015 4853875 737709 6817565  2977853   38993940  5448819     3584730
#> 2016 4863300 741894 6931071  2988248   39250017  5540545     3576452

答案 1 :(得分:0)

可以尝试将位置变量转换为因子。不过这样一来,位置会以因子的整数编码显示(标签会被丢弃)。

# Turn the location column (name spelled as in the data) into a factor.
df1[["Geogrphical Location"]] <- as.factor(df1[["Geogrphical Location"]])

> ts(df1[, -(2:3)], start=c(2009), end=c(2014), frequency=1)
Time Series:
Start = 2009 
End = 2014 
Frequency = 1 
     Geogrphical Location      2010      2011      2012      2013      2014      2015      2016
2009                   56 309348193 311663358 313998379 316204908 318563456 320896618 323127513
2010                   53  55388056  55632766  55829059  55988771  56116791  56184737  56209510
2011                   52  66978602  67153331  67332320  67543948  67726368  67838387  67941429
2012                   55 114863114 116061801 117299171 118424320 119696311 121039206 122319574
2013                   57  72118421  72815460  73537829  74247869  75023986  75834288  76657000
2014                    1   4785492   4799918   4815960   4829479   4843214   4853875   4863300

可以用下面的代码生成一个“编码对照表”(codebook),查看每个数字编码对应的位置名称:

# Codebook: list each location label next to its numeric code.
with(df1, data.frame(
  lbl = `Geogrphical Location`,
  num = as.numeric(`Geogrphical Location`)
))

注意:最好不要在R中的变量名中使用空格。

# Give the first column a syntactic name (no embedded space).
colnames(df1)[1] <- "Geographical.Location"

您也可以去掉名称开头的点(.)。

# Remove only the leading "." that prefixes state-level rows; the anchored
# sub() leaves any other period inside a name untouched.
df1$Geographical.Location <- sub("^\\.", "", df1$Geographical.Location)

然后像上面一样做

# Convert the renamed location column to a factor.
df1[["Geographical.Location"]] <- as.factor(df1[["Geographical.Location"]])