下面是我的数据框(my_df)。我想把它转换成时间序列对象,用来预测2020年的数值,但不知道该如何转换这种格式的数据。
我尝试用下面的代码将其转换为时间序列,但位置列变成了NA,而且结果也不是我想要的时间序列格式。
我的尝试
# Asker's original attempt: drops columns 2 and 3 and passes the remaining
# columns (including the character location column) to ts() as an annual
# series running from 2009 to 2014.
ts(my_df[,c(-2,-3)], start=c(2009), end=c(2014), frequency=1)
下面的数据框my_df
structure(list(`Geogrphical Location` = c("United States", "Northeast",
"Midwest", "South", "West", ".Alabama", ".Alaska", ".Arizona",
".Arkansas", ".California", ".Colorado", ".Connecticut", ".Delaware",
".District of Columbia", ".Florida", ".Georgia", ".Hawaii", ".Idaho",
".Illinois", ".Indiana", ".Iowa", ".Kansas", ".Kentucky", ".Louisiana",
".Maine", ".Maryland", ".Massachusetts", ".Michigan", ".Minnesota",
".Mississippi", ".Missouri", ".Montana", ".Nebraska", ".Nevada",
".New Hampshire", ".New Jersey", ".New Mexico", ".New York",
".North Carolina", ".North Dakota", ".Ohio", ".Oklahoma", ".Oregon",
".Pennsylvania", ".Rhode Island", ".South Carolina", ".South Dakota",
".Tennessee", ".Texas", ".Utah", ".Vermont", ".Virginia", ".Washington",
".West Virginia", ".Wisconsin", ".Wyoming", "Puerto Rico"), Census = c(308745538,
55317240, 66927001, 114555744, 71945553, 4779736, 710231, 6392017,
2915918, 37253956, 5029196, 3574097, 897934, 601723, 18801310,
9687653, 1360301, 1567582, 12830632, 6483802, 3046355, 2853118,
4339367, 4533372, 1328361, 5773552, 6547629, 9883640, 5303925,
2967297, 5988927, 989415, 1826341, 2700551, 1316470, 8791894,
2059179, 19378102, 9535483, 672591, 11536504, 3751351, 3831074,
12702379, 1052567, 4625364, 814180, 6346105, 25145561, 2763885,
625741, 8001024, 6724540, 1852994, 5686986, 563626, 3725789),
`Estimates Base` = c(308758105, 55318353, 66929825, 114563005,
71946922, 4780131, 710249, 6392301, 2916025, 37254522, 5029324,
3574114, 897936, 601766, 18804592, 9688680, 1360301, 1567650,
12831574, 6484136, 3046869, 2853129, 4339344, 4533479, 1328364,
5773786, 6547813, 9884129, 5303924, 2968103, 5988928, 989414,
1826334, 2700691, 1316461, 8791953, 2059198, 19378110, 9535688,
672591, 11536727, 3751615, 3831072, 12702857, 1052940, 4625410,
814195, 6346298, 25146100, 2763888, 625741, 8001041, 6724545,
1853011, 5687289, 563767, 3726157), `2010` = c(309348193,
55388056, 66978602, 114863114, 72118421, 4785492, 714031,
6408312, 2921995, 37332685, 5048644, 3579899, 899816, 605183,
18849098, 9713521, 1363945, 1571010, 12841578, 6490528, 3050738,
2858850, 4348662, 4544996, 1327730, 5788584, 6565524, 9877495,
5311147, 2970322, 5996118, 990641, 1830051, 2703284, 1316872,
8803729, 2064756, 19402640, 9558915, 674526, 11540983, 3759603,
3838048, 12712343, 1053337, 4635943, 816325, 6356671, 25244310,
2775326, 625982, 8025773, 6743226, 1854230, 5690263, 564513,
3721525), `2011` = c(311663358, 55632766, 67153331, 116061801,
72815460, 4799918, 722713, 6467163, 2939493, 37676861, 5118360,
3589893, 907924, 620477, 19096952, 9811610, 1377864, 1584143,
12860012, 6516480, 3065223, 2869503, 4369354, 4575404, 1328231,
5843603, 6611923, 9876213, 5348562, 2978162, 6010717, 997821,
1842283, 2718379, 1318473, 8841243, 2077756, 19519529, 9650963,
685476, 11544824, 3786274, 3868031, 12744293, 1052451, 4672637,
824398, 6397634, 25646389, 2816124, 626730, 8110035, 6822520,
1854972, 5709640, 567725, 3678732), `2012` = c(313998379,
55829059, 67332320, 117299171, 73537829, 4815960, 731089,
6549634, 2950685, 38011074, 5189867, 3593795, 916993, 635327,
19344156, 9914668, 1391820, 1595911, 12870798, 6537743, 3076310,
2885262, 4384799, 4603429, 1328895, 5889651, 6658008, 9887238,
5380285, 2984945, 6025415, 1005196, 1855725, 2752565, 1321182,
8873211, 2083784, 19602769, 9746175, 702087, 11550839, 3817054,
3899116, 12771854, 1052901, 4720760, 834441, 6454306, 26071655,
2855782, 626444, 8192048, 6895226, 1856560, 5726177, 576765,
3634488), `2013` = c(316204908, 55988771, 67543948, 118424320,
74247869, 4829479, 736879, 6624617, 2958663, 38335203, 5267603,
3596003, 925395, 649165, 19582022, 9984938, 1406481, 1612011,
12879505, 6569102, 3091930, 2892821, 4400477, 4626402, 1329076,
5931129, 6706786, 9898982, 5418521, 2990482, 6042711, 1014314,
1868559, 2786464, 1322687, 8899162, 2085193, 19673546, 9841590,
724019, 11570022, 3852415, 3925751, 12781338, 1053033, 4767894,
844922, 6494821, 26473525, 2902663, 627140, 8262692, 6968006,
1853231, 5742854, 582684, 3593077), `2014` = c(318563456,
56116791, 67726368, 119696311, 75023986, 4843214, 736705,
6719993, 2966912, 38680810, 5349648, 3591873, 934948, 659005,
19888741, 10087231, 1416349, 1633532, 12867544, 6595233,
3108030, 2899360, 4413057, 4647880, 1330719, 5967295, 6749911,
9915767, 5453109, 2992400, 6060930, 1022867, 1881145, 2833013,
1328743, 8925001, 2083024, 19718515, 9934399, 739904, 11594408,
3877499, 3968371, 12790565, 1054480, 4828430, 852561, 6544663,
26944751, 2941836, 626984, 8317372, 7054196, 1848514, 5758377,
583642, 3534874), `2015` = c(320896618, 56184737, 67838387,
121039206, 75834288, 4853875, 737709, 6817565, 2977853, 38993940,
5448819, 3584730, 944076, 670377, 20244914, 10199398, 1425157,
1652828, 12839047, 6612768, 3121997, 2906721, 4424611, 4668960,
1329453, 5994983, 6784240, 9917715, 5482435, 2989390, 6076204,
1032073, 1893765, 2883758, 1330111, 8935421, 2080328, 19747183,
10035186, 756835, 11605090, 3907414, 4024634, 12791904, 1055607,
4894834, 857919, 6595056, 27429639, 2990632, 626088, 8367587,
7160290, 1841053, 5767891, 586555, 3473181), `2016` = c(323127513,
56209510, 67941429, 122319574, 76657000, 4863300, 741894,
6931071, 2988248, 39250017, 5540545, 3576452, 952065, 681170,
20612439, 10310371, 1428557, 1683140, 12801539, 6633053,
3134693, 2907289, 4436974, 4681666, 1331479, 6016447, 6811779,
9928300, 5519952, 2988726, 6093000, 1042520, 1907116, 2940058,
1334795, 8944469, 2081015, 19745289, 10146788, 757952, 11614373,
3923561, 4093465, 12784227, 1056426, 4961119, 865454, 6651194,
27862596, 3051217, 624594, 8411808, 7288000, 1831102, 5778708,
585501, 3411307)), row.names = c(NA, -57L), class = c("tbl_df",
"tbl", "data.frame"))
请帮我建立一个时间序列对象,以便使用线性回归,对第1列中列出的任意一个州预测2020年的数值。
答案 0 :(得分:1)
tsibble 软件包正是为了简化这一过程而设计的。
library(tidyverse)
library(tsibble)
# Reshape the wide population table into a multivariate `ts` object:
# one column per location, one row per year (2010-2016).
my_ts <- my_df %>%
  rename(Location = "Geogrphical Location") %>%
  select(Location, `2010`:`2016`) %>%
  # pivot_longer() supersedes gather(): one row per Location/Year pair.
  pivot_longer(`2010`:`2016`, names_to = "Year", values_to = "value") %>%
  mutate(
    # Year column names arrive as strings; the tsibble index must be numeric.
    Year = as.numeric(Year),
    # Strip only the leading "." that prefixes state rows; a global
    # gsub("\\.", "", ...) would also delete dots inside a name.
    Location = sub("^\\.", "", Location)
  ) %>%
  # Pass the key as a bare column name; the older `key = id(Location)` form
  # is deprecated in newer tsibble releases.
  as_tsibble(index = Year, key = Location) %>%
  as.ts()
其中大部分只是 tidyverse 代码,用于把数据转换为长格式,并清理变量名和位置值。as_tsibble() 这一行完成了把数据设置为多元时间序列的大部分工作,随后 as.ts() 将其转换为 ts 对象。
my_ts
#> Time Series:
#> Start = 2010
#> End = 2016
#> Frequency = 1
#> Alabama Alaska Arizona Arkansas California Colorado Connecticut
#> 2010 4785492 714031 6408312 2921995 37332685 5048644 3579899
#> 2011 4799918 722713 6467163 2939493 37676861 5118360 3589893
#> 2012 4815960 731089 6549634 2950685 38011074 5189867 3593795
#> 2013 4829479 736879 6624617 2958663 38335203 5267603 3596003
#> 2014 4843214 736705 6719993 2966912 38680810 5349648 3591873
#> 2015 4853875 737709 6817565 2977853 38993940 5448819 3584730
#> 2016 4863300 741894 6931071 2988248 39250017 5540545 3576452
答案 1 :(得分:0)
可以尝试将位置变量转换为因子。不过,这样一来位置会以因子的数字编码显示(标签会被丢弃)。
# Store the location column as a factor; the ts() output then shows its
# integer codes instead of the labels.
df1[["Geogrphical Location"]] <- as.factor(df1[["Geogrphical Location"]])
> ts(df1[, -(2:3)], start=c(2009), end=c(2014), frequency=1)
Time Series:
Start = 2009
End = 2014
Frequency = 1
Geogrphical Location 2010 2011 2012 2013 2014 2015 2016
2009 56 309348193 311663358 313998379 316204908 318563456 320896618 323127513
2010 53 55388056 55632766 55829059 55988771 56116791 56184737 56209510
2011 52 66978602 67153331 67332320 67543948 67726368 67838387 67941429
2012 55 114863114 116061801 117299171 118424320 119696311 121039206 122319574
2013 57 72118421 72815460 73537829 74247869 75023986 75834288 76657000
2014 1 4785492 4799918 4815960 4829479 4843214 4853875 4863300
可以用下面的“编码对照表”查出每个数字编码对应的标签:
# Lookup table pairing each location label with its numeric factor code.
with(
  df1,
  data.frame(
    lbl = `Geogrphical Location`,
    num = as.numeric(`Geogrphical Location`)
  )
)
注意:最好不要在R中的变量名中使用空格。
# Give the first column a syntactic name (no space, spelling corrected).
colnames(df1)[1L] <- "Geographical.Location"
您也可以去掉位置名称开头的点号。
# Strip only the leading "." prefix from each location name. The earlier
# gsub("\\.", "", ...) removed every dot, including any inside a name.
df1$Geographical.Location <- sub("^\\.", "", df1$Geographical.Location)
然后像上面一样做
# Re-encode the cleaned location names as a factor.
df1[["Geographical.Location"]] <- as.factor(df1[["Geographical.Location"]])