我没有找到任何资料说明:当数据框包含一个时间列、一个 id 列以及一个存放多个变量名的列时,应该如何对它进行重塑。我希望把这些变量分别放到不同的列中。
如果只需要两个类别,那就很简单:
How to reshape data from long to wide format?
但是,我有:
geo time indic_na value
AT 2014Q1 B11 2556
BE 2014Q1 B11 1506.0
... ... ... ...
AT 2014Q1 B1G 72065.1
我希望:
geo time B11 B1G ...
AT 2014Q1 2556 72065.1 ...
AT 2013Q4 2535.4 ...
... ... ... ... ...
BE 2014Q1 1506.0 86513.0 ...
所以我希望indic_na中的每个唯一字符串都成为一个列变量。获取数据:
# One-time setup: install SmarterPoland (presumably the package that provides
# getEurostatRCV(), since it is attached right before that call).
install.packages("SmarterPoland")
library(zoo)
library(SmarterPoland)
# Fetch the table identified by the code "namq_gdp_c" into a data frame.
GDP <- getEurostatRCV(kod = "namq_gdp_c")
# Convert the time column with as.yearqtr() so it can be compared by quarter.
GDP$time = as.yearqtr(GDP$time)
# Keep only rows with s_adj "SWDA", unit "MIO_EUR" and a time after 1989Q4.
GDP <- subset(GDP, (s_adj == "SWDA") & (unit == "MIO_EUR") & (time > "1989Q4"))
然后我尝试了:
# Distinct values of the indic_na column, converted to a plain vector.
testvector <- as.vector(unique(GDP$indic_na))
# Reshape attempt from the question: this is the call that produces the error
# message quoted right after it.
test <- reshape(data = GDP, direction = "long", idvar = "geo", timevar = "time", varying = testvector)
在尝试了 "varying" 参数的许多(maaany)其他写法之后 ;-) 我收到此错误消息:
Error in guess(varying) :
  failed to guess time-varying variables from their names
(即:guess(varying) 出错:无法从变量名中猜出随时间变化的变量)
我感觉已经非常接近了!但不知怎的,我没法告诉 R:变量名存放在我的数据框的第 3 列中。我在网上找到的所有例子,要么不同的变量本来就在不同的列中,要么只有 id 或时间再加上一列变量。
任何帮助都会很棒!
> dput(head(GDP))
structure(list(geo = structure(c(1L, 3L, 4L, 5L, 6L, 7L), .Names = c("SWDA,MIO_EUR,B11,AT",
"SWDA,MIO_EUR,B11,BE", "SWDA,MIO_EUR,B11,BG", "SWDA,MIO_EUR,B11,CH",
"SWDA,MIO_EUR,B11,CY", "SWDA,MIO_EUR,B11,CZ"), .Label = c("AT",
"BA", "BE", "BG", "CH", "CY", "CZ", "DE", "DK", "EA", "EA12",
"EA17", "EA18", "EE", "EL", "ES", "EU15", "EU27", "EU28", "FI",
"FR", "HR", "HU", "IE", "IS", "IT", "JP", "LT", "LU", "LV", "ME",
"MK", "MT", "NL", "NO", "PL", "PT", "RO", "RS", "SE", "SI", "SK",
"TR", "UK", "US"), class = "factor"), time = structure(c(2014,
2014, 2014, 2014, 2014, 2014), class = "yearqtr"), indic_na = structure(c(1L,
1L, 1L, 1L, 1L, 1L), .Names = c("SWDA,MIO_EUR,B11,AT", "SWDA,MIO_EUR,B11,BE",
"SWDA,MIO_EUR,B11,BG", "SWDA,MIO_EUR,B11,CH", "SWDA,MIO_EUR,B11,CY",
"SWDA,MIO_EUR,B11,CZ"), .Label = c("B11", "B111", "B112", "B1G",
"B1GM", "B1GM_XE", "B1GM_XI", "B1GM_XO", "B2G_B3G", "D1", "D2_M_D3",
"D21_M_D31", "P3", "P3_P5", "P3_S13", "P31_S13", "P31_S14", "P31_S14_S15",
"P31_S15", "P32_S13", "P5", "P51", "P52", "P52_P53", "P53", "P6",
"P7"), class = "factor"), value = c(2556.8, 1506, NA, NA, NA,
3056.1)), .Names = c("geo", "time", "indic_na", "value"), row.names = 7753:7758, class = "data.frame")
答案 0 :(得分:3)
感谢您提出这样一个明确的问题!这对新用户来说很少见。我建议使用 reshape2 而不是 reshape。
# Apply the same row filter as in the question, and keep only the four columns
# that take part in the reshape.
GDP <- subset(GDP, (s_adj == "SWDA") & (unit == "MIO_EUR") & (time > "1989Q4"),
select = c("geo", "time", "indic_na", "value"))
# Making your data match your example
library(reshape2)
# Cast long to wide: one row per geo/time pair, one column per distinct
# indic_na value, with the cells taken from the value column.
GDP_wide <- dcast(GDP, geo + time ~ indic_na, value.var = "value")
> head(GDP_wide)
geo time B11 B111 B112 ...
1 AT 1990 Q1 -64.3 -1407.1 1337.6
2 AT 1990 Q2 -37.2 -1432.0 1450.3
3 AT 1990 Q3 -39.4 -1457.4 1544.2
4 AT 1990 Q4 -78.7 -1546.7 1592.7
5 AT 1991 Q1 -140.2 -1771.9 1583.0
6 AT 1991 Q2 -183.7 -1938.5 1568.3
答案 1 :(得分:2)
您也可以尝试:
library(dplyr)
library(tidyr)
# as.yearqtr() used below is not attached in this snippet; the question loads
# zoo, which is presumably where it comes from.
GDP %>%
# Same row filter as in the question.
filter(s_adj=="SWDA" & unit=="MIO_EUR" & time >"1989Q4") %>%
# Keep the two id columns, the key column and the value column.
select(geo, time, indic_na, value) %>%
# Long to wide: one new column per distinct indic_na, filled from value.
# NOTE(review): spread() is superseded in current tidyr;
# pivot_wider(names_from = indic_na, values_from = value) is the modern form,
# but its row/column ordering may differ from the output shown below.
spread(indic_na, value) %>%
# Convert time with as.yearqtr() again (presumably because the class is lost
# in the previous step; TODO confirm).
mutate(time=as.yearqtr(time)) %>%
# Show only the first two rows.
head(2)
geo time B11 B111 B112 B1G B1GM B1GM_XE B1GM_XI B1GM_XO
1 AT 1990 Q1 -64.3 -1407.1 1337.6 28198.6 31783.3 -132.0 0 0
2 AT 1990 Q2 -37.2 -1432.0 1450.3 28611.4 32215.3 -256.7 0 0
B2G_B3G D1 D21_M_D31 D2_M_D3 P3 P31_S13 P31_S14 P31_S14_S15 P31_S15
1 11123.2 16710.8 3592.1 3913.5 23858.2 3342.0 17493.5 17925.6 429.9
2 11304.7 16950.7 3613.1 3941.5 24150.3 3380.1 17696.8 18137.3 438.2
P32_S13 P3_P5 P3_S13 P5 P51 P52 P52_P53 P53 P6 P7
1 2614.0 31963.5 5969.7 8105.3 7544.3 440.6 516.1 75.5 11640.6 11785.3
2 2632.1 32329.2 6020.1 8178.9 7652.9 394.7 472.2 77.5 11916.3 11851.2
答案 2 :(得分:1)
作为参考,下面是基础 R 的 reshape 方法:
## Subsetting as per @Gregor's answer
## (same row filter as in the question, keeping only the four needed columns)
GDP <- subset(GDP, (s_adj == "SWDA") & (unit == "MIO_EUR") & (time > "1989Q4"),
select = c("geo", "time", "indic_na", "value"))
## The actual reshaping step
## idvar: the columns that together identify one row of the wide result.
## timevar: the column whose distinct values become the new wide columns;
## the result columns are named value.<indic_na>, as the output below shows.
gdp_wide <- reshape(GDP, direction = "wide",
idvar=c("geo", "time"),
timevar="indic_na")
## Preview the first three rows of the wide data.
head(gdp_wide, 3)
# geo time value.B11 value.B111 value.B112 value.B1G value.B1GM
# 7753 AT 2014 Q1 2556.8 -352.6 4295.2 72065.1 79361.4
# 7754 BE 2014 Q1 1506.0 NA NA 86513.0 96789.0
# 7755 BG 2014 Q1 NA NA NA 8496.6 9881.7
# value.B1GM_XE value.B1GM_XI value.B1GM_XO value.B2G_B3G value.D1
# 7753 177 NA NA 30409.6 40402.3
# 7754 NA NA NA 36251.0 50573.0
# 7755 NA NA NA 4475.7 4133.9
# value.D21_M_D31 value.D2_M_D3 value.P3 value.P31_S13 value.P31_S14
# 7753 7351.4 8540.1 58941.2 9043.1 42602.2
# 7754 10276.0 9965.0 75733.0 NA 50434.0
# 7755 1370.1 1247.5 8099.9 879.7 6296.1
# value.P31_S14_S15 value.P31_S15 value.P32_S13 value.P3_P5 value.P3_S13
# 7753 43838.4 1221.8 6153.4 75305.2 15154.2
# 7754 51499.0 1065.0 NA 95283.0 24234.0
# 7755 6338.8 40.7 895.6 10230.8 1790.3
# value.P5 value.P51 value.P52 value.P52_P53 value.P53 value.P6 value.P7
# 7753 16364.0 16647.9 -281.3 68 349.3 46458.8 41903.8
# 7754 19550.0 19694.0 -144.0 -144 0.0 81501.0 79996.0
# 7755 2130.9 1975.9 72.0 72 NA 6850.2 7117.0
以下是您原来的写法,并附上了一些有助于您使用 reshape 的注释。不过,现在有了 dcast.data.table,reshape 开始显得很慢了。
# Annotated copy of the reshape() call from the question. It is shown to
# explain each argument and is not a working solution as written (see the
# note on `direction`).
reshape(data = GDP, # This is correct
direction = "long", # Your data are already "long". You're going from
# "long" to "wide", so set it to "wide".
idvar = "geo", # idvars are like the row stubs in the resulting
# data. You can have multiple idvars (use `c()`)
timevar = "time", # timevars are like the colnames or how you want
# to spread the data out in a wide form.
varying = testvector) # If you only have one value column, this argument
# is rarely necessary. I generally set the
# "data" argument to just have the columns of
# interest so that I don't need to worry about
# this argument or to use the `drop` argument.