Question

我没有找到任何资料说明如何重塑这样一个数据框：它包含一个时间列、一个 id 列，以及一个存放多个变量名称的列，而我希望把这些变量分别放到各自的列中。

如果只需要两个类别，那就很简单：

How to reshape data from long to wide format?

但是，我有：

geo    time    indic_na    value
AT    2014Q1    B11        2556
BE    2014Q1    B11        1506.0
...   ...       ...        ...
AT    2014Q1    B1G        72065.1

我希望：

geo    time    B11       B1G       ...
AT     2014Q1  2556      72065.1   ...
AT     2013Q4  2535.4    ...
...    ...     ...       ...       ...
BE     2014Q1  1506.0    86513.0   ...

所以我希望indic_na中的每个唯一字符串都成为一个列变量。获取数据：

# One-time setup: install SmarterPoland, then load it together with zoo
# (zoo supplies as.yearqtr()).
install.packages("SmarterPoland")
library(zoo)
library(SmarterPoland)

# Download the Eurostat table "namq_gdp_c".
GDP <- getEurostatRCV(kod = "namq_gdp_c")

# Convert the quarter labels to zoo's "yearqtr" class.
GDP$time <- as.yearqtr(GDP$time)

# Keep only rows with s_adj "SWDA", unit "MIO_EUR" and time after 1989 Q4.
GDP <- subset(
  GDP,
  s_adj == "SWDA" & unit == "MIO_EUR" & time > "1989Q4"
)

然后我尝试了：

# Attempted reshape from the question: the unique indic_na values are passed
# as `varying` together with direction = "long". This call does not work; it
# stops with the error message quoted right after this snippet.
testvector <- as.vector(unique(GDP$indic_na))
test <- reshape(data = GDP, direction = "long", idvar = "geo", timevar = "time", varying = testvector)

在为 "varying" 参数尝试了许许多多其他写法之后 ;-) 我收到了这条错误消息：

Error in guess(varying) :

  failed to guess time-varying variables from their names

我感觉已经非常接近了！但不知怎的，我没法告诉 R：变量名存放在数据框的第 3 列。我在网上找到的所有例子，要么各个变量已经位于不同的列中，要么只有 id 或时间再加上一列变量。

任何帮助都会很棒！

易于重现的数据

> dput(head(GDP))
structure(list(geo = structure(c(1L, 3L, 4L, 5L, 6L, 7L), .Names = c("SWDA,MIO_EUR,B11,AT", 
"SWDA,MIO_EUR,B11,BE", "SWDA,MIO_EUR,B11,BG", "SWDA,MIO_EUR,B11,CH", 
"SWDA,MIO_EUR,B11,CY", "SWDA,MIO_EUR,B11,CZ"), .Label = c("AT", 
"BA", "BE", "BG", "CH", "CY", "CZ", "DE", "DK", "EA", "EA12", 
"EA17", "EA18", "EE", "EL", "ES", "EU15", "EU27", "EU28", "FI", 
"FR", "HR", "HU", "IE", "IS", "IT", "JP", "LT", "LU", "LV", "ME", 
"MK", "MT", "NL", "NO", "PL", "PT", "RO", "RS", "SE", "SI", "SK", 
"TR", "UK", "US"), class = "factor"), time = structure(c(2014, 
2014, 2014, 2014, 2014, 2014), class = "yearqtr"), indic_na = structure(c(1L, 
1L, 1L, 1L, 1L, 1L), .Names = c("SWDA,MIO_EUR,B11,AT", "SWDA,MIO_EUR,B11,BE", 
"SWDA,MIO_EUR,B11,BG", "SWDA,MIO_EUR,B11,CH", "SWDA,MIO_EUR,B11,CY", 
"SWDA,MIO_EUR,B11,CZ"), .Label = c("B11", "B111", "B112", "B1G", 
"B1GM", "B1GM_XE", "B1GM_XI", "B1GM_XO", "B2G_B3G", "D1", "D2_M_D3", 
"D21_M_D31", "P3", "P3_P5", "P3_S13", "P31_S13", "P31_S14", "P31_S14_S15", 
"P31_S15", "P32_S13", "P5", "P51", "P52", "P52_P53", "P53", "P6", 
"P7"), class = "factor"), value = c(2556.8, 1506, NA, NA, NA, 
3056.1)), .Names = c("geo", "time", "indic_na", "value"), row.names = 7753:7758, class = "data.frame")

Answer 1

感谢您提出这样一个清晰的问题！这在新用户中很少见。我建议使用 reshape2 而不是 reshape。

# Restrict GDP to the rows of interest and to the four columns shown in the
# question's example (geo, time, indic_na, value).
GDP <- subset(
  GDP,
  s_adj == "SWDA" & unit == "MIO_EUR" & time > "1989Q4",
  select = c("geo", "time", "indic_na", "value")
)

# Long -> wide: one row per geo/time pair, one column per indic_na value,
# filled from the `value` column.
library(reshape2)
GDP_wide <- dcast(
  data = GDP,
  formula = geo + time ~ indic_na,
  value.var = "value"
)

> head(GDP_wide)
  geo    time    B11    B111   B112 ...
1  AT 1990 Q1  -64.3 -1407.1 1337.6 
2  AT 1990 Q2  -37.2 -1432.0 1450.3
3  AT 1990 Q3  -39.4 -1457.4 1544.2
4  AT 1990 Q4  -78.7 -1546.7 1592.7
5  AT 1991 Q1 -140.2 -1771.9 1583.0
6  AT 1991 Q2 -183.7 -1938.5 1568.3

Answer 2

您也可以尝试：

library(dplyr)
library(tidyr)

# Filter the rows, keep the four relevant columns, spread every indic_na
# value into its own column, and preview the first two rows.
# NOTE(review): spread() is superseded in current tidyr; the documented
#   successor is pivot_wider(names_from = indic_na, values_from = value).
GDP %>%
  filter(s_adj == "SWDA", unit == "MIO_EUR", time > "1989Q4") %>%
  select(geo, time, indic_na, value) %>%
  spread(key = indic_na, value = value) %>%
  mutate(time = as.yearqtr(time)) %>%
  head(n = 2)
   geo    time   B11    B111   B112     B1G    B1GM B1GM_XE B1GM_XI B1GM_XO
1  AT 1990 Q1 -64.3 -1407.1 1337.6 28198.6 31783.3  -132.0       0       0
2  AT 1990 Q2 -37.2 -1432.0 1450.3 28611.4 32215.3  -256.7       0       0
   B2G_B3G      D1 D21_M_D31 D2_M_D3      P3 P31_S13 P31_S14 P31_S14_S15 P31_S15
1 11123.2 16710.8    3592.1  3913.5 23858.2  3342.0 17493.5     17925.6   429.9
2 11304.7 16950.7    3613.1  3941.5 24150.3  3380.1 17696.8     18137.3   438.2
  P32_S13   P3_P5 P3_S13     P5    P51   P52 P52_P53  P53      P6      P7
1  2614.0 31963.5 5969.7 8105.3 7544.3 440.6   516.1 75.5 11640.6 11785.3
2  2632.1 32329.2 6020.1 8178.9 7652.9 394.7   472.2 77.5 11916.3 11851.2

Answer 3

作为参考，这里是基础R reshape方法：

## Keep the rows of interest and the four relevant columns
## (same subsetting as in @Gregor's answer).
GDP <- subset(
  GDP,
  s_adj == "SWDA" & unit == "MIO_EUR" & time > "1989Q4",
  select = c("geo", "time", "indic_na", "value")
)

## The actual reshaping step: each geo/time pair becomes one row and every
## indic_na value becomes its own "value.<indic_na>" column.
gdp_wide <- reshape(
  data = GDP,
  idvar = c("geo", "time"),
  timevar = "indic_na",
  direction = "wide"
)

head(gdp_wide, 3)
#      geo    time value.B11 value.B111 value.B112 value.B1G value.B1GM
# 7753  AT 2014 Q1    2556.8     -352.6     4295.2   72065.1    79361.4
# 7754  BE 2014 Q1    1506.0         NA         NA   86513.0    96789.0
# 7755  BG 2014 Q1        NA         NA         NA    8496.6     9881.7
#      value.B1GM_XE value.B1GM_XI value.B1GM_XO value.B2G_B3G value.D1
# 7753           177            NA            NA       30409.6  40402.3
# 7754            NA            NA            NA       36251.0  50573.0
# 7755            NA            NA            NA        4475.7   4133.9
#      value.D21_M_D31 value.D2_M_D3 value.P3 value.P31_S13 value.P31_S14
# 7753          7351.4        8540.1  58941.2        9043.1       42602.2
# 7754         10276.0        9965.0  75733.0            NA       50434.0
# 7755          1370.1        1247.5   8099.9         879.7        6296.1
#      value.P31_S14_S15 value.P31_S15 value.P32_S13 value.P3_P5 value.P3_S13
# 7753           43838.4        1221.8        6153.4     75305.2      15154.2
# 7754           51499.0        1065.0            NA     95283.0      24234.0
# 7755            6338.8          40.7         895.6     10230.8       1790.3
#      value.P5 value.P51 value.P52 value.P52_P53 value.P53 value.P6 value.P7
# 7753  16364.0   16647.9    -281.3            68     349.3  46458.8  41903.8
# 7754  19550.0   19694.0    -144.0          -144       0.0  81501.0  79996.0
# 7755   2130.9    1975.9      72.0            72        NA   6850.2   7117.0

下面是您原来的写法，并附上了一些有助于您理解 reshape 用法的注释。不过，现在有了 dcast.data.table，reshape 就开始显得很慢了。

# Annotated copy of the call from the question. It is shown to explain each
# argument, not as a working solution.
reshape(data = GDP,            # This is correct
        direction = "long",    # Your data are already "long". You're going from
                               #   "long" to "wide", so set it to "wide".
        idvar = "geo",         # idvars are like the row stubs in the resulting 
                               #   data. You can have multiple idvars (use `c()`),
                               #   e.g. c("geo", "time") for this data.
        timevar = "time",      # timevars are like the colnames or how you want 
                               #   to spread the data out in a wide form; for
                               #   this data that is "indic_na", not "time".
        varying = testvector)  # If you only have one value column, this argument
                               #   is rarely necessary. I generally set the 
                               #   "data" argument to just have the columns of 
                               #   interest so that I don't need to worry about
                               #   this argument or to use the `drop` argument.

在R中重构数据帧，其中包含id，time和一个包含多个数据变量的列

易于重现的数据

3 个答案: