从长到宽转换并创建多个新列

时间:2019-12-16 10:54:31

标签: r dplyr tidyverse reshape2

我正在尝试将数据框从长格式转换为宽格式。当前有一个 InputCode 列,其中包含 InputA 和 InputB 两种输入代码,我需要让它们各自成为独立的列,并以“DataValue”中的值填充。我一直在尝试使用 spread 和 dcast:

# Attempt 1 (tidyr): turn each InputCode level into its own column,
# filled with the matching DataValue.
data_wide <- spread(oldData_long, InputCode, DataValue)

# Attempt 2 (reshape2): the same reshape expressed as a casting formula.
# Every id variable on the left-hand side must be joined with `+`; the
# original call was missing the `+` between InputName and DataYear, which
# is a parse error.
# NOTE(review): the dput data further down has a UnitCode column but no
# InputUnit column -- confirm which name the real data frame uses.
data_wide2 <- dcast(
  oldData_long,
  Indicator + IndicatorID + InputName + DataYear + Country + Division +
    InputUnit ~ InputCode,
  value.var = "DataValue"
)

但是,尽管创建了输入A和输入B列,但数据框中的行数却保持不变(84)而不是变为42。只要输入A有一个值,输入B的列中就会有NA,反之亦然。

此外,理想情况下,每个输入代码都应有自己的 InputUnit 列,例如“InputAUnit”,因为该值在各输入代码之间也不相同,在展开(spread)数据时可能正是它导致了上述问题。InputName 也是同样的情况,但我不知道如何把这些信息整齐地提取出来。

任何帮助将不胜感激!

dput:

    structure(list(ID = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Indicator = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "Waste Generated", class = "factor"), IndicatorID = c(11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
11L, 11L, 11L, 11L, 11L), InputCode = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L), .Label = c("InputA", "InputB"), class = "factor"), InputName = structure(c(2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = c("Waste Generated - Waste incinerated", 
"Waste Generated - Waste sent to landfill"), class = "factor"), 
    DataValue = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 5L, 1L, 7L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    4L, 6L, 8L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 9L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 10L, 
    3L), .Label = c("0", "155", "19", "2,898.00", "20,462.34", 
    "22.317", "4.368", "40", "6,695.65", "8.998"), class = "factor"), 
    UnitCode = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = "t", class = "factor"), DataYear = c(2009L, 2009L, 
    2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 
    2009L, 2009L, 2009L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 
    2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2011L, 
    2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
    2011L, 2011L, 2011L, 2011L, 2009L, 2009L, 2009L, 2009L, 2009L, 
    2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 2009L, 
    2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 2010L, 
    2010L, 2010L, 2010L, 2010L, 2010L, 2011L, 2011L, 2011L, 2011L, 
    2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 
    2011L), Country = structure(c(4L, 1L, 2L, 3L, 5L, 6L, 7L, 
    8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 3L, 5L, 6L, 
    7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 3L, 5L, 
    6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 3L, 
    5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 2L, 
    3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 4L, 1L, 
    2L, 3L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L), .Label = c("Afghanistan", 
    "Albania", "Algeria", "All", "American Samoa", "Andorra", 
    "Angola", "Antigua and Barbuda", "Argentina", "Armenia", 
    "Aruba", "Australia", "Austria", "Azerbaijan"), class = "factor"), 
    ISO = structure(c(5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 7L, 8L, 
    1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 7L, 
    8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 
    7L, 8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 
    7L, 8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 
    7L, 8L, 1L, 11L, 12L, 13L, 5L, 2L, 4L, 14L, 9L, 6L, 3L, 10L, 
    7L, 8L, 1L, 11L, 12L, 13L), .Label = c("ABW", "AFG", "AGO", 
    "ALB", "ALL", "AND", "ARG", "ARM", "ASM", "ATG", "AUS", "AUT", 
    "AZE", "DZA"), class = "factor"), Division = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Test", class = "factor"), 
    FurtherDetails1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L), .Label = "Test1", class = "factor"), FurtherDetails2 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Test2", class = "factor")), class = "data.frame", row.names = c(NA, 
-84L))

这将是理想的输出:

    structure(list(ID = c(NA, NA, NA, NA, NA, NA), Indicator = structure(c(1L, 
1L, 1L, 1L, 1L, 1L), .Label = "Waste Generated", class = "factor"), 
    IndicatorID = c(11L, 11L, 11L, 11L, 11L, 11L), DataYear = c(2009L, 
    2009L, 2009L, 2009L, 2009L, 2009L), Country = structure(c(4L, 
    1L, 2L, 3L, 5L, 6L), .Label = c("Afghanistan", "Albania", 
    "Algeria", "All", "American Samoa", "Andorra", "Angola", 
    "Antigua and Barbuda", "Argentina", "Armenia", "Aruba", "Australia", 
    "Austria", "Azerbaijan"), class = "factor"), ISO = structure(c(5L, 
    2L, 4L, 14L, 9L, 6L), .Label = c("ABW", "AFG", "AGO", "ALB", 
    "ALL", "AND", "ARG", "ARM", "ASM", "ATG", "AUS", "AUT", "AZE", 
    "DZA"), class = "factor"), Division = structure(c(1L, 1L, 
    1L, 1L, 1L, 1L), .Label = "Test", class = "factor"), FurtherDetails1 = structure(c(1L, 
    1L, 1L, 1L, 1L, 1L), .Label = "Test1", class = "factor"), 
    FurtherDetails2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L), .Label = "Test2", class = "factor"), 
    InputA = c(0L, 0L, 0L, 0L, 0L, 0L), InputAUnit = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L), .Label = c("", "t"), class = "factor"), 
    InputAName = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", 
    "Waste Generated - Waste sent to landfill"), class = "factor"), 
    InputB = c(0L, 0L, 0L, 0L, 0L, 0L), InputBUnit = structure(c(2L, 
    2L, 2L, 2L, 2L, 2L), .Label = c("", "t"), class = "factor"), 
    InputBName = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("", 
    "Waste Generated - Waste incinerated"), class = "factor")), row.names = c(NA, 
6L), class = "data.frame")

谢谢!

1 个答案:

答案 0 :(得分:2)

可能的tidyr解决方案。

library(tidyr)

# Widen three value columns at once: for every level of InputCode, create
# one DataValue, one UnitCode and one InputName column. The new columns
# are named <value column>_<InputCode level>, e.g. DataValue_InputA.
# All remaining columns are used to identify a row in the result.
out <- pivot_wider(
  data = oldData_long,
  names_from = InputCode,
  values_from = c(DataValue, UnitCode, InputName)
)

# Auto-print the reshaped tibble.
out
# A tibble: 42 x 15
   ID    Indicator IndicatorID DataYear Country ISO   Division FurtherDetails1 FurtherDetails2 DataValue_InputA DataValue_InputB
   <lgl> <fct>           <int>    <int> <fct>   <fct> <fct>    <fct>           <fct>           <fct>            <fct>           
 1 NA    Waste Ge…          11     2009 All     ALL   Test     Test1           Test2           0                0               
 2 NA    Waste Ge…          11     2009 Afghan… AFG   Test     Test1           Test2           0                0               
 3 NA    Waste Ge…          11     2009 Albania ALB   Test     Test1           Test2           0                0               
 4 NA    Waste Ge…          11     2009 Algeria DZA   Test     Test1           Test2           0                0               
 5 NA    Waste Ge…          11     2009 Americ… ASM   Test     Test1           Test2           0                0               
 6 NA    Waste Ge…          11     2009 Andorra AND   Test     Test1           Test2           0                0               
 7 NA    Waste Ge…          11     2009 Angola  AGO   Test     Test1           Test2           0                0               
 8 NA    Waste Ge…          11     2009 Antigu… ATG   Test     Test1           Test2           0                0               
 9 NA    Waste Ge…          11     2009 Argent… ARG   Test     Test1           Test2           0                0               
10 NA    Waste Ge…          11     2009 Armenia ARM   Test     Test1           Test2           0                0               
# … with 32 more rows, and 4 more variables: UnitCode_InputA <fct>, UnitCode_InputB <fct>, InputName_InputA <fct>, InputName_InputB <fct>

str(out)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame':   42 obs. of  15 variables:
 $ ID              : logi  NA NA NA NA NA NA ...
 $ Indicator       : Factor w/ 1 level "Waste Generated": 1 1 1 1 1 1 1 1 1 1 ...
 $ IndicatorID     : int  11 11 11 11 11 11 11 11 11 11 ...
 $ DataYear        : int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
 $ Country         : Factor w/ 14 levels "Afghanistan",..: 4 1 2 3 5 6 7 8 9 10 ...
 $ ISO             : Factor w/ 14 levels "ABW","AFG","AGO",..: 5 2 4 14 9 6 3 10 7 8 ...
 $ Division        : Factor w/ 1 level "Test": 1 1 1 1 1 1 1 1 1 1 ...
 $ FurtherDetails1 : Factor w/ 1 level "Test1": 1 1 1 1 1 1 1 1 1 1 ...
 $ FurtherDetails2 : Factor w/ 1 level "Test2": 1 1 1 1 1 1 1 1 1 1 ...
 $ DataValue_InputA: Factor w/ 10 levels "0","155","19",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ DataValue_InputB: Factor w/ 10 levels "0","155","19",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ UnitCode_InputA : Factor w/ 1 level "t": 1 1 1 1 1 1 1 1 1 1 ...
 $ UnitCode_InputB : Factor w/ 1 level "t": 1 1 1 1 1 1 1 1 1 1 ...
 $ InputName_InputA: Factor w/ 2 levels "Waste Generated - Waste incinerated",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ InputName_InputB: Factor w/ 2 levels "Waste Generated - Waste incinerated",..: 1 1 1 1 1 1 1 1 1 1 ...