R:将数据框中的数据从纵向列表重新排列为并排表格

时间:2015-12-12 14:40:37

标签: r dataframe time-series

我在R中有一个像这样的数据框:

Company | Date | Revenues
 c1     | d1   | r1
 c1     | d2   | r2
 c1     | d3   | r3
 c2     | d1   | r4
 c2     | d2   | r5
 c2     | d3   | r6
 c3     | d2   | r7
 c3     | d3   | r8

我想将格式更改为:

 Company|Date|Revs|YOY|Company|Date|Revs|YOY|Company|Date|Revenues|YOY
   c1   |d1  |r1  |y1 |  c2   |d1  |r4  |y4 |  c3   |NA  |NA      |NA
   c1   |d2  |r2  |y2 |  c2   |d2  |r5  |y5 |  c3   |d2  |r7      |y7
   c1   |d3  |r3  |y3 |  c2   |d3  |r6  |y6 |  c3   |d3  |r8      |y8

也就是说,我想重新排列数据框,使每家公司的数据各自成组并彼此并排放置,同时计算收入的同比(YOY)变化,并将其作为新列插入到收入列之后。

观测值按季度/月记录,但有些数据缺失,因此我希望在缺失的字段中填入NA,并按照第二个表中所示对齐日期。

我附上了一些示例数据,但经过大量搜索,我仍然不知道如何处理这个问题。我研究过reshape,但它似乎无法完成我需要的操作。我想,也许循环才是解决这个问题的正确方法?

以下是来自dput的一组示例输入数据,其中包含一些缺失值:

structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 7L, 54L, 55L, 
56L, 57L, 58L, 59L, 60L, 61L, 82L, 83L, 84L, 85L, 86L, 87L, 88L, 
89L, 596L, 597L, 598L, 599L, 600L, 601L), Company = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Company 1", 
"Company 2", "Company 3", "Company 4"), class = "factor"), Date = structure(c(8L, 
7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L, 
7L, 6L, 5L, 4L, 3L, 2L, 1L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", 
"2015-Q4"), class = "factor"), Revenue = structure(c(16L, 11L, 
12L, 7L, 10L, 14L, 9L, 8L, 15L, 2L, 4L, 1L, 30L, 6L, 3L, 5L, 
13L, 26L, 27L, 21L, 23L, 24L, 25L, 22L, 29L, 19L, 20L, 17L, 28L, 
18L), .Label = c("$1,009,306", "$1,129,899", "$1,173,698", "$1,290,262", 
"$1,329,210", "$1,338,107", "$1,342,401", "$1,455,988", "$1,632,968", 
"$1,697,235", "$1,716,398", "$1,756,648", "$17,215,900", "$2,405,874", 
"$216,536", "$422,063", "$47,665,398", "$52,772,667", "$53,941,124", 
"$54,059,612", "$54,548,057", "$54,946,768", "$55,735,568", "$58,099,615", 
"$59,753,619", "$59,955,413", "$60,655,988", "$66,236,339", "$79,135,033", 
"$962,366"), class = "factor")), .Names = c("X", "Company", "Date", 
"Revenue"), class = "data.frame", row.names = c(NA, -30L))

以下是转换后我想要的数据:

structure(list(Company = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = "Company 1", class = "factor"), Date = structure(c(8L, 
7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", "2014-Q2", 
"2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", "2015-Q4"
), class = "factor"), Revenue = structure(c(8L, 5L, 6L, 1L, 4L, 
7L, 3L, 2L), .Label = c("$134", "$146", "$163", "$170", "$172", 
"$176", "$241", "$42"), class = "factor"), YOY = structure(c(2L, 
1L, 4L, 3L, NA, NA, NA, NA), .Label = c("-29%", "-75%", "-8%", 
"8%"), class = "factor"), Company.1 = structure(c(1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L), .Label = "Company 2", class = "factor"), 
Date.1 = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", 
"2015-Q4"), class = "factor"), Revenue.1 = structure(c(7L, 
2L, 4L, 1L, 8L, 6L, 3L, 5L), .Label = c("$101", "$113", "$117", 
"$129", "$133", "$134", "$22", "$96"), class = "factor"), 
YOY.1 = structure(c(3L, 1L, 4L, 2L, NA, NA, NA, NA), .Label = c("-16%", 
"-24%", "-77%", "10%"), class = "factor"), Company.2 = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Company 3", class = "factor"), 
Date.2 = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", 
"2015-Q4"), class = "factor"), Revenue.2 = structure(c(1L, 
7L, 8L, 2L, 4L, 5L, 6L, 3L), .Label = c("$1,722", "$5,455", 
"$5,495", "$5,574", "$5,810", "$5,975", "$5,996", "$6,066"
), class = "factor"), YOY.2 = structure(c(2L, 4L, 3L, 1L, 
NA, NA, NA, NA), .Label = c("-1%", "-69%", "2%", "3%"), class = "factor"), 
Company.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Company 4", class = "factor"), 
Date.3 = structure(c(NA, NA, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", 
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2"), class = "factor"), 
Revenue.3 = structure(c(NA, NA, 6L, 3L, 4L, 1L, 5L, 2L), .Label = c("$4,767", 
"$5,277", "$5,394", "$5,406", "$6,624", "$7,914"), class = "factor"), 
YOY.3 = structure(c(NA, NA, 1L, 2L, NA, NA, NA, NA), .Label = c("19%", 
"2%"), class = "factor"), X = c(NA, NA, NA, NA, NA, NA, NA, 
NA)), .Names = c("Company", "Date", "Revenue", "YOY", "Company.1", 
"Date.1", "Revenue.1", "YOY.1", "Company.2", "Date.2", "Revenue.2", 
"YOY.2", "Company.3", "Date.3", "Revenue.3", "YOY.3", "X"), class =     "data.frame", row.names = c(NA, 
-8L))

谢谢!

1 个答案:

答案 0 :(得分:0)

这是一个比较粗糙(quick and dirty)的解决方案:

# How to use:
# output <- proccessData(calculateYoY(realData))

# Compute year-over-year (YoY) revenue growth for every company.
#
# Args:
#   df:  data.frame with the columns `Company` and `Revenue`. `Revenue` may
#        be numeric already, or currency text / factor such as "$1,234,567".
#        Within each company the rows must be ordered from the newest period
#        to the oldest one, because each row is compared with the row `lag`
#        positions below it.
#   lag: number of rows between a period and the same period one year
#        earlier (4 for quarterly data, 12 for monthly data).
#
# Returns:
#   `df` with `Revenue` converted to double and a numeric `YoY` column
#   holding (current - previous) / previous. Rows that have no observation
#   `lag` rows later within the same company get NA.
calculateYoY <- function(df, lag = 4L) {
  stopifnot(is.numeric(lag), length(lag) == 1L, lag >= 1)

  # Strip the currency symbol and thousands separators only when the column
  # is not numeric yet; blindly dropping the first character would corrupt
  # values that are already numbers.
  if (!is.numeric(df$Revenue)) {
    df$Revenue <- as.double(gsub("[$,]", "", as.character(df$Revenue)))
  }

  df$YoY <- rep(NA_real_, nrow(df))

  for (comp in unique(df$Company)) {
    rows <- which(df$Company == comp)
    n_periods <- length(rows) - lag

    # Not enough history for a single comparison: leave YoY as NA. (A plain
    # `1:n_periods` would count downwards here and write wrong values.)
    if (n_periods < 1L) {
      next
    }

    current_rows <- rows[seq_len(n_periods)]
    previous_rows <- rows[lag + seq_len(n_periods)]
    current <- df$Revenue[current_rows]
    previous <- df$Revenue[previous_rows]
    df$YoY[current_rows] <- (current - previous) / previous
  }

  df
}

# Rearrange long-format company data into side-by-side blocks.
#
# Args:
#   df: data.frame with the columns `Company`, `Date`, `Revenue` and `YoY`,
#       e.g. the result of calculateYoY().
#
# Returns:
#   A data.frame with one row per distinct date (in order of first
#   appearance in `df`, also used as row names) and, for every company, a
#   block of four columns named Company, Date, Revenue and YoY. Column names
#   therefore repeat once per company. Dates for which a company has no
#   observation get NA in that company's Revenue and YoY columns.
proccessData <- function(df) {
  required <- c("Company", "Date", "Revenue", "YoY")
  stopifnot(is.data.frame(df), all(required %in% names(df)))

  companies <- as.character(unique(df$Company))
  dates <- unique(df$Date)
  date_keys <- as.character(dates)

  # Build the four-column block of one company, aligned on `dates`.
  build_block <- function(comp) {
    company_rows <- df[which(df$Company == comp), , drop = FALSE]
    # match() yields NA for dates the company lacks, which turns into NA
    # values when used as an index; it also picks the first row if a date
    # is duplicated instead of failing.
    idx <- match(date_keys, as.character(company_rows$Date))
    data.frame(
      Company = rep(comp, times = length(dates)),
      Date = dates,
      Revenue = company_rows$Revenue[idx],
      YoY = company_rows$YoY[idx],
      stringsAsFactors = FALSE
    )
  }

  # A zero-column frame carrying the dates as row names; cbind() takes the
  # row names of the result from it.
  skeleton <- data.frame(row.names = date_keys)
  blocks <- lapply(companies, build_block)
  do.call(cbind, c(list(skeleton), blocks))
}

它是以"命令式"风格编写的,但可以正常工作,并且是一个不错的起点,可以在此基础上改写成更"函数式"的形式。也许使用reshape + plyr,脚本会更短。

(该脚本假定在每个公司的子集中,数据按日期排序。收入是文本字段。)