我在R中有一个像这样的数据框:
Company | Date | Revenues
c1 | d1 | r1
c1 | d2 | r2
c1 | d3 | r3
c2 | d1 | r4
c2 | d2 | r5
c2 | d3 | r6
c3 | d2 | r7
c3 | d3 | r8
我想将格式更改为:
Company|Date|Revs|YOY|Company|Date|Revs|YOY|Company|Date|Revenues|YOY
c1 |d1 |r1 |y1 | c2 |d1 |r4 |y4 | c3 |NA |NA |NA
c1 |d2 |r2 |y2 | c2 |d2 |r5 |y5 | c3 |d2 |r7 |y7
c1 |d3 |r3 |y3 | c2 |d3 |r6 |y6 | c3 |d3 |r8 |y8
也就是说,我想更改数据框,以便将每组公司数据分组并相互设置,并计算收入的年度差异,并在收入后插入新列柱。
观察结果是每个季度/月,但有些数据缺失,因此我希望将NA插入到缺失的字段中,并按照第二个表中的指示排列日期。
我包含了一些代码,但经过大量搜索,我仍然不知道如何处理这个问题。我看过重塑,但这似乎没有做我需要做的事情。我想也许循环是解决这个问题的正确方法吗?
以下是来自dput的一组示例输入数据,其中包含一些缺失值:
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 7L, 54L, 55L,
56L, 57L, 58L, 59L, 60L, 61L, 82L, 83L, 84L, 85L, 86L, 87L, 88L,
89L, 596L, 597L, 598L, 599L, 600L, 601L), Company = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("Company 1",
"Company 2", "Company 3", "Company 4"), class = "factor"), Date = structure(c(8L,
7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 8L,
7L, 6L, 5L, 4L, 3L, 2L, 1L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1",
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3",
"2015-Q4"), class = "factor"), Revenue = structure(c(16L, 11L,
12L, 7L, 10L, 14L, 9L, 8L, 15L, 2L, 4L, 1L, 30L, 6L, 3L, 5L,
13L, 26L, 27L, 21L, 23L, 24L, 25L, 22L, 29L, 19L, 20L, 17L, 28L,
18L), .Label = c("$1,009,306", "$1,129,899", "$1,173,698", "$1,290,262",
"$1,329,210", "$1,338,107", "$1,342,401", "$1,455,988", "$1,632,968",
"$1,697,235", "$1,716,398", "$1,756,648", "$17,215,900", "$2,405,874",
"$216,536", "$422,063", "$47,665,398", "$52,772,667", "$53,941,124",
"$54,059,612", "$54,548,057", "$54,946,768", "$55,735,568", "$58,099,615",
"$59,753,619", "$59,955,413", "$60,655,988", "$66,236,339", "$79,135,033",
"$962,366"), class = "factor")), .Names = c("X", "Company", "Date",
"Revenue"), class = "data.frame", row.names = c(NA, -30L))
以下是转换后我想要的数据:
structure(list(Company = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "Company 1", class = "factor"), Date = structure(c(8L,
7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1", "2014-Q2",
"2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3", "2015-Q4"
), class = "factor"), Revenue = structure(c(8L, 5L, 6L, 1L, 4L,
7L, 3L, 2L), .Label = c("$134", "$146", "$163", "$170", "$172",
"$176", "$241", "$42"), class = "factor"), YOY = structure(c(2L,
1L, 4L, 3L, NA, NA, NA, NA), .Label = c("-29%", "-75%", "-8%",
"8%"), class = "factor"), Company.1 = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), .Label = "Company 2", class = "factor"),
Date.1 = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1",
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3",
"2015-Q4"), class = "factor"), Revenue.1 = structure(c(7L,
2L, 4L, 1L, 8L, 6L, 3L, 5L), .Label = c("$101", "$113", "$117",
"$129", "$133", "$134", "$22", "$96"), class = "factor"),
YOY.1 = structure(c(3L, 1L, 4L, 2L, NA, NA, NA, NA), .Label = c("-16%",
"-24%", "-77%", "10%"), class = "factor"), Company.2 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Company 3", class = "factor"),
Date.2 = structure(c(8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1",
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2", "2015-Q3",
"2015-Q4"), class = "factor"), Revenue.2 = structure(c(1L,
7L, 8L, 2L, 4L, 5L, 6L, 3L), .Label = c("$1,722", "$5,455",
"$5,495", "$5,574", "$5,810", "$5,975", "$5,996", "$6,066"
), class = "factor"), YOY.2 = structure(c(2L, 4L, 3L, 1L,
NA, NA, NA, NA), .Label = c("-1%", "-69%", "2%", "3%"), class = "factor"),
Company.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Company 4", class = "factor"),
Date.3 = structure(c(NA, NA, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("2014-Q1",
"2014-Q2", "2014-Q3", "2014-Q4", "2015-Q1", "2015-Q2"), class = "factor"),
Revenue.3 = structure(c(NA, NA, 6L, 3L, 4L, 1L, 5L, 2L), .Label = c("$4,767",
"$5,277", "$5,394", "$5,406", "$6,624", "$7,914"), class = "factor"),
YOY.3 = structure(c(NA, NA, 1L, 2L, NA, NA, NA, NA), .Label = c("19%",
"2%"), class = "factor"), X = c(NA, NA, NA, NA, NA, NA, NA,
NA)), .Names = c("Company", "Date", "Revenue", "YOY", "Company.1",
"Date.1", "Revenue.1", "YOY.1", "Company.2", "Date.2", "Revenue.2",
"YOY.2", "Company.3", "Date.3", "Revenue.3", "YOY.3", "X"), class = "data.frame", row.names = c(NA,
-8L))
谢谢!
答案 0 :(得分:0)
这是一个肮脏的解决方案:
# How to use:
# output <- proccessData(calculateYoY(realData))
# Function for calculating YoY
calculateYoY <- function(df){
df$Revenue <- as.double(gsub(",", "",
substring(
as.character(
df$Revenue), 2)))
df$YoY <- rep(NA, times=length(df[,1]))
companies <- unique(df$Company)
for (comp in companies){
auxdf <- df[which(df$Company==comp),]
periods <- length(auxdf[,1]) - 4
for (per in (1:periods)){
auxdf[per,5] <- (auxdf[per,4] - auxdf[per + 4,4])/auxdf[per + 4,4]
}
df[which(df$Company==comp),] <- auxdf
}
df
}
# Function for generating the new data.frame
proccessData <- function(df){
companies <- unique(df$Company)
dates <- unique(df$Date)
resultdf <- data.frame(row.names=dates)
dummyVector = rep(NA, times=length(dates))
comp <- "Company 4"
for (comp in companies){
auxdf <- df[which(df$Company==comp),2:5]
dummydf <- data.frame(Company=rep(as.character(comp),
times=length(dates)),
Date=dates, Revenue=dummyVector, YoY=dummyVector)
for (dat in as.character(auxdf$Date)){
dummydf[which(dummydf$Date==dat),] <- auxdf[which(auxdf$Date==dat),]
}
resultdf <- cbind(resultdf, dummydf)
}
resultdf
}
它在&#34;命令式&#34;格式,但工作,它是一个很好的起点,以使更多的东西&#34;功能&#34;。也许使用reshape + plyr脚本会更短。
(该脚本假定在每个公司的子集中,数据按日期排序。收入是文本字段。)