我有一个包含员工横截面工资数据的数据集,希望据此创建一个完整的、不间断的时间序列,并在此过程中对缺失的值进行推算(插补)。假设我有:
# Example input: one row per observed (employee, date) salary record.
name <- rep(c("carl", "bob", "rick"), times = c(2, 1, 4))
sex <- rep("M", length(name))
salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
date <- as.Date(c(
  "2007-04-30", "2007-07-30",                           # carl
  "2009-12-09",                                         # bob
  "2006-01-01", "2008-01-01", "2009-12-09", "2010-01-01" # rick
))
# Column names are spelled out so they do not depend on the variable names.
salaries <- data.frame(name = name, sex = sex, salary = salary, date = date)
salaries
name sex salary date
carl M 18000 2007-04-30
carl M 14000 2007-07-30
bob M 34000 2009-12-09
rick M 11000 2006-01-01
rick M 23000 2008-01-01
rick M 23000 2009-12-09
rick M 25000 2010-01-01
正如我们所看到的,可怜的卡尔在 7 月份的薪水减少了 4k。在此之前,他的收入是 18k,并且在这次变动之前的 3 个月里一直如此,但我的数据并未反映出这一点。我想画一张漂亮的图来展示这一趋势,但首先需要把数据整理成下面这样(其中 * 表示推算出的行):
head(salaries)
name sex salary date change
carl M 18000 2007-04-30 0
carl M 18000 2007-05-30* 0
carl M 18000 2007-06-30* 0
carl M 14000 2007-07-30 1
bob M 34000 2009-12-09 0
rick M 11000 2006-01-01 0
rick M 11000 2006-02-01* 0
... .. ....... ...... ....
rick M 11000 2007-12-01* 0
rick M 23000 2008-01-01 1
rick M 23000 2008-02-01* 1
.... ...... ...... ........
rick M 23000 2009-12-09 1
rick M 25000 2010-01-01 2
因此,我想对观测值之间缺失的月份进行推算,并在工资发生变化时进行标记。像鲍勃这样工资从未变化的人,change 始终保持为 0;而像里克这样多次变动工资的人,每次变动都会被标记(计数加 1),这样我们就能知道变化发生的时间和次数。我只对以月份作为分析单位感兴趣,但如果能知道如何按天计算也会很有用。
答案 0 :(得分:2)
如果你已经有一个时间序列,可以使用 na.locf 将缺失值替换为最后一个可用值;如果只想在已有的值之间进行插值,可以使用 approx。
要创建这些单独的时间序列,可以使用 dcast 和 melt 在“长”(规范化)格式与“宽”格式之间转换数据。
要计算变化次数,可以使用 ddply 和 cumsum。
library(reshape2)
library(plyr)
library(zoo)
# Build a monthly salary series per employee from the sparse `salaries` table
# and count, per employee, how many salary changes have occurred so far.

# Convert to wide format: one row per date, one salary column per employee
d <- dcast(salaries, date ~ name, value.var = "salary")

# Add all the dates you want: a monthly grid spanning the observed range.
# all = TRUE keeps both the observed dates and the grid dates.
dates <- seq(from = min(d$date), to = max(d$date), by = "month")
d <- merge(d, data.frame(date = dates), all = TRUE)

# Fill in the missing values. Only the salary columns are filled: running
# approx() over `date` as well would turn the dates into plain numbers.
value_cols <- setdiff(names(d), "date")
# If you want the last non-missing value:
# d[value_cols] <- lapply(d[value_cols], na.locf, na.rm = FALSE)
# If you only want to interpolate between values:
# method = "constant" repeats the previous observed value; rows outside an
# employee's first/last observation stay NA and are dropped by melt() below.
d[value_cols] <- lapply(
  d[value_cols],
  function(x) approx(seq_along(x), x, seq_along(x), method = "constant")$y
)

# Convert back to the tall format, dropping the rows that are still NA
d <- melt(
  d,
  id.vars = "date",
  value.name = "salary",
  variable.name = "name",
  na.rm = TRUE
)

# Add the number of changes: within each employee, the running count of rows
# whose salary differs from the previous row (0 until the first change)
d <- ddply(
  d, "name", transform,
  change = cumsum(c(0, diff(salary) != 0))
)
答案 1 :(得分:0)
阐述@Vincent的建议:
# Example input extended with an `office` column (a character variable).
name <- rep(c("carl", "bob", "rick"), times = c(2, 1, 4))
sex <- rep("M", length(name))
salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
office <- c(
  "melbourne", "sydney",                       # carl
  "adelaide",                                  # bob
  "perth", "perth", "melbourne", "melbourne"   # rick
)
date <- as.Date(c(
  "2007-04-30", "2007-07-30",                           # carl
  "2009-12-09",                                         # bob
  "2006-01-01", "2008-01-01", "2009-12-09", "2010-01-01" # rick
))
# Column names are spelled out so they do not depend on the variable names.
salaries <- data.frame(
  name = name,
  sex = sex,
  salary = salary,
  date = date,
  office = office
)
salaries
library(reshape2)
library(plyr)
library(zoo)
使用 approx 处理数值向量
# Convert to wide format
# Monthly salary series per employee, with a running count of salary changes.
# Wide layout: one row per date, one salary column per employee.
d <- dcast(salaries, date ~ name, value.var = "salary")

# Add all the dates you want: a monthly grid spanning the observed range.
# all = TRUE keeps both the observed dates and the grid dates.
dates <- seq(from = min(d$date), to = max(d$date), by = "month")
d <- merge(d, data.frame(date = dates), all = TRUE)

# Fill in the missing values. The `date` column is left out on purpose:
# approx() returns plain numbers, so filling it too would strip the Date class.
salary_cols <- setdiff(names(d), "date")
# If you want the next non-missing value carried backwards:
# d[salary_cols] <- lapply(
#   d[salary_cols], na.locf, na.rm = FALSE, fromLast = TRUE
# )
# If you only want to interpolate between values:
# method = "constant" repeats the previous observed value; rows outside an
# employee's first/last observation stay NA and are dropped by melt() below.
d[salary_cols] <- lapply(
  d[salary_cols],
  function(x) approx(seq_along(x), x, seq_along(x), method = "constant")$y
)

# Convert back to the tall format, dropping the rows that are still NA
d <- melt(
  d,
  id.vars = "date",
  value.name = "salary",
  variable.name = "name",
  na.rm = TRUE
)

# Add the number of changes: within each employee, the running count of rows
# whose salary differs from the previous row (0 until the first change)
d <- ddply(
  d, "name", transform,
  change = cumsum(c(0, diff(salary) != 0))
)
使用na.locf转换字符向量
# Monthly office series per employee. `office` is a character variable, so
# the gaps are filled with na.locf() rather than approx().

# Convert to wide format: one row per date, one office column per employee
a <- dcast(salaries, date ~ name, value.var = "office")

# Add all the dates you want: a monthly grid spanning the observed range
dates <- seq(from = min(a$date), to = max(a$date), by = "month")
a <- merge(a, data.frame(date = dates), all = TRUE)

# Fill in the missing values using na.locf (the date column has no gaps and
# is left untouched).
# NOTE(review): fromLast = TRUE fills each gap with the NEXT observed office,
# whereas approx(method = "constant") on salary repeats the PREVIOUS value.
# Use fromLast = FALSE if office should be carried forward like salary.
office_cols <- setdiff(names(a), "date")
a[office_cols] <- lapply(
  a[office_cols], na.locf,
  na.rm = FALSE, fromLast = TRUE
)

# Convert back to the tall format, dropping the rows that are still NA
a <- melt(
  a,
  id.vars = "date",
  value.name = "office",
  variable.name = "name",
  na.rm = TRUE
)
合并结果
# Make sure `date` is a Date before joining: it is a plain number if it was
# passed through approx(). The explicit origin is required for numeric input
# on R < 4.3 and is ignored when the column is already a Date.
d$date <- as.Date(d$date, origin = "1970-01-01")
# Inner join of the office and salary series on employee and date
out <- merge(a, d, by = c("name", "date"))