在R中将观测数据扩展为连续的时间序列

时间:2013-04-04 11:27:06

标签: r time-series missing-data

我有一个包含员工横截面工资数据的数据集,希望把它转换成一个大的不间断时间序列,并在此过程中推算(插补)缺失的值。假设我有:

# Sample cross-sectional salary records: one row per (employee, observation date).
name <- c("carl", "carl", "bob", "rick", "rick", "rick", "rick")
sex <- rep("M", times = 7)
salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
date <- as.Date(c(
  "2007-04-30", "2007-07-30", "2009-12-09", "2006-01-01",
  "2008-01-01", "2009-12-09", "2010-01-01"
))

# Assemble the columns into a single data frame and display it.
salaries <- data.frame(name = name, sex = sex, salary = salary, date = date)
salaries
  name sex salary       date
  carl   M  18000 2007-04-30
  carl   M  14000 2007-07-30
   bob   M  34000 2009-12-09
  rick   M  11000 2006-01-01
  rick   M  23000 2008-01-01
  rick   M  23000 2009-12-09
  rick   M  25000 2010-01-01

正如我们所看到的,可怜的卡尔在7月份被减薪了4k。在此之前,他的收入是18k。在他被减薪之前的3个月里一直是这个数,但我的数据并没有体现这一点。我想画一张图来展示这种趋势,但首先需要把数据整理成下面这样(其中*表示推算值):

head(salaries)
  name sex salary       date change
  carl   M  18000 2007-04-30    0
  carl   M  18000 2007-05-30*   0 
  carl   M  18000 2007-06-30*   0 
  carl   M  14000 2007-07-30    1
   bob   M  34000 2009-12-09    0 
  rick   M  11000 2006-01-01    0
  rick   M  11000 2006-02-01*   0
  ...   .. ....... ...... ....
  rick   M  11000 2007-12-01*   0
  rick   M  23000 2008-01-01    1
  rick   M  23000 2008-02-01*   1
  ....   ...... ...... ........
  rick   M  23000 2009-12-09    1     
  rick   M  25000 2010-01-01    2 

因此,我想对观测值之间缺失的数据进行推算,并在工资发生变化时做标记。像鲍勃这样工资从未变化的人,change 始终保持为0;而里克有多次工资变动,每次变动都会被标记(累加计数),这样我们就能知道变化发生的时间以及这是第几次变化。我只关心以月份作为分析单位,但如果知道如何按天计算也会很有用。

2 个答案:

答案 0 :(得分:2)

如果你有一个时间序列,可以使用na.locf将缺失值替换为最后一个可用值;如果只想在已有值之间进行插值,则可以使用approx。要创建这些单独的时间序列,可以使用dcast和melt在“高”(规范化)格式和“宽”格式之间转换数据。要计算变化次数,可以使用ddply和cumsum。

library(reshape2)
library(plyr)
library(zoo)

# Expand the irregular salary observations in `salaries` onto a monthly grid,
# carry each salary forward between observations, and count salary changes.

# Convert to wide format: one row per observed date, one column per employee
d <- dcast(salaries, date ~ name, value.var = "salary")

# Add all the dates you want (monthly steps spanning the observed range);
# all = TRUE keeps the original observation dates as well
dates <- seq.Date(from = min(d$date), to = max(d$date), by = "month")
d <- merge(d, data.frame(date = dates), all = TRUE)

# Fill in the missing values. Only the salary columns are filled: running
# approx() over `date` as well would silently turn it into a plain number.
salary_cols <- setdiff(names(d), "date")
# If you want the last non-missing value:
#d[salary_cols] <- lapply(d[salary_cols], na.locf, na.rm = FALSE)
# If you only want to interpolate between values (step function; rows outside
# an employee's first/last observation stay NA):
d[salary_cols] <- lapply(
  d[salary_cols],
  function(x) approx(seq_along(x), x, seq_along(x), method = "constant")$y
)

# Convert back to the tall format, dropping the rows that are still NA
d <- melt(
  d,
  id.vars = "date", value.name = "salary", variable.name = "name",
  na.rm = TRUE
)

# Add the number of changes: a running count, per employee, of rows whose
# salary differs from the previous row
d <- ddply(
  d, "name", transform,
  change = cumsum(c(0, diff(salary) != 0))
)

答案 1 :(得分:0)

阐述@Vincent的建议:

        # Same sample records as in the question, plus the office for each observation.
        name <- c("carl", "carl", "bob", "rick", "rick", "rick", "rick")
        sex <- rep("M", times = 7)
        salary <- c(18000, 14000, 34000, 11000, 23000, 23000, 25000)
        office <- c(
          "melbourne", "sydney", "adelaide", "perth",
          "perth", "melbourne", "melbourne"
        )
        date <- as.Date(c(
          "2007-04-30", "2007-07-30", "2009-12-09", "2006-01-01",
          "2008-01-01", "2009-12-09", "2010-01-01"
        ))

        # Assemble the columns into a single data frame and display it.
        salaries <- data.frame(
          name = name, sex = sex, salary = salary, date = date, office = office
        )
        salaries


        library(reshape2)
        library(plyr)
        library(zoo)

使用approx处理数值向量
        # Expand the salary observations onto a monthly grid, carry each salary
        # forward between observations, and count salary changes per employee.

        # Convert to wide format: one row per observed date, one column per employee
        d <- dcast(salaries, date ~ name, value.var = "salary")

        # Add all the dates you want (monthly steps spanning the observed range);
        # all = TRUE keeps the original observation dates as well
        dates <- seq.Date(from = min(d$date), to = max(d$date), by = "month")
        d <- merge(d, data.frame(date = dates), all = TRUE)

        # Fill in the missing values. Only the salary columns are filled: running
        # approx() over `date` as well would silently turn it into a plain number.
        salary_cols <- setdiff(names(d), "date")
        # If you want the last non-missing value:
        #d[salary_cols] <- lapply(d[salary_cols], na.locf, na.rm = FALSE)
        # If you only want to interpolate between values (step function; rows
        # outside an employee's first/last observation stay NA):
        d[salary_cols] <- lapply(
          d[salary_cols],
          function(x) approx(seq_along(x), x, seq_along(x), method = "constant")$y
        )

        # Convert back to the tall format, dropping the rows that are still NA
        d <- melt(
          d,
          id.vars = "date", value.name = "salary", variable.name = "name",
          na.rm = TRUE
        )

        # Add the number of changes: a running count, per employee, of rows whose
        # salary differs from the previous row
        d <- ddply(
          d, "name", transform,
          change = cumsum(c(0, diff(salary) != 0))
        )

使用na.locf转换字符向量

        # Build the per-employee office history on a monthly grid, in tall format.

        # Convert to wide format: one row per observed date, one column per employee
        a <- dcast(salaries, date ~ name, value.var = "office")

        # Add all the dates you want (monthly steps spanning the observed range)
        dates <- seq.Date(from = min(a$date), to = max(a$date), by = "month")
        a <- merge(a, data.frame(date = dates), all = TRUE)

        # Fill in the missing values using na.locf. The last observed office is
        # carried forward so that each filled row pairs with the salary that
        # approx(method = "constant") carries forward for the same date; filling
        # backwards would attach the next office to the previous salary.
        # Rows after an employee's last observation are filled as well; they have
        # no salary counterpart and drop out in the inner merge with `d`.
        office_cols <- setdiff(names(a), "date")
        a[office_cols] <- lapply(a[office_cols], na.locf, na.rm = FALSE)

        # Convert back to the tall format, dropping the rows that are still NA
        a <- melt(
          a,
          id.vars = "date", value.name = "office", variable.name = "name",
          na.rm = TRUE
        )

合并结果

        # Make sure `date` in `d` has class Date so it matches `a`, then keep the
        # rows present in both tables for the same employee and date.
        d[["date"]] <- as.Date(d[["date"]])
        out <- merge(a, d, by = c("name", "date"))