如何使用R测试几个时间序列之间的差异

时间:2012-02-07 17:27:54

标签: r time-series statistics

我有很多时间序列,每个都代表一种植物物种。我认为有一种模式取决于木质密度。 高木质密度物种在雨季之间开花。 低木质密度物种在任何时期开花。

对于许多物种的时间序列和木质密度的测量,我如何用R模型来证明这种模式?

以下是数据的示例:

#Woody Density
set.seed(69)
wden<-round(c(rnorm(5,mean=50),rnorm(5,mean=90)))
names(wden)<-c(paste("sp",1:10,sep=""))
wden

#Chuva
rain<-c(150,100,50,40,20,20,30,50,70,100,150,200,150,100,50,30,20,20,40,50,70,100,150,200)


#Flowering measures
ydet<-c(10,10,10,10,20,40,50,40,20,10,10,10)

#2 years for 5 low woody density and 5 high density species
flowering<-matrix(NA,nrow=24, ncol=10,dimnames=list(paste("month",1:24,sep=""),paste("sp",1:10,sep="")))
for (i in 1:5) {
               flowering[,i]<-round(c(ydet+rnorm(12,mean=5,sd=5),ydet+rnorm(12,mean=5,sd=5)),digits=2)
               }
for (i in 6:10) {
               flowering[,i]<-round(c(rnorm(12,mean=30,sd=5),rnorm(12,mean=30,sd=5)),digits=2)
               }
#Changing objects to Time series
flowering<-ts(flowering)
#Plot series
plot(flowering)

#Making colors for wood density
cores<-heat.colors(10,alpha=1)
matplot(c(1:24),flowering,type="l",lwd=2,lty=1,xlab="Time",ylab="Flowering",col=cores[order(wden)])

#Plotting Rain Together with time series
bargraph<-barplot(rain/max(rain)*100,xlab="Time",ylab="Rain")
matlines(bargraph,flowering,type="l",lwd=2,lty=1,xlab="Time",ylab="Flowering",col=cores[order(wden)])
axis(1,at=bargraph,labels=1:24)
axis(4,at=seq(0,100,by=10))

2 个答案:

答案 0 :(得分:2)

我可能会建议您在http://stats.stackexchange.comr-sig-ecology@r-project.org邮件列表上尝试此操作。它只是一堆蠕虫。根本问题在于,很难证明两个时间序列的关联是因果关系,因为(特别是当两者都随时间波动时),它们很容易在大约同一时期波动因此看起来排成一列(这个经典的例子是太阳黑子周期和各种完全不相关的时间序列,如纽约证券交易所)。

对此的经典方法是&#34;白化&#34;每个时间序列(您可以拟合周期样条或正弦模型或仅从季节平均值中获取模式的差异),直到每个时间序列与白噪声无法区分,然后检查时间序列之间的互相关(at滞后零,即常规相关,或在其他滞后以表示前导/后续模式)。在你的情况下,你可能会想要看到交叉关系如何随木材密度而变化。

或者,您可以将数据归入&#34;雨季&#34;和#34;旱季&#34;并且做一个更标准的分析(你通过集总来摆脱大多数相关性),但是(1)有一个先验季节划分而不是通过查看数据; (2)你可能会失去一些力量和/或有趣的精细尺度模式; (3)一些基本的推论问题仍然存在 - 与雨有关的开花,还是只有多雨的季节

很好的例子。

答案 1 :(得分:0)

显然这已经过期了,但在过去的几年里,在使用自相关数据方面取得了一些非常大的进步。我目前正在做同样的事情,只有二项式和错误分类错误(去大或回家吧?)。无论如何,我使用过MRSea Package和Pirotta等人的Scott-Hawyard代码。 2011年确定自相关时态数据的重要预测因子。以下是我如何处理它。我编写了一个自定义模型拟合函数,该函数缓慢(以帮助避免滥用和滥用)以识别有意义的预测变量(通过向后QIC选择)以及进行重复Walds测试以确定重要性的预测变量。在这种情况下,森林类型确实很重要而物种则不然。

对于我使用物种的阻断结构,它可能比你需要的更保守(森林类型可能就足够了)但我倾向于在我的选择中保守。希望这有所帮助,如果您使用此代码,请参阅Pirotta等2011和MRSea软件包(可在CREEM网站上获得)。

另请注意,anova功能可能不合适。 Pirotta使用了geepack的anova.gee函数,但是这个函数似乎已经从新版本中消失了,我还没有找到替代品。建议欢迎。

### Functions  

# This code  calculates do backwards stepwise QIC to select the best model,
# then repeated Walds tests to retain meaningful variables and CalcAUC to caluclate
# the area under the ROC curve for each fitted model. All code based on
# Pirotta E, Matthiopoulos J, MacKenzie M, Scott-Hayward L, Rendell L Modelling sperm whale habitat preference: a novel approach combining transect and follow data
library(geepack)
library(splines)
library(AUC)
library(PresenceAbsence)
library(ROCR)            # to build the ROC curve
library(mvtnorm)         # for rmvnorm used in predictions/plotting


SelectModel=function(ModelFull){

  # Calculate the QIC of the full model
  fullmodQ=QIC(ModelFull)
  newQIC=0
  terms=attr(ModelFull$terms,"term.labels")


  while(newQIC != fullmodQ & length(terms)>1){


    # get all the terms for the full model
    terms <- attr(ModelFull$terms,"term.labels")
    n=length(terms)

    newmodel=list()
    newQIC=list()

    newmodel[[1]]=ModelFull
    newQIC[[1]]=fullmodQ

    # Make n models with selection
    for (ii in 1:n){
      dropvar=terms[ii]
      newTerms <- terms[-match(dropvar,terms)]
      newform <- as.formula(paste(".~.-",dropvar))
      newmodel[[ii+1]] <- update(ModelFull,newform)
      newQIC[[ii+1]] =QIC(newmodel[[ii]])

    }

    # Get the model with the lowest QIC
    LowestMod=which.min(unlist(newQIC))

    if (LowestMod != 1){
      ModelFull=newmodel[[LowestMod]]
      newQIC=min(unlist(newQIC))
    } else {
      ModelFull=ModelFull
      newQIC=min(unlist(newQIC))
    }


    #end the model selection


  }
  return(ModelFull)

}

DropVarsWalds=function(ModelFull){

  # If no terms included return 
  if (length(attr(ModelFull$terms,"term.labels"))<2){
    NewModel='No Covariates to select from'

  }else{


    OldModel=ModelFull
    # Get the anova values
    temp=anova(ModelFull)

    # Make n models with selection
    while(length(which(temp$`P(>|Chi|)`>.05))>0 & is.data.frame(temp)){


      # get the maximum value
      dropvar=rownames(temp)[which.max(temp$`P(>|Chi|)`)]

      # new formula for the full model
      newform <- as.formula(paste(".~.-",dropvar))

      # new full model
      ModelFull= update(ModelFull,newform) 

      # Get the model covariate names
      terms <- attr(ModelFull$terms,"term.labels")

      # # Get the anova values
      # temp=anova(ModelFull)

      temp=tryCatch({anova(ModelFull)}, error=function(e){e})


    }

    NewModel=ModelFull
  }

  return(NewModel)
}


# functions to produce partial plots (largely based on MRSea, Scott-Hawyard)
partialDF=function(mod, data, Variable){

  coefpos <- c(1, grep(Variable, colnames(model.matrix(mod))))
  xvals <- data[, which(names(data) == Variable)]
  newX <- seq(min(xvals), max(xvals), length = 500)
  eval(parse(text = paste(Variable, "<- newX", 
                          sep = "")))
  response <- rep(1, 500)
  newBasis <- eval(parse(text = labels(terms(mod))[grep(Variable, 
                                                        labels(terms(mod)))]))
  partialfit <- cbind(rep(1, 500), newBasis) %*% coef(mod)[coefpos]
  rcoefs <- NULL
  try(rcoefs <- rmvnorm(1000, coef(mod), summary(mod)$cov.scaled), 
      silent = T)
  if (is.null(rcoefs) || length(which(is.na(rcoefs) == 
                                      T)) > 0) {
    rcoefs <- rmvnorm(1000, coef(mod), as.matrix(nearPD(summary(mod)$cov.scaled)$mat))
  }
  rpreds <- cbind(rep(1, 500), newBasis) %*% t(rcoefs[, 
                                                      coefpos])
  quant.func <- function(x) {
    quantile(x, probs = c(0.025, 0.975))
  }
  cis <- t(apply(rpreds, 1, quant.func))

  partialfit <- mod$family$linkinv(partialfit)
  cis <- mod$family$linkinv(cis)

  fitdf=data.frame(x=newX, y=partialfit, LCI=cis[,1], UCI=cis[,2]) 
  colnames(fitdf)[1]=Variable
  return(fitdf)
}

partialdf_factor=function(mod, data, variable){
  coeffac <- c(1,grep(variable, colnames(model.matrix(mod))))
  coefradial <- c(grep("LocalRadialFunction", colnames(model.matrix(mod))))
  coefpos <- coeffac[which(is.na(match(coeffac, coefradial)))]
  xvals <- data[, which(names(data) == variable)]
  newX <- sort(unique(xvals))
  newX <- newX[2:length(newX)]
  partialfit <- coef(mod)[c(coefpos)]
  rcoefs <- NULL
  try(rcoefs <- rmvnorm(1000, coef(mod), summary(mod)$cov.scaled), 
      silent = T)
  if (is.null(rcoefs) || length(which(is.na(rcoefs) == T)) > 0) {
    rcoefs <- rmvnorm(1000, coef(mod), as.matrix(nearPD(summary(mod)$cov.scaled)$mat))
  }

  if((length(coefpos))>1){

    rpreds <- as.data.frame(rcoefs[, c(coefpos)])
    fitdf_year=data.frame(vals=rpreds[,1])
    fitdf_year$FactorVariable=as.factor(paste(variable, levels(xvals)[1], sep = ''))

    ############################
    # Recompile for plotting #
    #############################


    for(jj in 2:ncol(rpreds)){
      temp=data.frame(vals=rpreds[,jj])
      temp$FactorVariable=as.factor(colnames(rpreds)[jj])
      fitdf_year=rbind(fitdf_year, temp)
      rm(temp)
    }

  }else{

    rpreds <- rcoefs[,coefpos]
    fitdf_year=data.frame(vals=rpreds)
    fitdf_year$FactorVariable=colnames(model.matrix(mod))[coefpos[2:length(coefpos)]]

  }

  fitdf=fitdf_year
  colnames(fitdf)[2]=variable

  return(fitdf)


}


#Reshape your data

temp=data.frame(flowering)
flowering=data.frame(y=temp[,1], spp=colnames(temp)[1])

for(ii in 2:ncol(temp)){

  flowering=rbind(flowering, data.frame(y=temp[,ii], spp=colnames(temp)[ii]))

}

flowering$rain=rep(rain, ncol(temp))
flowering$woods=as.factor(c(rep('dense',nrow(temp)*5), rep('light',nrow(temp)*5)))

fulmod=geeglm(formula=y~bs(rain)+woods, 
              family = gaussian, 
              id=spp, 
              data=flowering, 
              corstr = 'ar1')

# Use stepwise QIC to select the best model
QICmod=SelectModel(fulmod)

# Use repeated walds to select signficinat variables
sig_mod=DropVarsWalds(ModelFull = QICmod)