在 data.frame 上使用 apply 进行单位根检验

时间:2015-06-28 10:02:06

标签: r statistics

我正在尝试一次对多个变量进行单位根检验。

我尝试了以下内容:

> library(tseries)
> #Unit Root Test
> data1 <- data.frame(data$Date_Quandl, data$GDP_Quaterly, data$Employment_Rate)
> dput(data1)
structure(list(data.Date_Quandl = structure(c(60L, 30L, 15L, 
45L, 59L, 29L, 14L, 44L, 58L, 28L, 13L, 43L, 57L, 27L, 12L, 42L, 
56L, 26L, 11L, 41L, 55L, 25L, 10L, 40L, 54L, 24L, 9L, 39L, 53L, 
23L, 8L, 38L, 52L, 22L, 7L, 37L, 51L, 21L, 6L, 36L, 50L, 20L, 
5L, 35L, 49L, 19L, 4L, 34L, 48L, 18L, 3L, 33L, 47L, 17L, 2L, 
32L, 46L, 16L, 1L, 31L), .Label = c("30.06.2000", "30.06.2001", 
"30.06.2002", "30.06.2003", "30.06.2004", "30.06.2005", "30.06.2006", 
"30.06.2007", "30.06.2008", "30.06.2009", "30.06.2010", "30.06.2011", 
"30.06.2012", "30.06.2013", "30.06.2014", "30.09.2000", "30.09.2001", 
"30.09.2002", "30.09.2003", "30.09.2004", "30.09.2005", "30.09.2006", 
"30.09.2007", "30.09.2008", "30.09.2009", "30.09.2010", "30.09.2011", 
"30.09.2012", "30.09.2013", "30.09.2014", "31.03.2000", "31.03.2001", 
"31.03.2002", "31.03.2003", "31.03.2004", "31.03.2005", "31.03.2006", 
"31.03.2007", "31.03.2008", "31.03.2009", "31.03.2010", "31.03.2011", 
"31.03.2012", "31.03.2013", "31.03.2014", "31.12.2000", "31.12.2001", 
"31.12.2002", "31.12.2003", "31.12.2004", "31.12.2005", "31.12.2006", 
"31.12.2007", "31.12.2008", "31.12.2009", "31.12.2010", "31.12.2011", 
"31.12.2012", "31.12.2013", "31.12.2014"), class = "factor"), 
    data.GDP_Quaterly = c(17703.7, 17599.8, 17328.2, 17044, 17078.3, 
    16872.3, 16619.2, 16502.4, 16332.5, 16268.9, 16094.7, 15956.5, 
    15785.3, 15587.1, 15460.9, 15238.4, 15230.2, 15057.7, 14888.6, 
    14681.1, 14566.5, 14384.1, 14340.4, 14383.9, 14549.9, 14843, 
    14813, 14668.4, 14685.3, 14569.7, 14422.3, 14233.2, 14066.4, 
    13908.5, 13799.8, 13648.9, 13381.6, 13205.4, 12974.1, 12813.7, 
    12562.2, 12367.7, 12181.4, 11988.4, 11816.8, 11625.1, 11370.7, 
    11230.1, 11103.8, 11037.1, 10934.8, 10834.4, 10701.3, 10639.5, 
    10638.4, 10508.1, 10472.3, 10357.4, 10278.3, 10031), data.Employment_Rate = c(71.0619, 
    70.9383, 71.162, 71.138, 71.2286, 71.5095, 71.565, 71.3246, 
    71.4963, 71.3738, 71.4276, 71.3065, 71.0246, 71.3244, 71.0619, 
    70.9811, 71.2149, 70.8342, 70.5568, 70.5444, 70.3286, 70.179, 
    70.2555, 70.5103, 70.8038, 70.6748, 70.9769, 70.6988, 70.2125, 
    70.1661, 69.6284, 69.5613, 68.9837, 68.8606, 68.4223, 67.963, 
    67.6293, 67.5905, 67.1857, 67.1248, 66.7075, 66.5857, 66.4303, 
    66.2826, 68.7514, 68.8897, 69.0824, 68.9718, 68.7927, 68.6387, 
    68.8053, 68.7286, 68.4141, 68.2357, 68.4785, 68.4171, 68.4782, 
    68.3978, 68.5344, 68.4772)), .Names = c("data.Date_Quandl", 
"data.GDP_Quaterly", "data.Employment_Rate"), row.names = c(NA, 
-60L), class = "data.frame")
> apply(data1,data1[1:2],function(x){ adf.test(x,k=0) })
Error in ds[-MARGIN] : invalid subscript type 'list'
In addition: Warning message:
In Ops.factor(left) : ‘-’ not meaningful factors

apply 函数无法正常工作。

有人能指出我哪里做错了吗?

2 个答案:

答案 0 :(得分:3)

这是你想要获得的吗?

res <- sapply(data1[2:3],function(x){ adf.test(x,k=0) })
#> res
#            data.GDP_Quaterly              data.Employment_Rate          
#statistic   -1.198207                      -1.601795                     
#parameter   0                              0                             
#alternative "stationary"                   "stationary"                  
#p.value     0.8988656                      0.7357338                     
#method      "Augmented Dickey-Fuller Test" "Augmented Dickey-Fuller Test"
#data.name   "x"                            "x"  

P.S.:您确定数据的顺序正确吗?您的数据描述的是一个时间序列,但其值是按逆时间顺序存储的(最近的条目在前)。

#> head(data1)
#  data.Date_Quandl data.GDP_Quaterly data.Employment_Rate
#1       31.12.2014           17703.7              71.0619
#2       30.09.2014           17599.8              70.9383
#3       30.06.2014           17328.2              71.1620
#4       31.03.2014           17044.0              71.1380
#5       31.12.2013           17078.3              71.2286
#6       30.09.2013           16872.3              71.5095

我怀疑逆转此顺序可能是一个好主意。这可以通过以下方式完成:

dat2<- data1[rev(rownames(data1)),]
rownames(dat2) <- c(1:nrow(data1))
res <- sapply(dat2[2:3],function(x){ adf.test(x,k=0) })

现在我们有了

#> head(dat2)
#  data.Date_Quandl data.GDP_Quaterly data.Employment_Rate
#1       31.03.2000           10031.0              68.4772
#2       30.06.2000           10278.3              68.5344
#3       30.09.2000           10357.4              68.3978
#4       31.12.2000           10472.3              68.4782
#5       31.03.2001           10508.1              68.4171
#6       30.06.2001           10638.4              68.4785

并且结果有所不同:

#> res
#            data.GDP_Quaterly              data.Employment_Rate          
#statistic   -1.062353                      -1.835968                     
#parameter   0                              0                             
#alternative "stationary"                   "stationary"                  
#p.value     0.9207886                      0.6410799                     
#method      "Augmented Dickey-Fuller Test" "Augmented Dickey-Fuller Test"
#data.name   "x"                            "x" 

希望这有帮助。

答案 1 :(得分:1)

如果您指定了正确的 MARGIN,apply 就能正常工作。假设您想按列循环:

apply(data1[-1], MARGIN = 2, adf.test, k=0)
#$data.GDP_Quaterly

#   Augmented Dickey-Fuller Test

#data:  newX[, i]
#Dickey-Fuller = -1.1982, Lag order = 0, p-value = 0.8989
#alternative hypothesis: stationary


#$data.Employment_Rate

#   Augmented Dickey-Fuller Test

#data:  newX[, i]
#Dickey-Fuller = -1.6018, Lag order = 0, p-value = 0.7357
# alternative hypothesis: stationary

如果要对顺序反转后的数据进行检验:

 apply(dat2[-1], 2, adf.test, k=0) #@RHertel's data