我正在使用quantmod处理IBM的数据集。我创建了两个变量,然后使用glm函数来查看它们之间的关系。代码运行良好,但后来我注意到数据框的一部分包含NA。我怎样才能解决这个问题?
这是我的代码:
library(quantmod)

# Fetch IBM daily prices; the code below relies on getSymbols() having
# created the object `IBM`.
getSymbols("IBM")
dim(IBM)

# Indicator: 1 when the day's close is above its open, 0 otherwise.
IBM$CurrtDay_up <- ifelse(IBM[, "IBM.Close"] > IBM[, "IBM.Open"], 1, 0)

# Same indicator for the previous row (lag of one period); the first row has
# no predecessor, so it comes out as NA.
IBM$LastDay_green <- ifelse(
  lag(IBM[, "IBM.Close"], k = 1) > lag(IBM[, "IBM.Open"], k = 1),
  1,
  0
)
head(IBM)
IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted CurrtDay_up LastDay_green
2007-01-03 97.18 98.40 96.26 97.27 9196800 82.78498 1 NA
2007-01-04 97.25 98.79 96.88 98.31 10524500 83.67011 1 1
2007-01-05 97.60 97.95 96.91 97.42 7221300 82.91264 0 1
2007-01-08 98.50 99.50 98.35 98.90 10340000 84.17225 1 0
2007-01-09 99.08 100.33 99.07 100.07 11108200 85.16802 1 1
2007-01-10 98.50 99.05 97.93 98.89 8744800 84.16374 1 1
然后我添加了glm函数:
# Take rows 3 through 1000 of IBM as the modelling sample.
# NOTE(review): the remark on the next line says the goal is to skip the first
# row's NA, but starting at row 3 drops the first two rows -- confirm whether
# 2:1000 was intended.
IBM_1 <- IBM[3:1000,] # to avoid the first row's NA.
# Logistic regression (binomial family, logit link) of the current-day
# indicator on the previous-day indicator.
glm_greenDay <- glm(CurrtDay_up~LastDay_green,data=IBM_1,family=binomial(link='logit'))
# Store the predictions on the response scale as a new column of IBM_1.
# NOTE(review): the head() output that follows shows the predictions landing
# on separate, otherwise-NA rows instead of next to the existing data;
# presumably the result of predict() is not date-aligned with IBM_1 when
# assigned this way -- verify.
IBM_1$glm_pred<-predict(glm_greenDay,newdata=IBM_1,type='response')
head(IBM_1)
IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted CurrtDay_up LastDay_green glm_pred
2007-01-04 NA NA NA NA NA NA NA NA 0.5683453
2007-01-05 97.60 97.95 96.91 97.42 7221300 82.91264 0 1 NA
2007-01-07 NA NA NA NA NA NA NA NA 0.5407240
2007-01-08 98.50 99.50 98.35 98.90 10340000 84.17225 1 0 NA
2007-01-08 NA NA NA NA NA NA NA NA 0.5683453
2007-01-09 99.08 100.33 99.07 100.07 11108200 85.16802 1 1 NA
更新的代码(请注意,有一行(第2行)被重复了):
# Keep only the rows flagged as NA-free by complete.cases().
# NOTE(review): the mask is computed on IBM[1:1000,] but is used to index the
# whole of IBM -- confirm that the lengths line up as intended.
IBM_1<-IBM[complete.cases(IBM[1:1000,]),] # to avoid the first row's NA.
# Logistic regression (binomial family, logit link) of the current-day
# indicator on the previous-day indicator.
glm_greenDay<-glm(CurrtDay_up~LastDay_green,data=IBM_1,family=binomial(link='logit'))
# Use the fitted values stored on the model object rather than calling
# predict(), and assign them as a new column of IBM_1.
# NOTE(review): the head() output that follows still shows the values on
# separate, otherwise-NA rows, so this assignment does not appear to align by
# date either -- verify.
IBM_1$glm_pred<-glm_greenDay$fitted.values
head(IBM_1)
IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted CurrtDay_up LastDay_green glm_pred
2007-01-03 NA NA NA NA NA NA NA NA 0.5691203
2007-01-04 97.25 98.79 96.88 98.31 10524500 83.67011 1 1 NA
2007-01-04 NA NA NA NA NA NA NA NA 0.5691203
2007-01-05 97.60 97.95 96.91 97.42 7221300 82.91264 0 1 NA
2007-01-07 NA NA NA NA NA NA NA NA 0.5407240
2007-01-08 98.50 99.50 98.35 98.90 10340000 84.17225 1 0 NA
答案 0 :(得分:1)
问题可能出在您构建最终数据框的方式,以及R处理NA的方式上。
按我对您代码的理解,您是把结果作为一列直接添加到数据框中:
# Line quoted from the question: the model's fitted values are assigned
# directly into IBM_1 as a new column.
IBM_1$glm_pred<-glm_greenDay$fitted.values
您可以将结果放到一个单独的对象中,然后使用cbind将其附加到数据框的其余部分,这样NA就不会跨列传播。
也许可以这样:
# Copy the fitted values into a standalone one-column matrix, separate from
# IBM_1.
glm_pred <- matrix(glm_greenDay[["fitted.values"]], ncol = 1)

# Bind the new column onto IBM_1 instead of assigning it with `$<-`.
IBM_glm <- cbind(IBM_1, glm_pred)
不知道它是否最优雅,但可能是一个开始。
答案 1 :(得分:1)
出现这个问题是因为predict()的输出不是xts类的对象。预测值向量中的各个元素以日期作为名称,但该向量仍然只是一个没有时间索引的普通向量。通过先将predict()的输出转换为xts类,我只需简单地调用merge()就能得到正确结果,而无需在建模之前删除NA:
library(quantmod)

# Fetch IBM daily prices; the code below relies on getSymbols() having
# created the object `IBM`.
getSymbols("IBM")

# Indicator: 1 when the day's close is above its open, 0 otherwise.
IBM$CurrtDay_up <- ifelse(IBM[, "IBM.Close"] > IBM[, "IBM.Open"], 1, 0)

# Same indicator for the previous row (lag of one period).
IBM$LastDay_green <- ifelse(
  lag(IBM[, "IBM.Close"], k = 1) > lag(IBM[, "IBM.Open"], k = 1),
  1,
  0
)

# Fit on the full series; na.exclude makes predict() return NA for rows with
# missing inputs instead of dropping them.
glm_greenDay <- glm(
  CurrtDay_up ~ LastDay_green,
  data = IBM,
  family = binomial(link = "logit"),
  na.action = na.exclude
)
glm_pred <- predict(glm_greenDay, type = "response")

# Rebuild the predictions as an xts object indexed by the dates held in the
# vector's names, so that merge() can align them with IBM by date.
glm_pred_xts <- xts(x = glm_pred, order.by = as.Date(names(glm_pred)))
IBM2 <- merge(IBM, glm_pred_xts)
这似乎产生了所需的输出:
> head(glm_pred)
2007-01-03 2007-01-04 2007-01-05 2007-01-08 2007-01-09 2007-01-10
NA 0.5383952 0.5383952 0.5383065 0.5383952 0.5383952
> head(IBM2)
IBM.Open IBM.High IBM.Low IBM.Close IBM.Volume IBM.Adjusted CurrtDay_up LastDay_green glm_pred_xts
2007-01-03 97.18 98.40 96.26 97.27 9196800 82.78498 1 NA NA
2007-01-04 97.25 98.79 96.88 98.31 10524500 83.67011 1 1 0.5383952
2007-01-05 97.60 97.95 96.91 97.42 7221300 82.91264 0 1 0.5383952
2007-01-08 98.50 99.50 98.35 98.90 10340000 84.17225 1 0 0.5383065
2007-01-09 99.08 100.33 99.07 100.07 11108200 85.16802 1 1 0.5383952
2007-01-10 98.50 99.05 97.93 98.89 8744800 84.16374 1 1 0.5383952