对于一个回归项目,我使用kmeans通过nbclust创建了数据集群。该项目目前有4个集群,但如果成功,该项目可能涉及分析多个美国城市。尽管我可以手动构建每个回归,但我希望减少代码量,从而减少编码错误的机会。
当前模型使用具有多个子集和10-15个不同自变量的glm。我目前正在手动建立每个聚类回归。
# Copy the partition column into `Cluster`; per the author there are
# currently 4 distinct clusters.
Tulsa$Cluster <- Tulsa$Best.partition

# Gaussian GLM fitted on the rows of cluster "1" only. Columns are referenced
# by bare name through `data = Tulsa` so the model terms are `FactorA`, ...
# rather than `Tulsa$FactorA`, and stepwise refits resolve them in the same
# data frame.
Tulsa_Cluster1 <- glm(
  formula = Result ~ FactorA + FactorB + FactorC + FactorD + FactorE +
    FactorF,
  data = Tulsa, method = "glm.fit", family = gaussian(),
  subset = Cluster == "1"
)
summary(Tulsa_Cluster1)

# Stepwise selection in both directions (stepAIC() is provided by MASS, which
# must be attached). The lower scope is a formula of model terms: `~ FactorA`
# keeps FactorA in every candidate model.
Tulsa_Step <- stepAIC(
  Tulsa_Cluster1,
  direction = "both", trace = FALSE,
  scope = list(lower = ~ FactorA)
)
summary(Tulsa_Step)
我希望通过聚类进行回归,然后可以对其进行总结。
答案 0 :(得分:1)
请考虑使用 `unique`、`levels`、`split` 或 `by` 中的任意一种方法来遍历 Cluster 的唯一值:
`unique` + `for`
# Fit one Gaussian GLM per distinct cluster value, then run stepwise
# selection on it. Both summaries are printed explicitly because values are
# not auto-printed inside a `for` loop.
# The loop variable is `cl` rather than `c` to avoid masking base::c.
for (cl in unique(Tulsa$Cluster)) {
  Tulsa_Cluster <- glm(
    formula = Result ~ FactorA + FactorB + FactorC + FactorD + FactorE +
      FactorF,
    data = Tulsa, method = "glm.fit", family = gaussian(),
    subset = Cluster == cl
  )
  print(summary(Tulsa_Cluster))

  # stepAIC() is provided by MASS, which must be attached. The lower scope
  # must name model terms: `~ FactorA` keeps FactorA in every candidate
  # model (a `$` lookup on the fitted object is not a model term).
  Tulsa_Step <- stepAIC(
    Tulsa_Cluster,
    direction = "both", trace = FALSE,
    scope = list(lower = ~ FactorA)
  )
  print(summary(Tulsa_Step))
}
`levels` + `for`
# Same per-cluster fit, iterating over the factor levels of Cluster.
# levels() yields character labels; `Cluster == cl` compares each row's
# cluster against the current label.
# The loop variable is `cl` rather than `c` to avoid masking base::c.
for (cl in levels(factor(Tulsa$Cluster))) {
  Tulsa_Cluster <- glm(
    formula = Result ~ FactorA + FactorB + FactorC + FactorD + FactorE +
      FactorF,
    data = Tulsa, method = "glm.fit", family = gaussian(),
    subset = Cluster == cl
  )
  # Explicit print(): values are not auto-printed inside a `for` loop.
  print(summary(Tulsa_Cluster))

  # stepAIC() is provided by MASS, which must be attached. The lower scope
  # must name model terms: `~ FactorA` keeps FactorA in every candidate
  # model (a `$` lookup on the fitted object is not a model term).
  Tulsa_Step <- stepAIC(
    Tulsa_Cluster,
    direction = "both", trace = FALSE,
    scope = list(lower = ~ FactorA)
  )
  print(summary(Tulsa_Step))
}
下面两种方法会返回按聚类值命名的模型对象列表。
`split` + `lapply`(无需 `subset` 参数)
# Split the data into one data frame per cluster, then fit on each piece.
# Each piece already holds a single cluster, so glm() needs no `subset`.
data_list <- split(Tulsa, Tulsa$Cluster)

# One list element per cluster; each element is list(<full glm>, <stepwise
# glm>).
model_list <- lapply(data_list, function(sub_df) {
  Tulsa_Cluster <- glm(
    formula = Result ~ FactorA + FactorB + FactorC + FactorD + FactorE +
      FactorF,
    data = sub_df, method = "glm.fit", family = gaussian()
  )

  # stepAIC() is provided by MASS, which must be attached. The lower scope
  # must name model terms: `~ FactorA` keeps FactorA in every candidate
  # model (a `$` lookup on the fitted object is not a model term).
  Tulsa_Step <- stepAIC(
    Tulsa_Cluster,
    direction = "both", trace = FALSE,
    scope = list(lower = ~ FactorA)
  )

  # Return both fitted models (not their summaries); call summary() on the
  # elements afterwards as needed.
  list(Tulsa_Cluster, Tulsa_Step)
})
`by`(无需 `subset` 参数)
# by() hands the function one sub-data-frame per value of Tulsa$Cluster, so
# glm() needs no `subset`. Each result is list(<full glm>, <stepwise glm>).
model_list <- by(Tulsa, Tulsa$Cluster, function(sub_df) {
  Tulsa_Cluster <- glm(
    formula = Result ~ FactorA + FactorB + FactorC + FactorD + FactorE +
      FactorF,
    data = sub_df, method = "glm.fit", family = gaussian()
  )

  # stepAIC() is provided by MASS, which must be attached. The lower scope
  # must name model terms: `~ FactorA` keeps FactorA in every candidate
  # model (a `$` lookup on the fitted object is not a model term).
  Tulsa_Step <- stepAIC(
    Tulsa_Cluster,
    direction = "both", trace = FALSE,
    scope = list(lower = ~ FactorA)
  )

  # Return both fitted models (not their summaries); call summary() on the
  # elements afterwards as needed.
  list(Tulsa_Cluster, Tulsa_Step)
})