我目前正在使用2个单独的CSV数据集。我已经使用了名为PRICEtable4.1的第一个数据集来可视化x值(GBA)和y值(PRICE)之间的关系。我在下面附上了这张图的图片。
我现在要做的是使用第一个CSV数据集中的三次回归模型,根据给定的x值(GBA)预测第二个CSV数据集中的y值(PREDICTED_PRICE)。是否有让我建立连接的功能?
下面是我用来创建回归模型的代码train_X <- PRICEtable4.1$GBA
train_y <- PRICEtable4.1$PRICE
test_X <- PRICEtable4.1$GBA
test_y <- PRICEtable4.1$PRICE
X <- train_X
view(X)
y <- train_y
View(y)
poly_order <- 3
model <- lm (y~poly(X, poly_order))
print(model)
#MSE
test_yhat <- predict(model, data.frame (X = test_X))
MSE <- mean((test_y-test_yhat )^2)
print(MSE)
#R squared
test_ymean <- mean(test_y)
test_yhatmean <- mean( test_yhat)
R_squared <- (sum((test_yhat-test_yhatmean)*(test_y-test_ymean)))^2/(sum((test_yhat-test_yhatmean)^2)*sum((test_y-test_ymean)^2))
print(R_squared)
error2 <- data.frame(MSE=c(MSE),R_squared=c(R_squared))
View(error2)
#Visualization of the model
X_new = X
View(X_new)
y_new <- predict(model, data.frame (X = X_new))
View(y_new)
PRICEmodel <- ggplot(PRICEtable4.1,aes(x=GBA,y=PRICE))+geom_point(size=2)
PRICEmodel+geom_line(aes(x=X_new,y=y_new),color="Red") + labs(x="Gross building area (ft^2)", y="Price", title="Price Regression Model")
这是第一个名为PRICEtable4.1的数据集(前20行)的描述
structure(list(ID = c(1L, 2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L), GBA = c(1324L,
2120L, 1216L, 1804L, 1836L, 1228L, 1312L, 1262L, 1461L, 1120L,
1037L, 832L, 1500L, 920L, 1565L, 1134L, 1184L, 1420L, 2082L,
1422L), PRICE = c(1375000L, 1467000L, 549410L, 1180000L, 828000L,
742000L, 829000L, 710000L, 775000L, 380000L, 600000L, 189000L,
200000L, 265000L, 560000L, 300000L, 200000L, 940000L, 1050000L,
979000L)), row.names = c(NA, 20L), class = "data.frame")
这是第二个CSV数据集的名称为Test(前20行)
structure(list(ID = c(1L, 2L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L), GBA = c(1324L,
2120L, 1216L, 1804L, 1836L, 1228L, 1312L, 1262L, 1461L, 1120L,
1037L, 832L, 1500L, 920L, 1565L, 1134L, 1184L, 1420L, 2082L,
1422L), PRICE = c(1375000L, 1467000L, 549410L, 1180000L, 828000L,
742000L, 829000L, 710000L, 775000L, 380000L, 600000L, 189000L,
200000L, 265000L, 560000L, 300000L, 200000L, 940000L, 1050000L,
979000L)), row.names = c(NA, 20L), class = "data.frame")
> dput(Test[1:20, ])
structure(list(ID = 1:20, BATHRM = c(2L, 2L, 1L, 3L, 4L, 2L,
1L, 3L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 3L, 1L, 2L, 3L, 2L), HF_BATHRM = c(1L,
1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 1L, 0L), HEAT = c("Forced Air", "Forced Air", "Warm Cool",
"Forced Air", "Forced Air", "Warm Cool", "Hot Water Rad", "Forced Air",
"Warm Cool", "Forced Air", "Forced Air", "Hot Water Rad", "Forced Air",
"Forced Air", "Warm Cool", "Forced Air", "Forced Air", "Warm Cool",
"Ht Pump", "Forced Air"), AC = c("Y", "Y", "N", "Y", "Y", "Y",
"N", "Y", "Y", "N", "N", "N", "N", "Y", "Y", "Y", "Y", "Y", "Y",
"Y"), NUM_UNITS = c(2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), ROOMS = c(9L, 7L, 6L, 7L,
13L, 5L, 7L, 7L, 6L, 7L, 7L, 8L, 5L, 8L, 5L, 8L, 6L, 8L, 10L,
7L), BEDRM = c(3L, 3L, 3L, 4L, 6L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
2L, 4L, 2L, 4L, 3L, 2L, 3L, 3L), AYB = c(1870L, 1890L, 1911L,
1920L, 1993L, 1947L, 1895L, 1910L, 1910L, 1950L, 1951L, 1928L,
1941L, 2018L, 1939L, 2018L, 1980L, 1951L, 1910L, 1908L), YR_RMDL = c(1980L,
1963L, NA, 2001L, 2018L, NA, 1987L, 2017L, NA, NA, NA, NA, NA,
NA, 1992L, NA, 2013L, 2005L, 2004L, 1984L), EYB = c(1967L, 1982L,
1957L, 1972L, 2003L, 1958L, 1957L, 1964L, 1954L, 1960L, 1951L,
1954L, 1961L, 2018L, 1957L, 2018L, 1991L, 1961L, 1975L, 1960L
), STORIES = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1.75, 2, 1, 2,
2, 2, 2, 2, 2), GBA = c(1324L, 2120L, 1216L, 1804L, 5036L, 1836L,
1228L, 1312L, 1262L, 1461L, 1120L, 1037L, 832L, 1500L, 920L,
1565L, 1134L, 1184L, 1420L, 2082L), BLDG_NUM = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), STYLE = c("2 Story", "2 Story", "2 Story", "2 Story", "2 Story",
"2 Story", "2 Story", "2 Story", "2 Story", "2 Story", "2 Story",
"2 Story", "2 Story", "1 Story", "2 Story", "2 Story", "2 Story",
"2 Story", "2 Story", "2 Story"), STRUCT = c("Row Inside", "Row Inside",
"Row Inside", "Single", "Single", "Single", "Row Inside", "Row Inside",
"Row Inside", "Single", "Semi-Detached", "Single", "Row Inside",
"Semi-Detached", "Semi-Detached", "Single", "Row Inside", "Multi",
"Row Inside", "Row Inside"), LANDAREA = c(1575L, 1800L, 1280L,
5000L, 10252L, 3000L, 1500L, 1641L, 1358L, 6300L, 1818L, 3500L,
1280L, 5098L, 1899L, 5009L, 1152L, 2910L, 1762L, 1400L), ASSESSMENT_NBHD = c("Old City 2",
"Capitol Hill", "Old City 1", "Palisades", "Chevy Chase", "Chevy Chase",
"Eckington", "Ledroit Park", "Eckington", "Riggs Park", "Riggs Park",
"Woodridge", "Lily Ponds", "Fort Dupont Park", "Hillcrest", "Hillcrest",
"Congress Heights", "Congress Heights", "Old City 1", "Capitol Hill"
), PREDICTED_PRICE = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, 20L
), class = "data.frame")
因此,我需要使用从第一个数据集创建的回归模型预测并填写第二个CSV数据集的“ PREDICTED_PRICE”列中的值