我正在尝试运行线性回归模型,其中我的数据中有虚拟变量,以指示某个预测变量是否不存在。我总共有15个预测变量。
无论预测变量的顺序如何,最后五个变量的系数估计值总是NA。
此问题几乎与此处提出的问题完全相同:linear regression "NA" estimate just for last coefficient
我尝试在以下代码的公式中添加-1或+0:
lm(H~id11+id21+id22+id23+id24+id31+id41+id42+id43+id52+id71+id81+id82+id90+id95, data=macro.shed)
结果只是让NA减少了一个,所以现在有4个而不是5个预测变量为NA。
我正在阅读来自csv文档的数据。
这是我的代码:
# Read the land-cover table and the macroinvertebrate sample table.
watershed <- read.csv("nlcd_2000_watershed.csv")
macro_2000 <- read.csv("wapp_macro_2000.csv")

# Reshape the Area column into a 15-column matrix, filled row by row.
# NOTE(review): assumes the rows of `watershed` are grouped by WaterID and
# list the 15 land-cover Value classes in the same order for every
# watershed -- confirm against the CSV.
temp1 <- matrix(watershed$Area, ncol = 15, byrow = TRUE)
nlcd_watershed <- data.frame(cbind(unique(watershed$WaterID), temp1))

# Column names: "WaterID" followed by "id<Value>" (e.g. "id11"), with no
# separator or padding, so they match the names used in the lm() formula.
names(nlcd_watershed) <- c("WaterID", paste0("id", unique(watershed$Value)))

# Attach the land-cover areas to every sample taken in the same watershed.
macro.shed <- merge(
  macro_2000, nlcd_watershed,
  by.x = "WaterID", by.y = "WaterID"
)

# Show the unique land-cover codes next to the unique NLCD entries.
data.frame(unique(watershed$Value), unique(watershed$NLCD))
这是我的macro.shed数据:
dput(macro.shed)
structure(list(WaterID = c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 10L, 10L, 10L, 10L, 11L), ID = structure(c(1L, 16L,
2L, 9L, 10L, 11L, 12L, 13L, 15L, 8L, 3L, 4L, 5L, 6L, 7L, 14L), .Label = c("L1",
"L10", "L11", "L12", "L13", "L14", "L15", "L16", "L2", "L3",
"L4", "L5", "L6", "L7", "L8", "L9"), class = "factor"), Date = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "8/20/2001", class = "factor"),
UTMX = c(607308L, 607112L, 598526L, 592235L, 603094L, 597749L,
605523L, 608668L, 600517L, 601806L, 597548L, 593815L, 591453L,
607187L, 606851L, 589528L), UTMY = c(4639040L, 4643780L,
4622470L, 4608350L, 4629780L, 4623340L, 4634330L, 4636950L,
4628160L, 4630380L, 4621720L, 4611960L, 4607960L, 4636480L,
4636020L, 4605120L), Watershed = structure(c(1L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 10L, 10L, 10L, 10L, 11L), .Label = c("Cold Spring Creek",
"Drake Brook", "Dutchess County Airport", "East Branch Wappinger",
"Great Spring Creek", "Grist Mill Creek", "Hunns Lake Creek",
"Little Wappinger", "Upton Lake Creek", "Wappinger Creek",
"Wappinger Falls"), class = "factor"), richness = c(37L,
20L, 32L, 14L, 23L, 20L, 23L, 28L, 25L, 32L, 31L, 30L, 23L,
33L, 19L, 19L), H = c(0.9, 1, 0.9, 0.8, 1, 0.8, 0.7, 1, 1,
1, 1, 1, 1, 1, 0.9, 1), EPT = c(18L, 14L, 13L, 3L, 15L, 12L,
15L, 19L, 15L, 21L, 17L, 16L, 13L, 20L, 13L, 12L), DOM = c(62.1,
61.5, 64.1, 73.7, 53.4, 74, 80.3, 59.2, 55.6, 56.8, 57.4,
59.4, 54.2, 59.8, 66, 52.2), PMA = c(58.1, 51, 59.3, 39.9,
58.4, 45.2, 54.5, 75.3, 56.2, 64.3, 66, 53.7, 55.6, 60.4,
52.3, 42.4), FBI = c(3.8, 3.4, 4, 3.9, 3.6, 4.2, 5.2, 3.8,
3.5, 4.1, 3.7, 3.7, 4, 3.8, 3.5, 3.6), BAP = c(8.3, 6.8,
7.8, 3.9, 7.4, 6, 6.8, 8.4, 7.5, 8.2, 8.3, 7.8, 6.8, 8.3,
6.6, 6), Insects.sample = c(7123L, 516L, 2061L, 1341L, 921L,
961L, 580L, 1567L, 1180L, 4226L, 4133L, 1400L, 2325L, 2596L,
687L, 609L), id11 = c(216900L, 216900L, 4923900L, 131400L,
1806300L, 0L, 41945400L, 250200L, 200700L, 1908000L, 4500L,
4500L, 4500L, 4500L, 4500L, 25427700L), id21 = c(83700L,
83700L, 1163700L, 1290600L, 0L, 0L, 11841300L, 2824200L,
110700L, 136800L, 9000L, 9000L, 9000L, 9000L, 9000L, 9145800L
), id22 = c(111600L, 111600L, 596700L, 7245000L, 63900L,
11700L, 7293600L, 5060700L, 323100L, 179100L, 55800L, 55800L,
55800L, 55800L, 55800L, 3876300L), id23 = c(413100L, 413100L,
611100L, 1817100L, 0L, 0L, 11107800L, 208800L, 1713600L,
33300L, 204300L, 204300L, 204300L, 204300L, 204300L, 6268500L
), id24 = c(239400L, 239400L, 4547700L, 193500L, 26100L,
10800L, 48636900L, 88200L, 1139400L, 41400L, 16200L, 16200L,
16200L, 16200L, 16200L, 14818500L), id31 = c(63900L, 63900L,
14319000L, 526500L, 139500L, 0L, 58785300L, 398700L, 1723500L,
73800L, 0L, 0L, 0L, 0L, 0L, 31161600L), id41 = c(384300L,
384300L, 4142700L, 0L, 86400L, 0L, 9641700L, 357300L, 3166200L,
392400L, 0L, 0L, 0L, 0L, 0L, 963900L), id42 = c(729000L,
729000L, 508500L, 209700L, 13500L, 0L, 4072500L, 682200L,
2137500L, 31500L, 10800L, 10800L, 10800L, 10800L, 10800L,
3993300L), id43 = c(1224000L, 1224000L, 1266300L, 1532700L,
0L, 418500L, 6607800L, 695700L, 1356300L, 10800L, 78300L,
78300L, 78300L, 78300L, 78300L, 5419800L), id52 = c(16200L,
16200L, 57600L, 600300L, 17100L, 0L, 1730700L, 958500L, 120600L,
101700L, 20700L, 20700L, 20700L, 20700L, 20700L, 0L), id71 = c(22500L,
22500L, 780300L, 208800L, 5400L, 0L, 1139400L, 533700L, 7085700L,
582300L, 0L, 0L, 0L, 0L, 0L, 198000L), id81 = c(221400L,
221400L, 3398400L, 0L, 1649700L, 0L, 287100L, 155700L, 6300900L,
1511100L, 13500L, 13500L, 13500L, 13500L, 13500L, 264600L
), id82 = c(665100L, 665100L, 1513800L, 41400L, 447300L,
0L, 3083400L, 132300L, 616500L, 53100L, 2943900L, 2943900L,
2943900L, 2943900L, 2943900L, 931500L), id90 = c(2142000L,
2142000L, 826200L, 215100L, 0L, 17705700L, 630000L, 1156500L,
590400L, 15300L, 4598100L, 4598100L, 4598100L, 4598100L,
4598100L, 311400L), id95 = c(4628700L, 4628700L, 113400L,
4897800L, 0L, 10526400L, 358200L, 2281500L, 1431900L, 33300L,
4982400L, 4982400L, 4982400L, 4982400L, 4982400L, 0L)), .Names = c("WaterID",
"ID", "Date", "UTMX", "UTMY", "Watershed", "richness", "H", "EPT",
"DOM", "PMA", "FBI", "BAP", "Insects.sample", "id11", "id21",
"id22", "id23", "id24", "id31", "id41", "id42", "id43", "id52",
"id71", "id81", "id82", "id90", "id95"), row.names = c(NA, -16L
), class = "data.frame")
如何使最后几个变量不会产生NA?
答案 0 :(得分:1)
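你试图只用16个观测值来拟合15个预测变量(如果包括截距则为16个参数)。而且这16行中只有11个不同的WaterID,同一流域的各行预测变量取值完全相同,因此设计矩阵的秩最多为11:带截距时16个参数中有5个无法估计,去掉截距后15个参数中仍有4个无法估计。
没有足够的数据来计算这么多参数,这就是为什么你只得到其中一部分参数的估计值,其余为NA。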
您需要使用某种正则化或模型选择,但即便如此,您的估算也会对您选择的方法敏感。
答案 1 :(得分:1)
作为对@Pete答案的补充:许多变量之间存在高度共线性。为了便于直观查看,可以绘制相关矩阵:
library(corrplot)

# Pairwise correlations among the land-cover predictors, which occupy
# columns 15 to 29 (id11 ... id95) of macro.shed.
corPlot <- cor(macro.shed[, 15:29])

# Draw the correlation matrix with the coefficients printed as numbers.
corrplot(corPlot, method = "number")