Question

我正在尝试运行线性回归模型，其中我的数据中有虚拟变量，以指示某个预测变量是否不存在。我总共有15个预测变量。

无论预测变量的顺序如何，最后五个变量总是会产生NA。

此问题几乎与此处提出的问题完全相同：linear regression "NA" estimate just for last coefficient

我尝试在下面的代码中添加 -1 或 +0：

# Regress H on the 15 id* columns of macro.shed (intercept included by default).
lm(
  H ~ id11 + id21 + id22 + id23 + id24 + id31 + id41 + id42 + id43 + id52 +
    id71 + id81 + id82 + id90 + id95,
  data = macro.shed
)

这样做只让NA的数量减少了一个，所以现在有4个（而不是5个）预测变量的估计值为NA。

我的数据是从csv文件中读取的。

这是我的代码：

# Read the two input tables from CSV.
watershed <- read.csv("nlcd_2000_watershed.csv")
macro_2000 <- read.csv("wapp_macro_2000.csv")

# Lay the Area column out row by row as a 15-column matrix.
temp1 <- matrix(watershed$Area, ncol = 15, byrow = TRUE)

# Prepend the unique WaterID values and name the remaining columns
# "id<Value>" (e.g. id11, id21, ...).
nlcd_watershed <- data.frame(cbind(unique(watershed$WaterID), temp1))
names(nlcd_watershed) <- c(
  "WaterID",
  paste("id", unique(watershed$Value), sep = "")
)

# Join the area columns onto the macro_2000 table by WaterID.
macro.shed <- merge(
  macro_2000, nlcd_watershed,
  by.x = "WaterID", by.y = "WaterID"
)

# Show the unique Value codes next to the unique NLCD entries.
data.frame(unique(watershed$Value), unique(watershed$NLCD))

这是我的macro.shed数据：

dput(macro.shed)
structure(list(WaterID = c(1L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 
9L, 10L, 10L, 10L, 10L, 10L, 11L), ID = structure(c(1L, 16L, 
2L, 9L, 10L, 11L, 12L, 13L, 15L, 8L, 3L, 4L, 5L, 6L, 7L, 14L), .Label = c("L1", 
"L10", "L11", "L12", "L13", "L14", "L15", "L16", "L2", "L3", 
"L4", "L5", "L6", "L7", "L8", "L9"), class = "factor"), Date = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "8/20/2001", class = "factor"), 
    UTMX = c(607308L, 607112L, 598526L, 592235L, 603094L, 597749L, 
    605523L, 608668L, 600517L, 601806L, 597548L, 593815L, 591453L, 
    607187L, 606851L, 589528L), UTMY = c(4639040L, 4643780L, 
    4622470L, 4608350L, 4629780L, 4623340L, 4634330L, 4636950L, 
    4628160L, 4630380L, 4621720L, 4611960L, 4607960L, 4636480L, 
    4636020L, 4605120L), Watershed = structure(c(1L, 1L, 2L, 
    3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 10L, 10L, 10L, 10L, 11L), .Label = c("Cold Spring Creek", 
    "Drake Brook", "Dutchess County Airport", "East Branch Wappinger", 
    "Great Spring Creek", "Grist Mill Creek", "Hunns Lake Creek", 
    "Little Wappinger", "Upton Lake Creek", "Wappinger Creek", 
    "Wappinger Falls"), class = "factor"), richness = c(37L, 
    20L, 32L, 14L, 23L, 20L, 23L, 28L, 25L, 32L, 31L, 30L, 23L, 
    33L, 19L, 19L), H = c(0.9, 1, 0.9, 0.8, 1, 0.8, 0.7, 1, 1, 
    1, 1, 1, 1, 1, 0.9, 1), EPT = c(18L, 14L, 13L, 3L, 15L, 12L, 
    15L, 19L, 15L, 21L, 17L, 16L, 13L, 20L, 13L, 12L), DOM = c(62.1, 
    61.5, 64.1, 73.7, 53.4, 74, 80.3, 59.2, 55.6, 56.8, 57.4, 
    59.4, 54.2, 59.8, 66, 52.2), PMA = c(58.1, 51, 59.3, 39.9, 
    58.4, 45.2, 54.5, 75.3, 56.2, 64.3, 66, 53.7, 55.6, 60.4, 
    52.3, 42.4), FBI = c(3.8, 3.4, 4, 3.9, 3.6, 4.2, 5.2, 3.8, 
    3.5, 4.1, 3.7, 3.7, 4, 3.8, 3.5, 3.6), BAP = c(8.3, 6.8, 
    7.8, 3.9, 7.4, 6, 6.8, 8.4, 7.5, 8.2, 8.3, 7.8, 6.8, 8.3, 
    6.6, 6), Insects.sample = c(7123L, 516L, 2061L, 1341L, 921L, 
    961L, 580L, 1567L, 1180L, 4226L, 4133L, 1400L, 2325L, 2596L, 
    687L, 609L), id11 = c(216900L, 216900L, 4923900L, 131400L, 
    1806300L, 0L, 41945400L, 250200L, 200700L, 1908000L, 4500L, 
    4500L, 4500L, 4500L, 4500L, 25427700L), id21 = c(83700L, 
    83700L, 1163700L, 1290600L, 0L, 0L, 11841300L, 2824200L, 
    110700L, 136800L, 9000L, 9000L, 9000L, 9000L, 9000L, 9145800L
    ), id22 = c(111600L, 111600L, 596700L, 7245000L, 63900L, 
    11700L, 7293600L, 5060700L, 323100L, 179100L, 55800L, 55800L, 
    55800L, 55800L, 55800L, 3876300L), id23 = c(413100L, 413100L, 
    611100L, 1817100L, 0L, 0L, 11107800L, 208800L, 1713600L, 
    33300L, 204300L, 204300L, 204300L, 204300L, 204300L, 6268500L
    ), id24 = c(239400L, 239400L, 4547700L, 193500L, 26100L, 
    10800L, 48636900L, 88200L, 1139400L, 41400L, 16200L, 16200L, 
    16200L, 16200L, 16200L, 14818500L), id31 = c(63900L, 63900L, 
    14319000L, 526500L, 139500L, 0L, 58785300L, 398700L, 1723500L, 
    73800L, 0L, 0L, 0L, 0L, 0L, 31161600L), id41 = c(384300L, 
    384300L, 4142700L, 0L, 86400L, 0L, 9641700L, 357300L, 3166200L, 
    392400L, 0L, 0L, 0L, 0L, 0L, 963900L), id42 = c(729000L, 
    729000L, 508500L, 209700L, 13500L, 0L, 4072500L, 682200L, 
    2137500L, 31500L, 10800L, 10800L, 10800L, 10800L, 10800L, 
    3993300L), id43 = c(1224000L, 1224000L, 1266300L, 1532700L, 
    0L, 418500L, 6607800L, 695700L, 1356300L, 10800L, 78300L, 
    78300L, 78300L, 78300L, 78300L, 5419800L), id52 = c(16200L, 
    16200L, 57600L, 600300L, 17100L, 0L, 1730700L, 958500L, 120600L, 
    101700L, 20700L, 20700L, 20700L, 20700L, 20700L, 0L), id71 = c(22500L, 
    22500L, 780300L, 208800L, 5400L, 0L, 1139400L, 533700L, 7085700L, 
    582300L, 0L, 0L, 0L, 0L, 0L, 198000L), id81 = c(221400L, 
    221400L, 3398400L, 0L, 1649700L, 0L, 287100L, 155700L, 6300900L, 
    1511100L, 13500L, 13500L, 13500L, 13500L, 13500L, 264600L
    ), id82 = c(665100L, 665100L, 1513800L, 41400L, 447300L, 
    0L, 3083400L, 132300L, 616500L, 53100L, 2943900L, 2943900L, 
    2943900L, 2943900L, 2943900L, 931500L), id90 = c(2142000L, 
    2142000L, 826200L, 215100L, 0L, 17705700L, 630000L, 1156500L, 
    590400L, 15300L, 4598100L, 4598100L, 4598100L, 4598100L, 
    4598100L, 311400L), id95 = c(4628700L, 4628700L, 113400L, 
    4897800L, 0L, 10526400L, 358200L, 2281500L, 1431900L, 33300L, 
    4982400L, 4982400L, 4982400L, 4982400L, 4982400L, 0L)), .Names = c("WaterID", 
"ID", "Date", "UTMX", "UTMY", "Watershed", "richness", "H", "EPT", 
"DOM", "PMA", "FBI", "BAP", "Insects.sample", "id11", "id21", 
"id22", "id23", "id24", "id31", "id41", "id42", "id43", "id52", 
"id71", "id81", "id82", "id90", "id95"), row.names = c(NA, -16L
), class = "data.frame")

如何才能让最后几个变量不再产生NA？

Answer 1

你试图用仅有的16个观测值来拟合15个预测变量（如果包括截距则为16个参数）。

没有足够的数据来计算这么多参数，这就是为什么你只得到其中一些参数的估计值。

你需要使用某种正则化或模型选择方法，但即便如此，估计结果也会对所选择的方法很敏感。

Answer 2

补充一下@Pete的回答：许多变量之间存在高度共线性。可以用下面的代码将其可视化：

library(corrplot)

# Pairwise correlations between columns 15 to 29 of macro.shed
# (the id11 ... id95 predictors).
corPlot <- cor(macro.shed[, 15:29])

# Draw the correlation matrix with the coefficients printed as numbers.
# The original second assignment `corPlot <- cor(x)` referenced an undefined
# `x` and has been dropped.
corrplot(corPlot, method = "number")

R中线性回归模型的最后变量的多个NA

2 个答案: