在检查我的数据后,我的lm()
模型中似乎有一些比预期(或希望)更大的杠杆值。我希望根据哪个观察值超过杠杆阈值来删除它们。
# Keep only the rows whose title_year is 2000 or later. Subsetting does not
# renumber the rows, so the surviving rows keep their original row names.
df <- df[df$title_year >= 2000, ]

# Regress gross on every other column of df. The response is written as a
# bare column name because `data = df` already supplies the columns; putting
# `df$` inside the formula ties the model to the global object and breaks
# predict() with newdata.
model <- lm(gross ~ ., data = df)

# Leverage (diagonal of the hat matrix) for each observation used in the fit.
# The vector is named by the row names of df. (The original referred to an
# undefined object `model3` here.)
lev <- hatvalues(model)

# Observations whose leverage exceeds the cutoff. Presumably this is the
# 2 * p / n rule of thumb with p = 8 and n = 2546 -- TODO confirm against the
# fitted model. (The original line was missing the closing bracket.)
lev_val <- lev[lev > 2 * 8 / 2546]
> names(lev_val)
[1] "282" "90" "103" "25" "189" "53" "147" "51" "1526" "1" "133" "34" "273" "1852" "2172" "5" "916" "200" "79" "50"
[21] "29" "632" "266" "149" "28" "26" "107" "105" "202" "144" "203" "8" "414" "54" "279" "394" "4" "155" "10" "1539"
[41] "27" "1318" "153" "109" "1790" "1327" "918" "1275" "1055" "85" "11" "1543" "919" "33" "481" "759" "43" "2404" "30" "920"
[61] "212" "123" "42" "223" "58" "118" "111" "32" "281" "88" "1075" "1061" "421" "1517" "487" "2084" "774" "934" "1069" "86"
[81] "113" "221" "37" "60" "112" "304" "1347" "117" "697" "102" "1472" "225" "773" "2" "219" "121" "151" "1856" "18" "122"
[101] "182" "518" "1084" "2014" "776" "300" "71" "125" "2070" "1854" "97" "1288" "38" "1087" "2004" "227" "1747" "64" "98" "264"
[121] "23" "1729" "150" "65" "1559" "36" "52" "13" "128" "328" "39" "524" "886" "613" "2195" "2441" "2446" "2096" "84" "1522"
[141] "15" "1800" "462" "41" "1100" "1804" "14" "162" "1336" "232" "2193" "24" "529" "2088" "787" "68" "433" "785" "66" "1340"
[161] "1863" "1587" "788" "950" "2450" "1717" "158" "682" "2531" "951" "1110" "535" "539" "1478" "35" "72" "2136" "547" "1887" "21"
[181] "2208" "1873" "1126" "2144" "1805" "2211" "1722" "1874" "1721" "91" "16" "2451" "410" "31" "1153" "1593" "280" "1878" "366" "20"
[201] "2407" "69" "1605" "1606" "694" "1881" "403" "73" "303" "1149" "955" "1144" "893" "1138" "2221" "140" "404" "819" "1365" "2225"
[221] "1374" "1690" "1485" "442" "1150" "1614" "1486" "78" "1894" "1998" "185" "2236" "2229" "1373" "1155" "1369" "1817" "184" "1305" "9"
[241] "2017" "331" "1910" "466" "1621" "566" "1619" "1372" "2239" "570" "1908" "947" "1906" "587" "170" "1629" "17" "308" "2112" "2257"
[261] "400" "575" "1166" "402" "1745" "1733" "1818" "372" "962" "329" "829" "966" "67" "1174" "2248" "703" "1622" "1624" "1623" "574"
[281] "2255" "269" "1013" "1180" "1387" "1627" "40" "1924" "2270" "2260" "1631" "95" "1628" "1184" "2423" "2050" "1481" "1388" "1037" "2060"
[301] "843" "1746" "1202" "975" "977" "1392" "256" "2109" "2541" "333" "1495" "1047" "2426" "2478" "2277" "250" "2282" "981" "92" "848"
[321] "2290" "75" "1995" "1646" "2143" "688" "2123" "2488" "2499" "255" "1941" "1190" "914" "1189" "2487" "1911" "1768" "382" "2305" "1512"
[341] "2113" "2127" "2329" "980" "1203" "1204" "582" "1949" "2185" "600" "588" "1422" "1468" "1963" "2425" "2331" "2494" "2122" "1273" "2415"
[361] "2147" "2335" "2500" "2276" "864" "2397" "2504" "1235" "1269" "2361" "2360" "2119" "1009" "2371" "2362" "709" "1976" "1216" "2039" "594"
[381] "595" "2517" "77" "2124" "2373" "1680" "1682" "998" "259" "1984" "2386" "1785" "1683" "143" "1990" "1179" "2163" "2418" "82"
我不确定如何从我的数据中删除过多的杠杆值。
我已经尝试了df2 = df[which(names(df) %nin% remove),]
,但无论是这种写法还是稍作改动的版本都没有成功。它删除的行数多于杠杆值过大的观察值个数,可能是因为它是按值删除,而不是删除第 X 个观察值。
任何人都有基本的R代码吗?我好像迷路了。
答案 0 :(得分:2)
纯粹作为编程问题,您需要df[!(rownames(df) %in% remove), ]
,因为基础 R 中没有%nin%(该运算符来自 Hmisc 包);注意不要写成!which(...),那样会把所有非零下标都变成FALSE,结果一行也选不出来
。另请注意,您应该使用row.names
而不是names
,因为您要选择行而不是列。或者,为什么不简单地使用df[lev <= (2 * 8 / 2546), ]
?
在您的具体情况下,第一种方法会给出错误的结果。您已对df取子集并覆盖了它
,之后其row.names
不再是1,2,3,...。
从统计学角度来看,仅有高杠杆值并不意味着该观察值是异常值。高杠杆与大残差同时出现才是有问题的。Cook 距离是一个很好的衡量标准。