在R中使用data.table进行线性回归

时间:2014-03-23 08:38:27

标签: r data.table

我有一个代码如下:

# Run disAccRegFunc on each (SIC, FYEAR) subset of dt via ddply; the combined
# result replaces dt.
dt <- ddply(dt, .(SIC, FYEAR), disAccRegFunc)

# Fit the accrual regression on one piece of data and attach its residuals.
#
# Args:
#   dt: data frame with columns ACNew, DSALENew, PPEGTNew and ROANew.
#
# Returns:
#   `dt` with two added columns: RES (ordinary residuals of the fit) and
#   StudRES (studentized residuals computed by MASS::studres).
disAccRegFunc <- function(dt) {
  # Compute discretionary accrual.
  # na.exclude pads the residual vectors with NA for rows dropped because of
  # missing values, so both always have nrow(dt) elements and the column
  # assignments below cannot fail on a length mismatch.
  model <- lm(
    ACNew ~ DSALENew + PPEGTNew + ROANew,
    data = dt,
    na.action = na.exclude
  )
  dt$RES <- residuals(model)
  # Namespace-qualified so the call works even when MASS is not attached.
  dt$StudRES <- MASS::studres(model)
  dt
}

在这段代码中,我使用ddply函数对每个分组(SIC、FYEAR)应用函数disAccRegFunc。我还使用data.table编写了等效的代码,如下所示:

        # Group by the same keys as the ddply call; the original `by = .by`
        # referred to a name that is not defined in this snippet.
        dt[, disAccRegFunc(.SD), by = .(SIC, FYEAR)]

但我认为这样会比较慢,因为它必须把每个分组的.SD加载到内存中。有什么方法可以让这段代码更高效?谢谢。

以下是数据的快照:

structure(list(SIC = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1), FYEAR = c(1989, 1989, 1989, 1989, 1989, 1989, 1989, 
1989, 1989, 1989, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 1990, 
1990, 1990, 1990, 1991, 1991, 1991, 1991, 1991, 1991, 1991, 1991, 
1991, 1991, 1991, 1991, 1992, 1992, 1992, 1992, 1992, 1992, 1992, 
1992, 1992, 1992, 1992, 1993, 1993, 1993, 1993, 1993, 1993, 1993, 
1993, 1993, 1993, 1993, 1993, 1994, 1994, 1994, 1994, 1994, 1994, 
1994, 1994, 1994, 1994, 1994, 1994, 1994, 1995, 1995, 1995, 1995, 
1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1995, 1996, 1996, 
1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 1996, 
1997, 1997, 1997, 1997, 1997), DSALENew = c(0.012602500023269, 
0.291902040273809, 0.118617033965829, 0.0893175203093097, -0.00852049231260627, 
0.0088329859025545, 0.209634378324404, 0.0830958123218592, 0.0738020724667918, 
0.109482024510348, -0.0428304666755963, -0.16588866439072, 0.121627138869356, 
0.0312269226711679, 0.101225809778869, -0.0275779376498801, 0.237572262729396, 
-0.0121992630135952, -0.00510842445824787, 0.0576157552901739, 
0.0855443732845379, 0.0872632057071098, -0.156267221848019, 0.0815859699707067, 
-0.0430624961441175, 0.153418299584922, -8.85024282853663e-05, 
0.133435797726111, -0.0184609333710255, -0.146181230961207, 0.0781112477932131, 
0.0442291827447641, 0.00716417910447766, 0.0481930614039844, 
0.0541753077810537, 0.0665705612789049, 0.118963433232041, 0.0294059514659054, 
-0.632275504735391, -0.0502141257669623, 0.0958285084007296, 
-0.0272426945849758, 0.085945755547728, 0.301778708148965, -0.0459045802393442, 
0.0169764469498758, -0.0562287270251872, 0.0669855988183644, 
0.0547472043521437, -0.067714725413364, 0.0617426162281712, 0.440429133206918, 
0.0833729932633978, -0.0280735721200387, 0.0383118213480845, 
-0.0194657903500448, 0.0626774121566572, 0.113601675703828, 0.30761369443025, 
0.109305701022796, -0.030075679207274, 0.506415816050758, 0.125916995075369, 
-0.0196319087485011, 0.0578873173006881, 0.0897072710103872, 
-0.719538572458544, -0.478305381558564, 0.173499612656267, -0.0250104170612523, 
-0.0119919744060999, -0.139720136759367, 0.0418697622544592, 
0.0593189307945807, -0.505190248796772, 0.211265167747981, -0.0227537539511344, 
-0.00186915887850474, -1.10693374422188, 0.0756610100079348, 
0.0921928012282265, 0.159792224191246, 0.035416442965031, -0.153989830860226, 
0.0147928615931956, 0.139226519337017, -0.0114289417966556, -0.0513681750613451, 
0.307342194442119, 0.218928016906197, 0.198455485939094, 0.00664589823468326, 
0.0398349694124342, 0.115581717451524, 0.584157679434734, 0.00032726458480456, 
0.039702980735921, 0.200301694587959, 0.0416528925619835, 0.36358070267058
), PPEGTNew = c(0.953973882854457, 0.467086462417758, 0.638359582619649, 
0.265758721056519, 0.689282635504338, 0.512784274929631, 0.500905533546401, 
0.302835073280151, 0.392572501564782, 0.173497722981228, 0.969752361742529, 
0.24137810910611, 0.684512774616975, 0.339065302247072, 0.70778363944283, 
0.509976924121081, 0.570903219759228, 0.30435274734949, 0.355635184458544, 
0.129286263007193, 0.895242451967063, 0.949408181259518, 0.336231706570326, 
0.744086161679677, 0.305410574372262, 0.680226270692954, 0.513867224231965, 
0.556038600187438, 0.256535686631187, 0.230172129041729, 0.205983930988692, 
0.918203511012942, 0.82228855721393, 0.940475545033404, 0.784086638101383, 
0.285761166391243, 0.66772390313165, 0.533457779729878, 0.584734315365566, 
0.261132460991096, 0.322836113207432, 0.978359054565186, 0.794293765410356, 
0.712110006643519, 0.914969230419874, 0.784890348299594, 0.540912185159973, 
0.648134411028597, 0.551530590216312, 0.67236550759716, 0.30596000552919, 
0.299670387008921, 1.10469551228341, 0.748951274604711, 0.447654169227617, 
0.541347525306156, 0.887468150139335, 0.791733854056621, 0.647061829330608, 
0.680562828917594, 0.598813088691675, 0.736365482650909, 0.388248067037364, 
0.348369262297389, 1.16189687609724, 0.77435945860875, 0.534607065609229, 
0.690092274533576, 0.254856986617654, 0.868432516383196, 0.499511832537101, 
0.631419407141095, 0.71472891353351, 0.660164441929363, 0.612835925592257, 
0.454140131190202, 0.396457871496262, 0.747663551401869, 0.663174114021572, 
0.760667025007039, 0.269186299967778, 0.73279715312704, 0.868230788503526, 
0.409463525993566, 0.663444240909767, 0.725598526703499, 0.691893877198289, 
0.674017924092229, 0.598189952656007, 0.373202231080638, 0.775171207926563, 
1.22658359293873, 0.805768957177408, 0.220152354570637, 0.591156955236735, 
0.832888368327678, 0.431280406066905, 0.670032249971822, 0.687603305785124, 
0.705083646971962), ACNew = c(0.0395945606344065, 0.0664252010367515, 
-0.0301384111110581, 0.00254137886094096, -0.0265658063511183, 
0.0166295681258759, 0.017084585460487, -0.2362842156747, 0.0046734821614855, 
0.226591136287904, 0.0173969224465998, 0.17431895770919, 0.0477768543970679, 
0.0700759573794704, -0.00381011122461684, 0.0327360752907108, 
0.0270528951957744, 0.0692339617421051, 0.145256938943222, -0.012902437321042, 
0.0731930466605672, 0.0408395008950757, 0.220439654644541, -0.0044062389767342, 
0.275945462397434, 0.0790790446221029, -0.0311086035423097, -0.0284790074946835, 
0.0561202541758336, 0.139409843285499, 0.0526540633186986, 0.0137318040290603, 
-0.00597014925373134, 0.0544822559172043, -0.00638549410303916, 
0.355472733026265, 0.0192527105080905, 0.0449544306577358, 0.06393425639316, 
0.000712762608473587, 0.128074844252703, 0.0703969102602978, 
0.0250088059175766, 0.0264988655878261, 0.0102073256579694, 0.0162804709314325, 
0.848230088495575, 0.0312279981947237, 0.00828979750442554, 0.0349715025339877, 
0.100239598212229, 0.187866678544612, 0.0916311961658344, -0.0464666021297193, 
0.322474902683876, 0.1259809866102, -0.0111675601060711, 0.000467070106532058, 
0.0368932038834952, 0.0957255259448906, 0.00633745304077966, 
0.0479452127623754, 0.134842513781169, -0.0118286894983082, 0.0424276408533741, 
0.0698772426817753, 0.341384282624369, 0.0355627399474641, 0.216942870596813, 
-0.0462706920716694, 0.036495723805627, 0.147967995488351, -0.0337729047054296, 
-0.0368829266668886, 0.0225649771950469, 0.0382707722793551, 
0.0477004248403114, -0.00872274143302181, 0.551001540832049, 
0.0122219662648135, 0.156750507022499, 0.0566504760333187, -0.00385661619957765, 
0.0477623445351992, 0.00240292197259525, 0.122283609576427, 0.0188393898803094, 
0.0593513325987576, 0.565895582777576, 0.285157733474218, 0.155034241585313, 
-0.149117341640706, -0.017095841039503, -0.126246537396122, -0.2505725078781, 
-0.0850198942418657, -0.00165680116879791, 0.0528700778192669, 
-0.044297520661157, -0.161513388917902), ROANew = c(0.163292659090274, 
-0.226041735894198, 0.0452896804759701, -0.064034058820974, 0.0921216374505778, 
0.0575910680846553, 0.0444595485158336, -0.114887086165315, 0.0114889769803185, 
-0.0696064274871339, 0.0740157108076805, -0.186354769037832, 
0.0513349954186111, -0.235757991349298, 0.0756937929151935, 0.0442740147504638, 
0.0495950382782889, -0.0935696049702564, -0.0920108683581228, 
-0.0784272644737761, 0.070448307410796, 0.0760680791941646, -0.470699895903496, 
0.0512729475637624, -0.368708742056882, 0.0996120773321018, 0.0328344008938745, 
0.0483339285972892, -0.176730866139797, -0.172129041729365, -0.0340218336790418, 
0.0206509300902148, 0.0624875621890547, 0.0590244519545298, 0.0221510307432041, 
-0.425699987107814, 0.132221178502936, 0.0329197320742286, -0.0736185481498404, 
-0.22114647865573, -0.185582948611794, 0.0310091128169936, 0.0352236703064459, 
0.0579615678704388, 0.0335740400118082, 0.0240287571367621, -1.15527569775357, 
0.11279120338079, 0.0641595786019602, -0.0181738389010379, -0.290547389761784, 
-0.220156025300024, 0.0394543687368033, 0.0100032268473701, -0.58840401557058, 
0.0152936519406099, 0.0802071603157223, 0.0187610177761631, -1.2452733776188, 
0.171864696070445, 0.0332552948222355, -0.0172610138205074, -0.513469255546958, 
0.0195868685219487, 0.0165811580607801, 0.0346238589864652, -0.301730353280461, 
0.0856334613053142, -0.533258396245044, 0.0846717678699951, 0.0380945477528242, 
-0.990166014592365, 0.1437311990952, 0.00963128169279762, 0.0101235097874349, 
-0.365303235587282, -0.0863005431502716, 0.029595015576324, -1.14576271186441, 
0.0828918068033479, -0.572186735912356, 0.144467323379801, 0.0397204624360213, 
-0.0796039075586653, 0.0361253877308557, 0.16427255985267, 0.0364040962710107, 
-0.0108940888920407, -0.601575652723907, -0.243145420678573, 
0.0495410170479382, -0.144963655244029, 0.0926992934035187, -0.0573407202216066, 
0.119598363703979, 0.096224400158465, -0.0436491798834214, 0.0647035511327484, 
0.160661157024793, 0.0546001332589004)), .Names = c("SIC", "FYEAR", 
"DSALENew", "PPEGTNew", "ACNew", "ROANew"), row.names = c(NA, 
100L), class = "data.frame")

1 个答案:

答案 0 :(得分:5)

最好重写您的函数并使用 `:=`

library(MASS)

# Regress ACNew on DSALENew, PPEGTNew and ROANew for one group of rows.
#
# Args:
#   dt: data frame (for example data.table's .SD) holding the model columns.
#
# Returns:
#   Unnamed two-element list: the ordinary residuals first, then the
#   studentized residuals from studres(), each with one value per row of dt.
disAccRegFunc <- function(dt) {
  # na.exclude pads both vectors with NA for rows dropped because of missing
  # values, so a grouped `:=` assignment receives one value per row.
  model <- lm(
    ACNew ~ DSALENew + PPEGTNew + ROANew,
    data = dt,
    na.action = na.exclude
  )
  list(residuals(model), studres(model))
}

# Add Res and StudRes by reference, one regression per (SIC, FYEAR) group.
# .SDcols limits .SD to the four model columns so that unused columns are not
# materialised for every group.
DT[, c("Res", "StudRes") := disAccRegFunc(.SD),
   by = .(SIC, FYEAR),
   .SDcols = c("ACNew", "DSALENew", "PPEGTNew", "ROANew")]

使用lm.fit并"手动"计算学生化残差,可以更快地做到这一点。

PS:为什么您认为ddply可能比data.table更快?我无法理解。