最实用的多表结构方式

时间:2012-09-15 22:48:38

标签: r structure lapply

我根据目录中的一批 csv 文件填充了一个数据结构,其中显示一系列县(按行)的就业、住户等增长的年度时间序列。虽然我可以按 dat 对象的长度循环遍历,把每个表填充到一个数据框中再依次处理,但我对 lapply 函数族更感兴趣,希望借此避免循环,使代码更简单、更易于维护。

假设我想对结构中所有表都含有的某一列执行操作(例如用 grep 把所有列重命名为列名中嵌入的年份部分,或者把 county_id.i8 变量转换为具有适当因子水平的 factor)。我无法正确地对该结构进行索引以到达第二层和第三层,因而无法对这些列应用 lapply 调用。例如,我可以通过

以下方式访问其中一个子表中的 county_id.i8 列:
> head(dat[1][[1]][1])
  county_id.i8
1            1
2            7
3           17
4           21
5           23
6           24

但这只能通过写入特定的索引来实现。换句话说,我该如何让类似下面的代码正常工作(此处把字母指定为因子水平,仅作为用 lapply 处理结构中各表重复出现的列的一般示例)?

lapply(dat[][[]][1],factor, labels=letters[1:9]) #indices omitted here; looking for general notation for access to the first column in all tables in structure

关于如何正确地向下索引多个层级,以及最终如何对结构中各个表的列调用 lapply 函数,有什么建议吗?(数据见下文。)

> dat<-lapply(fileList,read.csv,header=T,sep = "\t")
> dput(dat)
list(structure(list(county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 
28L, 35L, 38L, 39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_employment_pda_2010.f8 = c(313325, 
102645, 0, 19302, 0, 0, 9340, 0, 483207, 0, 110998, 444452, 0, 
22426, 74123, 0, 0), county_employment_pda_2011.f8 = c(313216, 
102576, 0, 19281, 0, 0, 9338, 0, 483043, 0, 110974, 445445, 0, 
22417, 74017, 0, 0), county_employment_pda_2012.f8 = c(313238, 
102504, 0, 19252, 0, 0, 9333, 0, 482906, 0, 110947, 446132, 0, 
22416, 73971, 0, 0), county_employment_pda_2013.f8 = c(313137, 
102418, 0, 19262, 0, 0, 9352, 0, 482697, 0, 110867, 447037, 0, 
22373, 73946, 0, 0), county_employment_pda_2014.f8 = c(313114, 
102399, 0, 19255, 0, 0, 9357, 0, 482469, 0, 110784, 447622, 0, 
22359, 73874, 0, 0), county_employment_pda_2015.f8 = c(312961, 
102281, 0, 19249, 0, 0, 9357, 0, 482181, 0, 110733, 448479, 0, 
22358, 73877, 0, 0), county_employment_pda_2016.f8 = c(312943, 
102273, 0, 19239, 0, 0, 9329, 0, 481983, 0, 110719, 449118, 0, 
22367, 73848, 0, 0), county_employment_pda_2017.f8 = c(312839, 
102216, 0, 19231, 0, 0, 9329, 0, 481889, 0, 110653, 450126, 0, 
22360, 73789, 0, 0), county_employment_pda_2018.f8 = c(312760, 
102188, 0, 19219, 0, 0, 9331, 0, 481795, 0, 110620, 451350, 0, 
22341, 73748, 0, 0), county_employment_pda_2019.f8 = c(312723, 
102139, 0, 19214, 0, 0, 9316, 0, 481816, 0, 110484, 452171, 0, 
22311, 73725, 0, 0), county_employment_pda_2020.f8 = c(312531, 
102094, 0, 19208, 0, 0, 9316, 0, 481812, 0, 110444, 453251, 0, 
22294, 73681, 0, 0)), .Names = c("county_id.i8", "county_employment_pda_2010.f8", 
"county_employment_pda_2011.f8", "county_employment_pda_2012.f8", 
"county_employment_pda_2013.f8", "county_employment_pda_2014.f8", 
"county_employment_pda_2015.f8", "county_employment_pda_2016.f8", 
"county_employment_pda_2017.f8", "county_employment_pda_2018.f8", 
"county_employment_pda_2019.f8", "county_employment_pda_2020.f8"
), class = "data.frame", row.names = c(NA, -17L)), structure(list(
    county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 28L, 35L, 38L, 
    39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_employment_tpp_2010.f8 = c(450548, 
    164024, 0, 76243, 0, 0, 9280, 0, 578345, 0, 201026, 709110, 
    0, 39157, 79052, 0, 0), county_employment_tpp_2011.f8 = c(450309, 
    163934, 0, 76189, 0, 0, 9280, 0, 578065, 0, 200971, 710321, 
    0, 39154, 78941, 0, 0), county_employment_tpp_2012.f8 = c(450223, 
    163828, 0, 76125, 0, 0, 9278, 0, 577849, 0, 200922, 710952, 
    0, 39137, 78894, 0, 0), county_employment_tpp_2013.f8 = c(450078, 
    163780, 0, 76119, 0, 0, 9297, 0, 577584, 0, 200821, 712099, 
    0, 39123, 78858, 0, 0), county_employment_tpp_2014.f8 = c(449954, 
    163635, 0, 76071, 0, 0, 9297, 0, 577275, 0, 200757, 713066, 
    0, 39093, 78754, 0, 0), county_employment_tpp_2015.f8 = c(449743, 
    163455, 0, 76039, 0, 0, 9298, 0, 576946, 0, 200697, 713844, 
    0, 39095, 78671, 0, 0), county_employment_tpp_2016.f8 = c(449702, 
    163416, 0, 76012, 0, 0, 9270, 0, 576679, 0, 200621, 714573, 
    0, 39095, 78608, 0, 0), county_employment_tpp_2017.f8 = c(449493, 
    163366, 0, 75941, 0, 0, 9264, 0, 576523, 0, 200540, 715484, 
    0, 39093, 78554, 0, 0), county_employment_tpp_2018.f8 = c(449325, 
    163290, 0, 75815, 0, 0, 9266, 0, 576412, 0, 200353, 716977, 
    0, 39052, 78510, 0, 0), county_employment_tpp_2019.f8 = c(449077, 
    163186, 0, 75750, 0, 0, 9251, 0, 576354, 0, 200169, 717829, 
    0, 39018, 78487, 0, 0), county_employment_tpp_2020.f8 = c(448740, 
    163053, 0, 75704, 0, 0, 9250, 0, 576269, 0, 200122, 718708, 
    0, 39008, 78352, 0, 0)), .Names = c("county_id.i8", "county_employment_tpp_2010.f8", 
"county_employment_tpp_2011.f8", "county_employment_tpp_2012.f8", 
"county_employment_tpp_2013.f8", "county_employment_tpp_2014.f8", 
"county_employment_tpp_2015.f8", "county_employment_tpp_2016.f8", 
"county_employment_tpp_2017.f8", "county_employment_tpp_2018.f8", 
"county_employment_tpp_2019.f8", "county_employment_tpp_2020.f8"
), class = "data.frame", row.names = c(NA, -17L)), structure(list(
    county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 28L, 35L, 38L, 
    39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_empres_pda_2010.f8 = c(201923, 
    43963, 0, 8772, 0, 0, 815, 0, 186251, 0, 78408, 160782, 0, 
    7238, 38390, 0, 0), county_empres_pda_2011.f8 = c(201783, 
    43849, 0, 8732, 0, 0, 795, 0, 213456, 0, 82832, 167626, 0, 
    7284, 37663, 0, 0), county_empres_pda_2012.f8 = c(202059, 
    44012, 0, 8742, 0, 0, 795, 0, 225552, 0, 87327, 167766, 0, 
    7498, 37518, 0, 0), county_empres_pda_2013.f8 = c(201918, 
    43878, 0, 8715, 0, 0, 789, 0, 232941, 0, 93303, 170896, 0, 
    7502, 38012, 0, 0), county_empres_pda_2014.f8 = c(209007, 
    43640, 0, 8648, 0, 0, 787, 0, 235599, 0, 96654, 174762, 0, 
    7530, 37910, 0, 0), county_empres_pda_2015.f8 = c(212050, 
    43789, 0, 8572, 0, 0, 776, 0, 234853, 0, 100111, 179057, 
    0, 7551, 37825, 0, 0), county_empres_pda_2016.f8 = c(214927, 
    43883, 0, 8531, 0, 0, 764, 0, 239730, 0, 102522, 182816, 
    0, 7518, 37677, 0, 0), county_empres_pda_2017.f8 = c(218551, 
    44331, 0, 8474, 0, 0, 764, 0, 240854, 0, 105426, 186818, 
    0, 7531, 37545, 0, 0), county_empres_pda_2018.f8 = c(220972, 
    45006, 0, 8432, 0, 0, 789, 0, 241628, 0, 107229, 190735, 
    0, 7546, 37596, 0, 0), county_empres_pda_2019.f8 = c(223044, 
    45761, 0, 8379, 0, 0, 818, 0, 244283, 0, 108506, 194185, 
    0, 7521, 37502, 0, 0), county_empres_pda_2020.f8 = c(224509, 
    46506, 0, 8394, 0, 0, 821, 0, 247482, 0, 109911, 197017, 
    0, 7504, 37591, 0, 0)), .Names = c("county_id.i8", "county_empres_pda_2010.f8", 
"county_empres_pda_2011.f8", "county_empres_pda_2012.f8", "county_empres_pda_2013.f8", 
"county_empres_pda_2014.f8", "county_empres_pda_2015.f8", "county_empres_pda_2016.f8", 
"county_empres_pda_2017.f8", "county_empres_pda_2018.f8", "county_empres_pda_2019.f8", 
"county_empres_pda_2020.f8"), class = "data.frame", row.names = c(NA, 
-17L)), structure(list(county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 
24L, 28L, 35L, 38L, 39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), 
    county_empres_tpp_2010.f8 = c(443752, 155338, 0, 65907, 0, 
    0, 2828, 0, 456214, 0, 206571, 617212, 0, 37273, 48617, 0, 
    0), county_empres_tpp_2011.f8 = c(445080, 154940, 0, 65404, 
    0, 0, 2763, 0, 487189, 0, 212888, 630774, 0, 37297, 47577, 
    0, 0), county_empres_tpp_2012.f8 = c(445075, 155693, 0, 65455, 
    0, 0, 2774, 0, 499931, 0, 219766, 628413, 0, 37542, 47263, 
    0, 0), county_empres_tpp_2013.f8 = c(444322, 155348, 0, 65132, 
    0, 0, 2741, 0, 507989, 0, 228214, 632356, 0, 37431, 47767, 
    0, 0), county_empres_tpp_2014.f8 = c(451863, 154962, 0, 64733, 
    0, 0, 2723, 0, 509617, 0, 234769, 637378, 0, 37330, 47588, 
    0, 0), county_empres_tpp_2015.f8 = c(454943, 155713, 0, 64439, 
    0, 0, 2703, 0, 506948, 0, 241141, 643996, 0, 37412, 47541, 
    0, 0), county_empres_tpp_2016.f8 = c(458014, 156027, 0, 64177, 
    0, 0, 2673, 0, 512542, 0, 245412, 649009, 0, 37347, 47498, 
    0, 0), county_empres_tpp_2017.f8 = c(462516, 156941, 0, 63885, 
    0, 0, 2662, 0, 512233, 0, 249690, 654784, 0, 37323, 47328, 
    0, 0), county_empres_tpp_2018.f8 = c(465893, 158289, 0, 63755, 
    0, 0, 2713, 0, 511619, 0, 252055, 660675, 0, 37380, 47365, 
    0, 0), county_empres_tpp_2019.f8 = c(468709, 159607, 0, 63518, 
    0, 0, 2776, 0, 513150, 0, 253851, 666054, 0, 37360, 47326, 
    0, 0), county_empres_tpp_2020.f8 = c(471780, 160499, 0, 63788, 
    0, 0, 2781, 0, 515528, 0, 255491, 669768, 0, 37373, 47399, 
    0, 0)), .Names = c("county_id.i8", "county_empres_tpp_2010.f8", 
"county_empres_tpp_2011.f8", "county_empres_tpp_2012.f8", "county_empres_tpp_2013.f8", 
"county_empres_tpp_2014.f8", "county_empres_tpp_2015.f8", "county_empres_tpp_2016.f8", 
"county_empres_tpp_2017.f8", "county_empres_tpp_2018.f8", "county_empres_tpp_2019.f8", 
"county_empres_tpp_2020.f8"), class = "data.frame", row.names = c(NA, 
-17L)), structure(list(county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 
24L, 28L, 35L, 38L, 39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), 
    county_households_pda_2010.f8 = c(170187, 37225, 0, 7006, 
    0, 0, 651, 0, 156702, 0, 53789, 111880, 0, 6479, 29142, 0, 
    0), county_households_pda_2011.f8 = c(169149, 37004, 0, 6943, 
    0, 0, 638, 0, 174237, 0, 56758, 115577, 0, 6497, 28608, 0, 
    0), county_households_pda_2012.f8 = c(169278, 37095, 0, 6955, 
    0, 0, 637, 0, 183566, 0, 60653, 115752, 0, 6634, 28529, 0, 
    0), county_households_pda_2013.f8 = c(169023, 36924, 0, 6933, 
    0, 0, 633, 0, 189987, 0, 65709, 117615, 0, 6626, 29128, 0, 
    0), county_households_pda_2014.f8 = c(174100, 36699, 0, 6889, 
    0, 0, 629, 0, 192437, 0, 68543, 120561, 0, 6643, 29129, 0, 
    0), county_households_pda_2015.f8 = c(176860, 36749, 0, 6839, 
    0, 0, 624, 0, 191746, 0, 71450, 123785, 0, 6646, 29103, 0, 
    0), county_households_pda_2016.f8 = c(179434, 36761, 0, 6812, 
    0, 0, 618, 0, 196075, 0, 73490, 126676, 0, 6616, 28999, 0, 
    0), county_households_pda_2017.f8 = c(182536, 37124, 0, 6772, 
    0, 0, 619, 0, 197431, 0, 75885, 129800, 0, 6618, 28961, 0, 
    0), county_households_pda_2018.f8 = c(184556, 37722, 0, 6736, 
    0, 0, 638, 0, 198440, 0, 77483, 132850, 0, 6620, 29037, 0, 
    0), county_households_pda_2019.f8 = c(186021, 38369, 0, 6698, 
    0, 0, 663, 0, 201268, 0, 78655, 135419, 0, 6591, 29009, 0, 
    0), county_households_pda_2020.f8 = c(187210, 38907, 0, 6717, 
    0, 0, 665, 0, 204334, 0, 79840, 137468, 0, 6569, 29120, 0, 
    0)), .Names = c("county_id.i8", "county_households_pda_2010.f8", 
"county_households_pda_2011.f8", "county_households_pda_2012.f8", 
"county_households_pda_2013.f8", "county_households_pda_2014.f8", 
"county_households_pda_2015.f8", "county_households_pda_2016.f8", 
"county_households_pda_2017.f8", "county_households_pda_2018.f8", 
"county_households_pda_2019.f8", "county_households_pda_2020.f8"
), class = "data.frame", row.names = c(NA, -17L)), structure(list(
    county_id.i8 = c(1L, 7L, 17L, 21L, 23L, 24L, 28L, 35L, 38L, 
    39L, 41L, 43L, 44L, 48L, 49L, 50L, 57L), county_households_tpp_2010.f8 = c(355208, 
    123536, 0, 50467, 0, 0, 2424, 0, 345393, 0, 138895, 406129, 
    0, 31138, 37070, 0, 0), county_households_tpp_2011.f8 = c(354126, 
    122641, 0, 49973, 0, 0, 2362, 0, 365214, 0, 143132, 413115, 
    0, 30966, 36340, 0, 0), county_households_tpp_2012.f8 = c(354044, 
    123118, 0, 50018, 0, 0, 2363, 0, 375399, 0, 149277, 411874, 
    0, 31089, 36140, 0, 0), county_households_tpp_2013.f8 = c(353169, 
    122709, 0, 49794, 0, 0, 2345, 0, 382912, 0, 156550, 414253, 
    0, 30988, 36782, 0, 0), county_households_tpp_2014.f8 = c(358483, 
    122366, 0, 49502, 0, 0, 2326, 0, 385046, 0, 162182, 418427, 
    0, 30902, 36727, 0, 0), county_households_tpp_2015.f8 = c(361320, 
    122856, 0, 49265, 0, 0, 2309, 0, 383106, 0, 167712, 423715, 
    0, 30946, 36751, 0, 0), county_households_tpp_2016.f8 = c(364010, 
    123010, 0, 49051, 0, 0, 2286, 0, 388265, 0, 171458, 427874, 
    0, 30863, 36738, 0, 0), county_households_tpp_2017.f8 = c(367824, 
    123698, 0, 48857, 0, 0, 2281, 0, 388946, 0, 175097, 432592, 
    0, 30811, 36694, 0, 0), county_households_tpp_2018.f8 = c(370694, 
    124838, 0, 48781, 0, 0, 2329, 0, 389217, 0, 177329, 437403, 
    0, 30821, 36766, 0, 0), county_households_tpp_2019.f8 = c(372758, 
    125835, 0, 48614, 0, 0, 2382, 0, 391713, 0, 179043, 441576, 
    0, 30800, 36802, 0, 0), county_households_tpp_2020.f8 = c(375254, 
    126557, 0, 48859, 0, 0, 2386, 0, 394477, 0, 180661, 444561, 
    0, 30801, 36884, 0, 0)), .Names = c("county_id.i8", "county_households_tpp_2010.f8", 
"county_households_tpp_2011.f8", "county_households_tpp_2012.f8", 
"county_households_tpp_2013.f8", "county_households_tpp_2014.f8", 
"county_households_tpp_2015.f8", "county_households_tpp_2016.f8", 
"county_households_tpp_2017.f8", "county_households_tpp_2018.f8", 
"county_households_tpp_2019.f8", "county_households_tpp_2020.f8"
), class = "data.frame", row.names = c(NA, -17L)))

1 个答案:

答案 0 :(得分:1)

# The list of data frames from the question is assumed to be in `my_list`.
library(plyr)
# melt() is not part of plyr; it comes from reshape2, which therefore has to
# be loaded explicitly for the call below to resolve.
library(reshape2)
# llply is plyr's list-to-list counterpart of lapply (see ?llply); the
# function below is applied once per element of my_list.

new_list <- llply(my_list, function(foo) {
  # `foo` holds the contents of one list element at a time (a data.frame in
  # the question's data), so its columns can be referenced directly instead
  # of through nested [[ ]] indices.
  #
  # Reshape wide -> long: column 1 is kept as the id, every other column
  # becomes one (variable, value) pair per row.
  nicer_data <- melt(foo, id.vars = 1)
  # The id column is the county id ("country_id" in the earlier version was
  # a typo).
  names(nicer_data) <- c("county_id", "year", "value")
  # Pull the 4-digit year out by pattern rather than by fixed character
  # positions: the prefix length differs between tables (e.g.
  # "county_employment_pda_2010.f8" vs "county_empres_pda_2010.f8"), so
  # substring(x, 23, 26) only works for some of them. Names that do not
  # match the pattern are left unchanged by sub().
  nicer_data$year <- sub(
    "^.*_([0-9]{4})\\.[^.]*$", "\\1",
    as.character(nicer_data$year)
  )
  nicer_data
})