以下是我的数据集的 dput 输出。我正在尝试填充我的数据集:如果某一年的某个列中存在 NA,则该 NA 应该用另外两年对应列的均值(mean)填充。例如,在下面的数据集中,刚果在 “Economy.2015” 列中包含 NA,因此该 NA 应该用 “Economy.2016” 和 “Economy.2017” 两列的均值填充。
dput 输出
structure(list(Country = c("Angola", "Bosnia and Herzegovina",
"Congo (Kinshasa)", "Greece", "Indonesia", "Iraq", "Sierra Leone",
"Sudan", "Togo"), Region = c("Sub-Saharan Africa", "Central and Eastern Europe",
"Sub-Saharan Africa", "Western Europe", "Southeastern Asia",
"Middle East and Northern Africa", "Sub-Saharan Africa", "Sub-Saharan Africa",
"Sub-Saharan Africa"), Happiness.Rank.2015 = c(137L, 96L, 120L,
102L, 74L, 112L, 123L, 118L, 158L), Happiness.Score.2015 = c(4.033,
4.949, 4.517, 4.857, 5.399, 4.677, 4.507, 4.55, 2.839), Standard.Error.2015 = c(0.04758,
0.06913, 0.0368, 0.05062, 0.02596, 0.05232, 0.07068, 0.0674,
0.06727), Economy.2015 = c(0.75778, 0.83223, NA, 1.15406, 0.82827,
0.98549, 0.33024, 0.52107, 0.20868), Family.2015 = c(0.8604,
0.91916, 1.0012, 0.92933, 1.08708, 0.81889, 0.95571, 1.01404,
0.13995), Health.2015 = c(0.16683, 0.79081, 0.09806, 0.88213,
0.63793, 0.60237, NA, 0.36878, 0.28443), Freedom.2015 = c(0.10384,
0.09245, 0.22605, 0.07699, 0.46611, NA, 0.4084, 0.10081, 0.36453
), Trust.2015 = c(0.07122, 0.00227, 0.07625, 0.01397, NA, 0.13788,
0.08786, 0.1466, 0.10731), Generosity.2015 = c(0.12344, 0.24808,
0.24834, NA, 0.51535, 0.17922, 0.21488, 0.19062, 0.16681), Dystopia.Residual.2015 = c(1.94939,
2.06367, 2.86712, 1.80101, 1.86399, 1.95335, 2.51009, 2.20857,
1.56726), Region.2016 = c("Sub-Saharan Africa", "Central and Eastern Europe",
"Sub-Saharan Africa", "Western Europe", "Southeastern Asia",
"Middle East and Northern Africa", "Sub-Saharan Africa", "Sub-Saharan Africa",
"Sub-Saharan Africa"), Happiness.Rank.2016 = c(141L, 87L, 125L,
99L, 79L, 112L, 111L, 133L, 155L), Happiness.Score.2016 = c(3.866,
5.163, 4.272, 5.033, 5.314, 4.575, 4.635, 4.139, 3.303), Lower.CI.2016 = c(3.753,
5.063, 4.191, 4.935, 5.237, 4.446, 4.505, 3.928, 3.192), Upper.CI.2016 = c(3.979,
5.263, 4.353, 5.131, 5.391, 4.704, 4.765, 4.35, 3.414), Economy.2016 = c(0.84731,
0.93383, 0.05661, 1.24886, 0.95104, 1.07474, 0.36485, 0.63069,
0.28123), Family.2016 = c(0.66366, 0.64367, 0.80676, 0.75473,
0.87625, 0.59205, 0.628, 0.81928, NA), Health.2016 = c(0.04991,
0.70766, 0.188, 0.80029, 0.49374, 0.51076, NA, 0.29759, 0.24811
), Freedom.2016 = c(0.00589, 0.09511, 0.15602, 0.05822, 0.39237,
0.24856, 0.30685, NA, 0.34678), Trust.2016 = c(0.08434, NA, 0.06075,
0.04127, 0.00322, 0.13636, 0.08196, 0.10039, 0.11587), Generosity.2016 = c(0.12071,
0.29889, 0.25458, NA, 0.56521, 0.19589, 0.23897, 0.18077, 0.17517
), Dystopia.Residual.2016 = c(2.09459, 2.48406, 2.74924, 2.12944,
2.03171, 1.81657, 3.01402, 2.10995, 2.1354), Happiness.Rank.2017 = c(140L,
90L, 126L, 87L, 81L, 117L, 106L, 130L, 150L), Happiness.Score.2017 = c(3.79500007629395,
5.18200016021729, 4.28000020980835, 5.22700023651123, 5.26200008392334,
4.49700021743774, 4.70900011062622, 4.13899993896484, 3.49499988555908
), Whisker.high.2017 = c(3.95164193540812, 5.27633568674326,
4.35781083270907, 5.3252461694181, 5.35288859814405, 4.62259140968323,
4.85064333498478, 4.34574716508389, 3.59403811171651), whisker.low.2017 = c(3.63835821717978,
5.08766463369131, 4.20218958690763, 5.12875430360436, 5.17111156970263,
4.37140902519226, 4.56735688626766, 3.9322527128458, 3.39596165940166
), Economy.2017 = c(0.858428180217743, 0.982409417629242, 0.0921023488044739,
1.28948748111725, 0.995538592338562, 1.10271048545837, 0.36842092871666,
0.65951669216156, 0.305444717407227), Family.2017 = c(1.10441195964813,
1.0693359375, 1.22902345657349, 1.23941457271576, 1.27444469928741,
0.978613197803497, 0.984136044979095, 1.21400856971741, 0.431882530450821
), Health.2017 = c(0.0498686656355858, 0.705186307430267, 0.191407024860382,
0.810198903083801, 0.492345720529556, 0.501180469989777, 0.00556475389748812,
0.290920823812485, 0.247105568647385), Freedom.2017 = c(NA, 0.204403176903725,
0.235961347818375, 0.0957312509417534, 0.443323463201523, 0.288555532693863,
0.318697690963745, 0.0149958552792668, 0.38042613863945), Generosity.2017 = c(0.097926490008831,
0.328867495059967, 0.246455833315849, NA, 0.611704587936401,
0.19963726401329, 0.293040901422501, 0.182317450642586, 0.196896150708199
), Trust.2017 = c(0.0697203353047371, NA, 0.0602413564920425,
0.04328977689147, 0.0153171354904771, 0.107215754687786, 0.0710951760411263,
0.089847519993782, 0.0956650152802467), Dystopia.Residual.2017 = c(1.61448240280151,
1.89217257499695, 2.22495865821838, 1.74922156333923, 1.42947697639465,
1.31890726089478, 2.66845989227295, 1.68706583976746, 1.83722925186157
)), class = "data.frame", row.names = c(NA, -9L))
数据框的结构
Country Region Happiness.Rank.2015 Happiness.Score.2015
1 Angola Sub-Saharan Africa 137 4.033
2 Bosnia and Herzegovina Central and Eastern Europe 96 4.949
3 Congo (Kinshasa) Sub-Saharan Africa 120 4.517
4 Greece Western Europe 102 4.857
5 Indonesia Southeastern Asia 74 5.399
6 Iraq Middle East and Northern Africa 112 4.677
7 Sierra Leone Sub-Saharan Africa 123 4.507
8 Sudan Sub-Saharan Africa 118 4.550
9 Togo Sub-Saharan Africa 158 2.839
Standard.Error.2015 Economy.2015 Family.2015 Health.2015 Freedom.2015 Trust.2015 Generosity.2015
1 0.04758 0.75778 0.86040 0.16683 0.10384 0.07122 0.12344
2 0.06913 0.83223 0.91916 0.79081 0.09245 0.00227 0.24808
3 0.03680 NA 1.00120 0.09806 0.22605 0.07625 0.24834
4 0.05062 1.15406 0.92933 0.88213 0.07699 0.01397 NA
5 0.02596 0.82827 1.08708 0.63793 0.46611 NA 0.51535
6 0.05232 0.98549 0.81889 0.60237 NA 0.13788 0.17922
7 0.07068 0.33024 0.95571 NA 0.40840 0.08786 0.21488
8 0.06740 0.52107 1.01404 0.36878 0.10081 0.14660 0.19062
9 0.06727 0.20868 0.13995 0.28443 0.36453 0.10731 0.16681
Dystopia.Residual.2015 Region.2016 Happiness.Rank.2016 Happiness.Score.2016
1 1.94939 Sub-Saharan Africa 141 3.866
2 2.06367 Central and Eastern Europe 87 5.163
3 2.86712 Sub-Saharan Africa 125 4.272
4 1.80101 Western Europe 99 5.033
5 1.86399 Southeastern Asia 79 5.314
6 1.95335 Middle East and Northern Africa 112 4.575
7 2.51009 Sub-Saharan Africa 111 4.635
8 2.20857 Sub-Saharan Africa 133 4.139
9 1.56726 Sub-Saharan Africa 155 3.303
Lower.CI.2016 Upper.CI.2016 Economy.2016 Family.2016 Health.2016 Freedom.2016 Trust.2016
1 3.753 3.979 0.84731 0.66366 0.04991 0.00589 0.08434
2 5.063 5.263 0.93383 0.64367 0.70766 0.09511 NA
3 4.191 4.353 0.05661 0.80676 0.18800 0.15602 0.06075
4 4.935 5.131 1.24886 0.75473 0.80029 0.05822 0.04127
5 5.237 5.391 0.95104 0.87625 0.49374 0.39237 0.00322
6 4.446 4.704 1.07474 0.59205 0.51076 0.24856 0.13636
7 4.505 4.765 0.36485 0.62800 NA 0.30685 0.08196
8 3.928 4.350 0.63069 0.81928 0.29759 NA 0.10039
9 3.192 3.414 0.28123 NA 0.24811 0.34678 0.11587
Generosity.2016 Dystopia.Residual.2016 Happiness.Rank.2017 Happiness.Score.2017 Whisker.high.2017
1 0.12071 2.09459 140 3.795 3.951642
2 0.29889 2.48406 90 5.182 5.276336
3 0.25458 2.74924 126 4.280 4.357811
4 NA 2.12944 87 5.227 5.325246
5 0.56521 2.03171 81 5.262 5.352889
6 0.19589 1.81657 117 4.497 4.622591
7 0.23897 3.01402 106 4.709 4.850643
8 0.18077 2.10995 130 4.139 4.345747
9 0.17517 2.13540 150 3.495 3.594038
whisker.low.2017 Economy.2017 Family.2017 Health.2017 Freedom.2017 Generosity.2017 Trust.2017
1 3.638358 0.85842818 1.1044120 0.049868666 NA 0.09792649 0.06972034
2 5.087665 0.98240942 1.0693359 0.705186307 0.20440318 0.32886750 NA
3 4.202190 0.09210235 1.2290235 0.191407025 0.23596135 0.24645583 0.06024136
4 5.128754 1.28948748 1.2394146 0.810198903 0.09573125 NA 0.04328978
5 5.171112 0.99553859 1.2744447 0.492345721 0.44332346 0.61170459 0.01531714
6 4.371409 1.10271049 0.9786132 0.501180470 0.28855553 0.19963726 0.10721575
7 4.567357 0.36842093 0.9841360 0.005564754 0.31869769 0.29304090 0.07109518
8 3.932253 0.65951669 1.2140086 0.290920824 0.01499586 0.18231745 0.08984752
9 3.395962 0.30544472 0.4318825 0.247105569 0.38042614 0.19689615 0.09566502
Dystopia.Residual.2017
1 1.614482
2 1.892173
3 2.224959
4 1.749222
5 1.429477
6 1.318907
7 2.668460
8 1.687066
9 1.837229
更新#1:我尝试过的事情
我已经使用@RAB建议的代码尝试了apply
函数。它给了我如下警告信息
使用的代码
# NOTE(review): df still holds the character columns (Country, Region, ...),
# so mean() receives non-numeric rows -- presumably the cause of the warning
# quoted below.
dt <- apply(df, 1, mean, na.rm=T)
警告消息
1:在mean.default(newX [,i],...)中: 参数不是数字或逻辑:返回NA
数据框的 str() 输出
'data.frame': 9 obs. of 35 variables:
$ Country : chr "Angola" "Bosnia and Herzegovina" "Congo (Kinshasa)" "Greece" ...
$ Region : chr "Sub-Saharan Africa" "Central and Eastern Europe" "Sub-Saharan Africa" "Western Europe" ...
$ Happiness.Rank.2015 : int 137 96 120 102 74 112 123 118 158
$ Happiness.Score.2015 : num 4.03 4.95 4.52 4.86 5.4 ...
$ Standard.Error.2015 : num 0.0476 0.0691 0.0368 0.0506 0.026 ...
$ Economy.2015 : num 0.758 0.832 NA 1.154 0.828 ...
$ Family.2015 : num 0.86 0.919 1.001 0.929 1.087 ...
$ Health.2015 : num 0.1668 0.7908 0.0981 0.8821 0.6379 ...
$ Freedom.2015 : num 0.1038 0.0925 0.2261 0.077 0.4661 ...
$ Trust.2015 : num 0.07122 0.00227 0.07625 0.01397 NA ...
$ Generosity.2015 : num 0.123 0.248 0.248 NA 0.515 ...
$ Dystopia.Residual.2015: num 1.95 2.06 2.87 1.8 1.86 ...
$ Region.2016 : chr "Sub-Saharan Africa" "Central and Eastern Europe" "Sub-Saharan Africa" "Western Europe" ...
$ Happiness.Rank.2016 : int 141 87 125 99 79 112 111 133 155
$ Happiness.Score.2016 : num 3.87 5.16 4.27 5.03 5.31 ...
$ Lower.CI.2016 : num 3.75 5.06 4.19 4.93 5.24 ...
$ Upper.CI.2016 : num 3.98 5.26 4.35 5.13 5.39 ...
$ Economy.2016 : num 0.8473 0.9338 0.0566 1.2489 0.951 ...
$ Family.2016 : num 0.664 0.644 0.807 0.755 0.876 ...
$ Health.2016 : num 0.0499 0.7077 0.188 0.8003 0.4937 ...
$ Freedom.2016 : num 0.00589 0.09511 0.15602 0.05822 0.39237 ...
$ Trust.2016 : num 0.08434 NA 0.06075 0.04127 0.00322 ...
$ Generosity.2016 : num 0.121 0.299 0.255 NA 0.565 ...
$ Dystopia.Residual.2016: num 2.09 2.48 2.75 2.13 2.03 ...
$ Happiness.Rank.2017 : int 140 90 126 87 81 117 106 130 150
$ Happiness.Score.2017 : num 3.8 5.18 4.28 5.23 5.26 ...
$ Whisker.high.2017 : num 3.95 5.28 4.36 5.33 5.35 ...
$ whisker.low.2017 : num 3.64 5.09 4.2 5.13 5.17 ...
$ Economy.2017 : num 0.8584 0.9824 0.0921 1.2895 0.9955 ...
$ Family.2017 : num 1.1 1.07 1.23 1.24 1.27 ...
$ Health.2017 : num 0.0499 0.7052 0.1914 0.8102 0.4923 ...
$ Freedom.2017 : num NA 0.2044 0.236 0.0957 0.4433 ...
$ Generosity.2017 : num 0.0979 0.3289 0.2465 NA 0.6117 ...
$ Trust.2017 : num 0.0697 NA 0.0602 0.0433 0.0153 ...
$ Dystopia.Residual.2017: num 1.61 1.89 2.22 1.75 1.43 ...
注意:我是R的新手,请提供解释以及代码。
答案 0 :(得分:2)
您的数据必须为数字才能正常工作,因此第1步将仅过滤数字数据(我们稍后将其他数据放回去)
您需要将代码中的 yourdata 替换为您自己的数据框名称
第1步:仅过滤数字
# Keep only the numeric columns of the input data frame.
df <- yourdata[vapply(yourdata, is.numeric, logical(1))]
第2步:计算均值
# Mean of each row across all numeric columns, ignoring missing values.
# rowMeans() is the direct tool for this and avoids the reassignable `T`.
mns <- rowMeans(df, na.rm = TRUE)
第3步:找到NA值的索引
# Row/column position of every NA in df. Storing the positions as a data
# frame makes it easy to extract the row index later.
nas <- as.data.frame(which(is.na(df), arr.ind = TRUE))
第4步:将NA值替换为相应的平均值
# Overwrite each NA cell with the mean of the row it belongs to.
df[which(is.na(df), arr.ind = TRUE)] <- mns[nas$row]
第5步:将非数字列与新列合并
# Put the untouched non-numeric columns back in front of the filled ones.
new_df <- cbind(yourdata[!vapply(yourdata, is.numeric, logical(1))], df)
编辑:
我闲着无聊,所以这里给您写了一个函数
#' Fill missing values with the row-wise mean of related columns
#'
#' For every pattern in `groups`, the numeric columns whose names match the
#' pattern are treated as one group (e.g. "Economy" matches `Economy.2015`,
#' `Economy.2016` and `Economy.2017`). Each `NA` in such a column is replaced
#' by the mean of the non-missing values found in the same row of the group.
#'
#' @param df A data frame. Non-numeric columns are never modified.
#' @param groups Character vector of regular expressions (passed to
#'   `grep()`), one per column group.
#' @return `df` with the same columns in the same order and the missing
#'   values of the matched numeric columns filled in. A value stays `NA`
#'   when its row has no observed value in the group to average.
replace_missing <- function(df, groups) {
  numeric_cols <- names(df)[vapply(df, is.numeric, logical(1))]
  filled <- df
  # A column matched by an earlier group is not filled again by a later one.
  pending <- numeric_cols

  for (pattern in groups) {
    members <- grep(pattern, numeric_cols, value = TRUE)
    if (length(members) == 0L) {
      next
    }
    # drop = FALSE keeps a data frame even when only one column matches.
    row_means <- rowMeans(df[, members, drop = FALSE], na.rm = TRUE)
    for (col in intersect(members, pending)) {
      # rowMeans() yields NaN for rows without any observed value; those
      # gaps are left as NA instead of being overwritten with NaN.
      gaps <- is.na(filled[[col]]) & !is.nan(row_means)
      if (any(gaps)) {
        filled[[col]][gaps] <- row_means[gaps]
      }
    }
    pending <- setdiff(pending, members)
  }

  filled
}
# Fill NAs separately within each listed variable group; each entry of
# `groups` is matched against the column names with grep().
new_data <- replace_missing(yourdata, groups = c("Happiness.Rank", "Happiness.Score",
"Family", "Economy"))
您可以在 groups 参数中添加任意数量的分组。
答案 1 :(得分:1)
这是一个相当直接的 tidyverse 解决方案;关键是先将数据从宽格式重塑为长格式,“适当地”替换 NA 值,然后再将数据转换回宽格式。最后我给出了一些解释,但我鼓励您逐行执行代码以了解每一步的作用。
library(tidyverse)
# Impute each NA with the mean of the same variable over the other years.
df.new <- df %>%
  # Wide -> long: one row per Country / column name; id columns are kept.
  pivot_longer(
    cols = -c(Country, Region, Region.2016),
    names_to = "key",
    values_to = "val"
  ) %>%
  # Split e.g. "Economy.2015" at the dot before the year into
  # what = "Economy" and when = "2015"; `key` is kept for the reshape back.
  separate(key, c("what", "when"), sep = "\\.(?=\\d)", remove = FALSE) %>%
  group_by(Country, what) %>%
  # Within one country and variable, replace NA by the mean of the years
  # that are present.
  mutate(val = replace(val, is.na(val), mean(val, na.rm = TRUE))) %>%
  ungroup() %>%
  select(-what, -when) %>%
  # Long -> wide; names_sort = TRUE keeps the alphabetical column order
  # that spread() used to produce.
  pivot_wider(names_from = key, values_from = val, names_sort = TRUE)
df.new
## A tibble: 9 x 35
# Country Region Region.2016 Dystopia.Residu… Dystopia.Residu… Dystopia.Residu…
# <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 Angola Sub-S… Sub-Sahara… 1.95 2.09 1.61
#2 Bosnia… Centr… Central an… 2.06 2.48 1.89
#3 Congo … Sub-S… Sub-Sahara… 2.87 2.75 2.22
#4 Greece Weste… Western Eu… 1.80 2.13 1.75
#5 Indone… South… Southeaste… 1.86 2.03 1.43
#6 Iraq Middl… Middle Eas… 1.95 1.82 1.32
#7 Sierra… Sub-S… Sub-Sahara… 2.51 3.01 2.67
#8 Sudan Sub-S… Sub-Sahara… 2.21 2.11 1.69
#9 Togo Sub-S… Sub-Sahara… 1.57 2.14 1.84
## ... with 29 more variables: Economy.2015 <dbl>, Economy.2016 <dbl>,
## Economy.2017 <dbl>, Family.2015 <dbl>, Family.2016 <dbl>,
## Family.2017 <dbl>, Freedom.2015 <dbl>, Freedom.2016 <dbl>,
## Freedom.2017 <dbl>, Generosity.2015 <dbl>, Generosity.2016 <dbl>,
## Generosity.2017 <dbl>, Happiness.Rank.2015 <dbl>,
## Happiness.Rank.2016 <dbl>, Happiness.Rank.2017 <dbl>,
## Happiness.Score.2015 <dbl>, Happiness.Score.2016 <dbl>,
## Happiness.Score.2017 <dbl>, Health.2015 <dbl>, Health.2016 <dbl>,
## Health.2017 <dbl>, Lower.CI.2016 <dbl>, Standard.Error.2015 <dbl>,
## Trust.2015 <dbl>, Trust.2016 <dbl>, Trust.2017 <dbl>, Upper.CI.2016 <dbl>,
## Whisker.high.2017 <dbl>, whisker.low.2017 <dbl>
说明:
1. 将数据从宽格式转换为长格式,保持 Country、Region 和 Region.2016 不变。所有其他列名放入新列 key 中,对应的值放入 val 中。
2. 将 key 中类似 "Happiness.Score.2016" 的条目拆分为 "Happiness.Score"(列 what)和 "2016"(列 when)。
3. 按 Country 和 what 对条目进行分组。
4. 在每个 Country 和 what 分组内,将 NA 替换为所有年份的平均值。
5. 取消分组(ungroup),并删除之前生成的 what 和 when 列。
6. 将数据从长格式转换回宽格式。
请注意,实际上,将数据保留为长格式可能要容易得多(并且更符合“整洁”数据的原则);但这只是我的意见。
让我们检查一下Country == "Congo"
# Economy columns of the Congo row after imputation.
df.new %>%
  filter(str_detect(Country, "Congo")) %>%
  select(contains("Economy"))
## A tibble: 1 x 3
# Economy.2015 Economy.2016 Economy.2017
# <dbl> <dbl> <dbl>
#1 0.0744 0.0566 0.0921
并与原始数据进行比较
# Economy columns of the Congo row in the original data.
df %>%
  filter(str_detect(Country, "Congo")) %>%
  select(contains("Economy"))
# Economy.2015 Economy.2016 Economy.2017
#1 NA 0.05661 0.09210235
这里0.0744 = 1/2 * (0.05661 + 0.09210235)
。