以下是我正在使用的数据:
library(RCurl)
# dplyr supplies the %>% pipe used below; without attaching it the pipeline
# stops with 'could not find function "%>%"'.
library(dplyr)

# Download the raw CSV text, then parse it into a data frame.
x <- getURL("https://raw.githubusercontent.com/dothemathonthatone/maps/master/main_test.csv")
maindf <- read.csv(text = x)

# Keep only the columns used in the fee-averaging step.
maindf_1 <- maindf %>%
  dplyr::select(
    year, regional_schlüssel, age_group, fee_per_inc, fert_total,
    daily_hours, low_fee, middle_fee, high_fee
  )

# Preview the first rows.
head(maindf_1)
year regional_schlüssel fee_per_inc fert_total daily_hours low_fee middle_fee high_fee
2006 12246436188 0.000000000 0.02905331 8 1 0 0
2006 12246436188 0.002770760 0.02905331 8 1 0 0
2006 12246436188 0.003857333 0.02905331 8 1 0 0
2006 12246436188 0.004237633 0.02905331 8 0 1 0
2006 12246436188 0.004482112 0.02905331 8 0 1 0
2006 12246436188 0.005085077 0.02905331 8 0 1 0
为了准备面板回归所需的数据,我想按最后三列(low_fee、middle_fee、high_fee)分组,
对第 4 列 fee_per_inc 中的非零值求平均值;例如:
year regional_schlüssel age_group fee_per_inc fert_total daily_hours low_fee middle_fee high_fee
2006 12246436188 -8 0.000000000 0.02905331 8 .003314047 0 0
2006 12246436188 -8 0.002770760 0.02905331 8 .003314047 0 0
2006 12246436188 -8 0.003857333 0.02905331 8 .003314047 0 0
2006 12246436188 -8 0.004237633 0.02905331 8 0 .004601607 0
2006 12246436188 -8 0.004482112 0.02905331 8 0 .004601607 0
2006 12246436188 -8 0.005085077 0.02905331 8 0 .004601607 0
然后去掉最后三列中的零:
year regional_schlüssel age_group fee_per_inc fert_total daily_hours low_fee middle_fee high_fee
2006 12246436188 -8 0.000000000 0.02905331 8 .003314047 .004601607 0
2006 12246436188 -8 0.002770760 0.02905331 8 .003314047 .004601607 0
2006 12246436188 -8 0.003857333 0.02905331 8 .003314047 .004601607 0
不过在此示例中,最后一列(high_fee)仍为零。
完成此操作后,我就可以删除 fee_per_inc 列
和多余的行,然后进行面板回归。
答案 0 :(得分:2)
我只是个新手,不过也许可以这样做?
# For each combination of the low/middle/high fee indicator columns, compute
# the mean of the non-zero fee_per_inc values and store it in the matching
# *_fee_avg column (NA where that indicator is zero).
# NOTE(review): grouping uses only the three indicator columns, so the mean is
# pooled across every regional_schlüssel -- confirm this is intended.
maindf_2 <- maindf_1 %>%
  # Turn zeros into NA so that na.rm = TRUE leaves them out of the mean.
  mutate(fee_per_inc = ifelse(fee_per_inc == 0, NA_real_, fee_per_inc)) %>%
  group_by(low_fee, middle_fee, high_fee) %>%
  mutate(
    low_fee_avg = ifelse(
      low_fee != 0, mean(fee_per_inc, na.rm = TRUE), NA_real_
    ),
    mid_fee_avg = ifelse(
      middle_fee != 0, mean(fee_per_inc, na.rm = TRUE), NA_real_
    ),
    high_fee_avg = ifelse(
      high_fee != 0, mean(fee_per_inc, na.rm = TRUE), NA_real_
    )
  ) %>%
  ungroup() %>%
  # Drop the original indicator columns; the *_fee_avg columns replace them.
  select(-ends_with("_fee"))
答案 1 :(得分:1)
一种方法是将数据重塑为长格式,删除值为 0 的行,然后按 regional_schlüssel
以及连续相同的列名(用 data.table::rleid(name) 标识)分组。
接着对每组中非零的 fee_per_inc 值取平均(mean),
再将数据转换回宽格式并删除不需要的列。
library(dplyr)
library(tidyr)
# Replace each fee indicator with the mean of the non-zero fee_per_inc values
# in its run of rows, then return to one column per fee level.
maindf_1 %>%
  # Long format: one row per (original row, fee column).
  pivot_longer(
    cols = ends_with("fee"),
    names_to = "name",
    values_to = "value"
  ) %>%
  # Keep only the fee column that is switched on for each row.
  filter(value != 0) %>%
  # A group is a run of consecutive rows sharing the same fee column
  # within one regional_schlüssel.
  group_by(regional_schlüssel, grp = data.table::rleid(name)) %>%
  mutate(
    value = mean(fee_per_inc[fee_per_inc != 0], na.rm = TRUE),
    # Row index within the group keeps rows distinct when widening.
    row = row_number()
  ) %>%
  # Back to wide format; fee columns absent for a row are filled with 0.
  pivot_wider(
    names_from = name,
    values_from = value,
    values_fill = list(value = 0)
  ) %>%
  ungroup() %>%
  # Drop the helper columns and the raw fee_per_inc values.
  select(-c(grp, fee_per_inc, row))
# year regional_schlüssel age_group fert_total daily_hours low_fee middle_fee high_fee
# <int> <dbl> <fct> <dbl> <fct> <dbl> <dbl> <dbl>
# 1 2006 12246436188. -8 0.0291 8 0.00331 0 0
# 2 2006 12246436188. -8 0.0291 8 0.00331 0 0
# 3 2006 12246436188. -8 0.0291 8 0.00331 0 0
# 4 2006 12246436188. -8 0.0291 8 0 0.00460 0
# 5 2006 12246436188. -8 0.0291 8 0 0.00460 0
# 6 2006 12246436188. -8 0.0291 8 0 0.00460 0
# 7 2006 12246436188. -8 0.0291 8 0.00197 0 0
# 8 2006 12246436188. -8 0.0291 8 0.00197 0 0
# 9 2006 12246436188. -8 0.0291 8 0.00197 0 0
#10 2006 12246436188. -8 0.0291 8 0 0.00308 0
# … with 9,907 more rows