我的数据如下:
[[1]]
date germany france germany_mean france_mean germany_sd france_sd
1 2016-01-01 17 25 21.29429 48.57103 30.03026 47.05169
我想做的是使用 `map` 对列表中的每个元素进行以下计算:
germany_calc = (germany - germany_mean) / germany_sd
france_calc = (france - france_mean) / france_sd
但是列数可以更改-这里有两个类别/国家,但在另一个列表中可以是1或3或N。国家/地区始终遵循相同的结构。也就是说,
"country1", "country2", ... , "countryN", "country1_mean", "country2_mean", ... , "countryN_mean", "country1_sd", "country2_sd", ... , "countryN_sd".
预期的输出(对于第一个列表):
Germany: -0.1429988 = (17 - 21.29429) / 30.03026
France: -0.5009603 = (25 - 48.57103) / 47.05169
编辑:抱歉-预期输出:
-0.1429988
-0.5009603
功能:
# Standardize a numeric vector: subtract its mean and divide by its standard
# deviation. Missing values are ignored when computing both statistics, but
# stay NA in the result.
#
# x: numeric vector.
# Returns a numeric vector the same length as x.
Scale_Me <- function(x) {
  centre <- mean(x, na.rm = TRUE)
  spread <- sd(x, na.rm = TRUE)
  (x - centre) / spread
}
数据:
# Example input (dput output): a list of six one-row data frames. Each data
# frame holds a Date column `date`, the raw values `germany` and `france`,
# and the matching `<country>_mean` and `<country>_sd` columns.
my_list <- list(structure(list(date = structure(16801, class = "Date"),
germany = 17, france = 25, germany_mean = 21.2942922374429,
france_mean = 48.5710301846855, germany_sd = 30.030258443028,
france_sd = 47.0516928425878), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16802, class = "Date"),
germany = 9, france = 29, germany_mean = 21.2993150684932,
france_mean = 48.5605316914534, germany_sd = 30.0286190461173,
france_sd = 47.0543871206842), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16803, class = "Date"),
germany = 8, france = 18, germany_mean = 21.2947488584475,
france_mean = 48.551889593794, germany_sd = 30.0297291333284,
france_sd = 47.0562416513092), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16804, class = "Date"),
germany = 3, france = 11, germany_mean = 21.2778538812785,
france_mean = 48.5382545766386, germany_sd = 30.0267943793948,
france_sd = 47.0607680244109), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16805, class = "Date"),
germany = 4, france = 13, germany_mean = 21.2614155251142,
france_mean = 48.5214531240057, germany_sd = 30.0269420596686,
france_sd = 47.0676011750263), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16806, class = "Date"),
germany = 4, france = 9, germany_mean = 21.253196347032,
france_mean = 48.5055948249362, germany_sd = 30.0292032528186,
france_sd = 47.0737183354519), class = "data.frame", row.names = c(NA,
-1L)))
答案 0 :(得分:5)
为什么不直接用 `rbind` 把这些数据框合并起来呢?
# Stack every one-row data frame in my_list into a single data frame, then
# z-score each country column against its own *_mean and *_sd columns.
# The value is a numeric matrix with one column per country.
local({
  stacked <- do.call(rbind, my_list)
  cbind(
    germany = (stacked$germany - stacked$germany_mean) / stacked$germany_sd,
    france = (stacked$france - stacked$france_mean) / stacked$france_sd
  )
})
# germany france
# [1,] -0.1429988 -0.5009603
# [2,] -0.4095864 -0.4157005
# [3,] -0.4427196 -0.6492633
# [4,] -0.6087181 -0.7976550
# [5,] -0.5748642 -0.7546901
# [6,] -0.5745473 -0.8392283
答案 1 :(得分:4)
该问题尚不清楚确切的输出形式,因此我们假设所需的是一个数据框,该数据框具有用于日期的列和用于将国家值标准化的每个国家的列。在这种情况下,这意味着我们需要在输出中包含3列。
1)pivot_longer / pivot_wider:将 `my_list` 的各个列表组件按行绑定在一起,得到一个每个组件占一行的数据框。然后,对于列名中每个不带后缀的国家名称,在其后附加 `_root`,使除 `date` 以外的所有列名都采用 `country_suffix` 的形式。接着转换为长格式,执行标准化,再转换回宽格式:
library(dplyr)
library(tidyr)
library(purrr)
# Standardize every country column via a long-format round trip.
my_list %>%
# Stack the one-row data frames into a single data frame.
bind_rows %>%
# Keep the first name (date) as is; append "_root" to every remaining name
# that contains no underscore, so all non-date names read <country>_<suffix>.
set_names(names(.)[1], sub("^([^_]*)$", "\\1_root", names(.)[-1])) %>%
# Split each name at "_": the first part becomes the `country` column and the
# second part (root / mean / sd) becomes its own value column.
pivot_longer(-date, names_to = c("country", ".value"), names_sep = "_") %>%
# Overwrite the raw value with its standardized score.
mutate(root = (root - mean) / sd) %>%
# Back to wide format: one row per date, one column per country.
pivot_wider(id_cols = "date", names_from = "country", values_from = "root")
给予:
# A tibble: 6 x 3
date germany france
<date> <dbl> <dbl>
1 2016-01-01 -0.143 -0.501
2 2016-01-02 -0.410 -0.416
3 2016-01-03 -0.443 -0.649
4 2016-01-04 -0.609 -0.798
5 2016-01-05 -0.575 -0.755
6 2016-01-06 -0.575 -0.839
2)Base R
将列表各组件按行绑定在一起得到 `d`,然后挑出国家名 `nms`:除第一列(`date`)之外,它们是仅有的不含下划线的列名。接着执行标准化,并用 `cbind` 把 `date` 列拼接到结果前面。
# Stack the one-row data frames into one data frame.
d <- do.call("rbind", my_list)
# Country columns are the names without an underscore, minus the first such
# name (the date column).
nms <- names(d)[!grepl("_", names(d))][-1]
# Standardize all country columns at once and put the date column in front.
cbind(
  d[1],
  (d[nms] - d[paste0(nms, "_mean")]) / d[paste0(nms, "_sd")]
)
给予:
date germany france
1 2016-01-01 -0.1429988 -0.5009603
2 2016-01-02 -0.4095864 -0.4157005
3 2016-01-03 -0.4427196 -0.6492633
4 2016-01-04 -0.6087181 -0.7976550
5 2016-01-05 -0.5748642 -0.7546901
6 2016-01-06 -0.5745473 -0.8392283
答案 2 :(得分:4)
我们也可以使用 base R 中的 `transform`:
# Stack the one-row data frames, overwrite the germany and france columns with
# their standardized values, and keep only the date and country columns.
local({
  standardized <- transform(
    do.call(rbind, my_list),
    germany = (germany - germany_mean) / germany_sd,
    france = (france - france_mean) / france_sd
  )
  standardized[c("date", "germany", "france")]
})
# date germany france
#1 2016-01-01 -0.1429988 -0.5009603
#2 2016-01-02 -0.4095864 -0.4157005
#3 2016-01-03 -0.4427196 -0.6492633
#4 2016-01-04 -0.6087181 -0.7976550
#5 2016-01-05 -0.5748642 -0.7546901
#6 2016-01-06 -0.5745473 -0.8392283
或者使用 `dplyr`,无需任何重塑(reshape)即可做到:
library(dplyr)
# Stack the one-row data frames and keep only the date plus the standardized
# germany and france columns.
transmute(
  bind_rows(my_list),
  date,
  germany = (germany - germany_mean) / germany_sd,
  france = (france - france_mean) / france_sd
)
答案 3 :(得分:3)
您一定要使用 `map` 吗?
在这里,我使用两个 `for` 循环来代替 `map`:
# For every data frame in my_list, build one sentence per country showing the
# standardized value together with the numbers it was computed from, e.g.
# "germany: -0.143 = (17 - 21.29429) / 30.03026".
# Result_list ends up as a list with one character vector per data frame.
Result_list <- vector("list", length(my_list))
# seq_along() keeps the loop from running when my_list is empty
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(my_list)) {
  df <- my_list[[i]]
  # Identify the countries: every column named "<country>_mean" marks one.
  # The pattern is anchored so a name that merely contains "mean" is ignored.
  countries <- sub("_mean$", "", grep("_mean$", colnames(df), value = TRUE))
  # Preallocate instead of growing the vector with c() on every iteration.
  df_result <- character(length(countries))
  for (j in seq_along(countries)) {
    country <- countries[j]
    value_country <- df[1, country]
    mean_country <- df[1, paste0(country, "_mean")]
    sd_country <- df[1, paste0(country, "_sd")]
    result_country <- (value_country - mean_country) / sd_country
    Sentence <- paste0(country,": ",round(result_country,5)," = (",value_country," - ",round(mean_country,5),") / ",round(sd_country,5))
    df_result[j] <- Sentence
  }
  Result_list[[i]] <- df_result
}
`Result_list` 的输出如下:
> Result_list
[[1]]
[1] "germany: -0.143 = (17 - 21.29429) / 30.03026"
[2] "france: -0.50096 = (25 - 48.57103) / 47.05169"
[[2]]
[1] "germany: -0.40959 = (9 - 21.29932) / 30.02862"
[2] "france: -0.4157 = (29 - 48.56053) / 47.05439"
[[3]]
[1] "germany: -0.44272 = (8 - 21.29475) / 30.02973"
[2] "france: -0.64926 = (18 - 48.55189) / 47.05624"
[[4]]
[1] "germany: -0.60872 = (3 - 21.27785) / 30.02679"
[2] "france: -0.79765 = (11 - 48.53825) / 47.06077"
[[5]]
[1] "germany: -0.57486 = (4 - 21.26142) / 30.02694"
[2] "france: -0.75469 = (13 - 48.52145) / 47.0676"
[[6]]
[1] "germany: -0.57455 = (4 - 21.2532) / 30.0292"
[2] "france: -0.83923 = (9 - 48.50559) / 47.07372"
是您要找的东西吗?
编辑:仅提取结果
要仅提取结果值,可以执行以下操作:
# Collect only the standardized values: one row per (data frame, country)
# pair, in list order. Df_result_value ends up as a data frame with the
# columns "Country" and "Result".
# Rows are gathered in a list and bound once at the end, instead of calling
# rbind() on a growing matrix inside the loop.
result_rows <- list()
# seq_along() keeps the loops from running on empty input
# (1:length(x) would iterate over c(1, 0)).
for (i in seq_along(my_list)) {
  df <- my_list[[i]]
  # Identify the countries: every column named "<country>_mean" marks one.
  # The pattern is anchored so a name that merely contains "mean" is ignored.
  countries <- sub("_mean$", "", grep("_mean$", colnames(df), value = TRUE))
  for (j in seq_along(countries)) {
    country <- countries[j]
    value_country <- df[1, country]
    mean_country <- df[1, paste0(country, "_mean")]
    sd_country <- df[1, paste0(country, "_sd")]
    result_country <- (value_country - mean_country) / sd_country
    # c() coerces the number to character, so Result is stored as text,
    # exactly as in the printed output.
    result_rows[[length(result_rows) + 1L]] <- c(country, result_country)
  }
}
Df_result_value <- do.call(rbind, result_rows)
if (is.null(Df_result_value)) {
  # Nothing collected: fall back to an empty two-column matrix so the column
  # names below can still be assigned.
  Df_result_value <- matrix(character(0), ncol = 2)
}
Df_result_value <- data.frame(Df_result_value)
colnames(Df_result_value) <- c("Country", "Result")
并获得以下输出:
> Df_result_value
Country Result
1 germany -0.142998843835787
2 france -0.500960300483614
3 germany -0.409586436512588
4 france -0.415700488060442
5 germany -0.442719572974515
6 france -0.649263275639099
7 germany -0.608718121899195
8 france -0.797654950237258
9 germany -0.574864249939699
10 france -0.754690110335453
11 germany -0.574547256608035
12 france -0.839228262008441