我有一个数据框,所有列的类型都是字符型,每个单元格包含一个形如“0/1”的分数。我想把这些列转换为数值,但某些单元格的值是“0/0”,R 似乎无法处理。我尝试了以下方法:
# Attempted conversion: as.numeric() is called on the whole data frame (a list),
# which fails with the coercion error quoted right after this line.
df2 <- as.numeric(df)
结果出现以下错误:
Error: (list) object cannot be coerced to type 'double'
我没有找到解释如何把包含分数“0/0”的字符转换为数值 0 的帖子。我意识到 R 报错的原因在于除以零。我处理的是遗传数据,把数据转换为数值后全部相加,比执行某种替换操作要容易得多。实际的数据框有数千万行、500 多列。
这是示例数据帧
# Example input: a data.frame with 10 rows and 10 columns named `GEN[n].GT`.
# Every cell is a character string of the form "a/b"; only "0/0", "0/1" and
# "1/1" occur in this sample.
df <- structure(list(`GEN[5].GT` = c("0/1", "0/0", "0/0", "0/0",
"0/1", "0/0", "0/0", "1/1", "0/0", "0/0"), `GEN[1].GT` = c("0/0",
"0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0"
), `GEN[6].GT` = c("1/1", "0/0", "0/0", "0/0", "0/0", "0/0",
"0/1", "0/0", "0/0", "0/0"), `GEN[9].GT` = c("0/0", "0/0",
"0/0", "0/0", "0/1", "0/0", "0/0", "0/1", "0/0", "0/0"), `GEN[89].GT` = c("0/0",
"0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0"
), `GEN[453].GT` = c("0/0", "0/0", "0/1", "0/0", "0/0", "0/0",
"0/0", "0/0", "0/0", "0/0"), `GEN[554].GT` = c("0/0", "0/0",
"0/0", "0/0", "0/0", "0/0", "1/1", "0/0", "0/0", "0/0"), `GEN[9864].GT` = c("0/0",
"0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0", "0/0"
), `GEN[1234].GT` = c("1/1", "0/0", "0/0", "0/0", "0/0", "0/0",
"0/0", "0/0", "0/0", "0/0"), `GEN[3333].GT` = c("0/0", "0/0",
"0/0", "0/0", "0/0", "1/1", "0/0", "0/1", "0/0", "0/0")), row.names = c(NA,
10L), class = "data.frame")
# Expected output: same shape and column names as `df`, with each cell mapped
# "0/0" -> "0", "0/1" -> "0.5", "1/1" -> "1".
# NOTE(review): the values below are still character strings, not numerics;
# presumably numeric columns are the real goal -- confirm with the asker.
df2 <- structure(list(`GEN[5].GT` = c("0.5", "0", "0", "0",
"0.5", "0", "0", "1", "0", "0"), `GEN[1].GT` = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0"
), `GEN[6].GT` = c("1", "0", "0", "0", "0", "0",
"0.5", "0", "0", "0"), `GEN[9].GT` = c("0", "0",
"0", "0", "0.5", "0", "0", "0.5", "0", "0"), `GEN[89].GT` = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0"
), `GEN[453].GT` = c("0", "0", "0.5", "0", "0", "0",
"0", "0", "0", "0"), `GEN[554].GT` = c("0", "0",
"0", "0", "0", "0", "1", "0", "0", "0"), `GEN[9864].GT` = c("0",
"0", "0", "0", "0", "0", "0", "0", "0", "0"
), `GEN[1234].GT` = c("1", "0", "0", "0", "0", "0",
"0", "0", "0", "0"), `GEN[3333].GT` = c("0", "0",
"0", "0", "0", "1", "0", "0.5", "0", "0")), row.names = c(NA,
10L), class = "data.frame")
答案 0 :(得分:2)
我们可以先创建一个行名列(使用 `tibble` 中的 `rownames_to_column`),然后用 `separate_rows` 在分隔符(`/`)处拆分每一列,同时通过 `convert` 自动转换类型;最后按 “rn” 分组,求每一列的 `mean`。
library(dplyr)
library(tibble)
library(tidyr)

# Convert every "a/b" string to the mean of its two parts:
#   1. keep the row identity in a helper column `rn`,
#   2. split each cell on "/" into two rows (convert = TRUE makes them numeric),
#   3. average the two rows that belong to the same original row.
df %>%
  rownames_to_column('rn') %>%
  separate_rows(-1, convert = TRUE) %>%
  # `rn` is character, so grouping on it directly orders the groups
  # lexicographically ("1", "10", "2", ...) and returns the rows shuffled.
  # A factor whose levels follow first appearance keeps the input row order.
  mutate(rn = factor(rn, levels = unique(rn))) %>%
  group_by(rn) %>%
  summarise_all(mean) %>%
  select(-rn)
# A tibble: 10 x 10
# `GEN[5].GT` `GEN[1].GT` `GEN[6].GT` `GEN[9].GT` `GEN[89].GT` `GEN[453].GT` `GEN[554].GT` `GEN[9864].GT` `GEN[1234].GT` `GEN[3333].GT`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.5 0 1 0 0 0 0 0 1 0
# 2 0 0 0 0 0 0 0 0 0 0
# 3 0 0 0 0 0 0.5 0 0 0 0
# 4 0 0 0 0 0 0 0 0 0 0
# 5 0.5 0 0 0.5 0 0 0 0 0 0
# 6 0 0 0 0 0 0 0 0 0 1
# 7 0 0 0.5 0 0 0 1 0 0 0
# 8 1 0 0 0.5 0 0 0 0 0 0.5
# 9 0 0 0 0 0 0 0 0 0 0
#10 0 0 0 0 0 0 0 0 0 0
或者采用 @IceCreamToucan 在评论中提到的另一种方法:用 `strsplit` 拆分每个单元格,再对拆分结果取 `mean`
library(purrr)

# For every column: split each cell on "/" and average the numeric pieces,
# producing one double per cell.
df %>%
  mutate_all(function(cells) {
    pieces <- strsplit(cells, '/')
    map_dbl(pieces, function(parts) mean(as.numeric(parts)))
  })
或者另一种可能更高效的方法(使用 base R):构造一个命名向量,通过名称匹配来替换值
# Lookup table: genotype string -> numeric value (mean of the two sides).
nm1 <- setNames(c(0, 0.5, 0.5, 1), c("0/0", "1/0", "0/1", "1/1"))
# Replace every column by table lookup; `df[] <-` keeps the data.frame
# structure and column names. Character indexing returns a *named* vector,
# so unname() drops those names: otherwise each column would carry a full
# character `names` attribute (wasteful on very large data). Strings that are
# not in `nm1` become NA.
df[] <- lapply(df, function(x) unname(nm1[x]))
df
# GEN[5].GT GEN[1].GT GEN[6].GT GEN[9].GT GEN[89].GT GEN[453].GT GEN[554].GT GEN[9864].GT GEN[1234].GT GEN[3333].GT
#1 0.5 0 1.0 0.0 0 0.0 0 0 1 0.0
#2 0.0 0 0.0 0.0 0 0.0 0 0 0 0.0
#3 0.0 0 0.0 0.0 0 0.5 0 0 0 0.0
#4 0.0 0 0.0 0.0 0 0.0 0 0 0 0.0
#5 0.5 0 0.0 0.5 0 0.0 0 0 0 0.0
#6 0.0 0 0.0 0.0 0 0.0 0 0 0 1.0
#7 0.0 0 0.5 0.0 0 0.0 1 0 0 0.0
#8 1.0 0 0.0 0.5 0 0.0 0 0 0 0.5
#9 0.0 0 0.0 0.0 0 0.0 0 0 0 0.0
#10 0.0 0 0.0 0.0 0 0.0 0 0 0 0.0
答案 1 :(得分:2)
我们可以使用 `gsub` 捕获 `/` 两边的内容并把它们放在括号内,将 `/` 替换为 `+`,然后除以 2。