我想转换数据框中的最后一列:
2L 7800161 2L_7800161_SNP G A 999 PASS REFCOUNT=198;ALTCOUNT=5
2L 7800182 2L_7800182_SNP C T 999 PASS REFCOUNT=174;ALTCOUNT=21
2L 7800202 2L_7800202_SNP C G 999 PASS REFCOUNT=152;ALTCOUNT=36
2L 7800231 2L_7800231_SNP C T 999 PASS REFCOUNT=193;ALTCOUNT=8
2L 7800235 2L_7800235_SNP A G 999 PASS REFCOUNT=199;ALTCOUNT=2
转换成类似下面这样:
2L 7800161 2L_7800161_SNP G A 999 PASS 198 5
2L 7800182 2L_7800182_SNP C T 999 PASS 174 21
2L 7800202 2L_7800202_SNP C G 999 PASS 152 36
2L 7800231 2L_7800231_SNP C T 999 PASS 193 8
2L 7800235 2L_7800235_SNP A G 999 PASS 199 2
有人可以帮帮我吗?
答案 0 :(得分:2)
我们可以使用str_extract_all 从最后一列中提取数字部分。输出是一个list,我们将其中每个元素转换为numeric,再用rbind 按行合并,然后创建两个'新'列。如果需要,我们可以将原来的那一列赋值为NULL 以将其删除。
library(stringr)

# Split the last column (e.g. "REFCOUNT=198;ALTCOUNT=5") into numeric columns.
# Remember the source column by name before any columns are added, so the
# column dropped afterwards is always the one that was parsed.
src_col <- names(df1)[ncol(df1)]
# `[[` yields a plain vector for both data.frames and tibbles.
counts <- str_extract_all(df1[[src_col]], "\\d+")
# One matrix row per input row; assumes every value holds exactly two numbers.
df1[paste0("new", 1:2)] <- do.call(rbind, lapply(counts, as.numeric))
df1[[src_col]] <- NULL
df1
# v1 v2 v3 v4 v5 v6 v7 new1 new2
#1 2L 7800161 2L_7800161_SNP G A 999 PASS 198 5
#2 2L 7800182 2L_7800182_SNP C T 999 PASS 174 21
#3 2L 7800202 2L_7800202_SNP C G 999 PASS 152 36
#4 2L 7800231 2L_7800231_SNP C T 999 PASS 193 8
#5 2L 7800235 2L_7800235_SNP A G 999 PASS 199 2
另一个选项是来自library(tidyr) 的extract。我们选择要转换的列,在into 中指定新列名,并使用带有捕获组的正则表达式(即括号内的部分)进行提取。我们还可以使用convert=TRUE 将输出列的类更改为整数。
library(tidyr)

# Split v8 into new1/new2: each parenthesised group in the regex captures one
# run of digits, and convert = TRUE asks tidyr to convert the captured text.
extract(
  data = df1,
  col = v8,
  into = c("new1", "new2"),
  regex = "\\D*(\\d+)\\D*(\\d+).*",
  convert = TRUE
)
答案 1 :(得分:2)
使用data.table
:
# library() stops with an error when data.table is not installed; require()
# would only return FALSE and let the next line fail less clearly.
library(data.table) # v1.9.6+
# Pull every run of digits out of V8 (one character vector per row), then
# transpose() that list so each extracted position becomes one column.
# The new columns are character; wrap transpose(..) in
# lapply(.., as.integer) to store them as integers instead.
dt[, c("col1", "col2") := transpose(regmatches(V8, gregexpr("\\d+", V8)))]
如果需要,您可以通过执行lapply(transpose(..), as.integer) 将列转换为integer。
# Sample data for the data.table answer, parsed from an inline string.
# The text has no header row; the answer's code refers to the last column as
# V8, i.e. it relies on fread's automatic V1..V8 column names.
dt <- fread("2L 7800161 2L_7800161_SNP G A 999 PASS REFCOUNT=198;ALTCOUNT=5
2L 7800182 2L_7800182_SNP C T 999 PASS REFCOUNT=174;ALTCOUNT=21
2L 7800202 2L_7800202_SNP C G 999 PASS REFCOUNT=152;ALTCOUNT=36
2L 7800231 2L_7800231_SNP C T 999 PASS REFCOUNT=193;ALTCOUNT=8
2L 7800235 2L_7800235_SNP A G 999 PASS REFCOUNT=199;ALTCOUNT=2
")