数据集包含数百个列,列名类似于“drop.loc1.genom1.tret1.gwas2.a”。我需要删除除 loc1 和 tret1 之外的所有内容,使列名变成“loc1.tret1”。任何提示或帮助将不胜感激,谢谢!
答案 0 :(得分:3)
另一种选择是使用 strsplit():
# Split every name on literal dots and keep only the 2nd and 4th fields
# (the "loc..." and "tret..." parts), re-joined with a dot.
# vapply() pins the result type to character, so an empty `strings`
# yields character(0) rather than the empty list sapply() would return.
vapply(
  strsplit(strings, "\\."),
  function(x) paste0(x[c(2, 4)], collapse = "."),
  character(1)
)
[1] "loc1.tret1" "loc2.tret2" "loc100.tret100"
示例数据(取自 Manuel Bickel 的回答):
# Sample column names used as input by the snippet in this answer.
strings <- c(
  "drop.loc1.genom1.tret1.gwas2.a",
  "drop.loc2.genom1.tret2.gwas2.a",
  "drop.loc100.genom1.tret100.gwas2.a"
)
答案 1 :(得分:2)
您可以尝试类似的方法。
更新:我已在代码中加入了目前所有建议方案的基准测试。如果 @Onyambu 把他的方案作为答案发布,您应该接受该答案,因为这种方法(下面的 f2b)是最快的。
# Sample column names to clean up.
strings <- c(
  "drop.loc1.genom1.tret1.gwas2.a",
  "drop.loc2.genom1.tret2.gwas2.a",
  "drop.loc100.genom1.tret100.gwas2.a"
)

# Capture groups 2 and 4 hold the "loc<digits>" and "tret<digits>" fields;
# the replacement rebuilds each name from just those two groups.
# `perl = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
gsub(
  "(^.*\\.)(loc\\d+)(\\..*\\.)(tret\\d+)(\\..*$)",
  "\\2.\\4",
  strings,
  perl = TRUE
)
[1] "loc1.tret1" "loc2.tret2" "loc100.tret100"
# Keep only the 2nd and 4th dot-separated fields of each string.
#
# strings: character vector of dot-delimited names, e.g.
#   "drop.loc1.genom1.tret1.gwas2.a".
# Returns an unnamed character vector with one element per input,
# e.g. "loc1.tret1". A zero-length input gives character(0).
f1 <- function(strings) {
  # strsplit() is vectorised, so all names are split in a single call.
  fields <- strsplit(strings, "\\.")
  # vapply() fixes the result type to character; sapply() would hand back
  # an empty list for empty input.
  vapply(
    fields,
    function(x) paste0(x[c(2, 4)], collapse = "."),
    character(1),
    USE.NAMES = FALSE
  )
}
# Extract "<loc field>.<tret field>" with a fully anchored regex.
#
# strings: character vector of names shaped like
#   "<prefix>.loc<digits>.<middle>.tret<digits>.<suffix>".
# Returns a character vector of the same length; elements that do not
# match the pattern are returned unchanged.
f2 <- function(strings) {
  # Groups 2 and 4 capture the loc and tret fields. `perl = TRUE` is written
  # out because `T` is a plain variable that user code may overwrite.
  gsub(
    "(^.*\\.)(loc\\d+)(\\..*\\.)(tret\\d+)(\\..*$)",
    "\\2.\\4",
    strings,
    perl = TRUE
  )
}
# Reduce each name to "<loc field>.<tret field>" using one sub() call.
#
# strings: character vector of names containing "loc<digits>" followed
#   later by "tret<digits>".
# Returns a character vector of the same length; elements without a match
# come back unchanged.
f2b <- function(strings) {
  loc_tret_pattern <- ".*(loc\\d+).*(tret\\d+).*"
  joined_groups <- "\\1.\\2"
  sub(
    pattern = loc_tret_pattern,
    replacement = joined_groups,
    x = strings
  )
}
# Time the three implementations against the same `strings` input.
microbenchmark::microbenchmark(
  f1(strings), f2(strings), f2b(strings)
)
# Recorded output of one run:
# Unit: microseconds
#          expr    min      lq      mean median      uq      max neval
#   f1(strings) 58.818 64.1475 136.31964 68.687 76.1880 5691.106   100
#   f2(strings) 78.161 79.9380 106.08183 83.293 88.6215 2110.333   100
#  f2b(strings) 27.238 29.6070  53.29592 32.765 35.1330 1872.299   100
答案 2 :(得分:0)
您可以使用 Onyambu 在 Manuel Bickel 答案的评论中给出的正则表达式,配合 dplyr::rename_all() 或 dplyr::select_all() 以及 gsub():
library(dplyr)
# sample data
# tibble() is used instead of data_frame(), which is deprecated and emits a
# warning; dplyr re-exports tibble(), so library(dplyr) is still sufficient.
df <- tibble(
  drop.loc1.genom1.tret1.gwas2.a = 1:2,
  drop.loc23.genom2.tret2.gwas2.a = 3:4,
  drop.loc3.genom3.tret34.gwas3.a = 5:6
)

# The formula is applied to the vector of column names; the regex keeps only
# the "loc<digits>" and "tret<digits>" parts of each name.
# both rename_all and select_all give the same results:
# NOTE(review): both verbs are superseded in dplyr >= 1.0.0;
# rename_with() is the current equivalent.
df %>%
  rename_all(~ gsub(".*(loc\\d+).*(tret\\d+).*", "\\1.\\2", .))
df %>%
  select_all(~ gsub(".*(loc\\d+).*(tret\\d+).*", "\\1.\\2", .))
# A tibble: 2 x 3
loc1.tret1 loc23.tret2 loc3.tret34
<int> <int> <int>
1 1 3 5
2 2 4 6