我有这样的数据
# Example data: two records, each with a name (sname) and a sequence
# string (sence). Both columns are factors, as in the original dput() output.
df <- local({
  record_names <- c(
    "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1",
    "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2"
  )
  record_seqs <- c(
    "MGSSNYATTATVHAVRTSTNSNCWCHDNAVVASASTWWTYSGWMYRVWYAVNHSTSSYRKVTWHWASMAGSAVRAKVGDWRSWGYVVCVYVRVRKSRRSNSNASSAVSTSCVSNRAMKGTTHYDTS",
    "MVKTTYYDVGVKNATKKAYRKAKYHDKNNGKKSAYVSDAKKRYDKGGAKGGAGGGGSMDDMGGGGRMRRRGKNVVHSVTDYNGATRKAKNVCDKCGRGGKKGAVCCNCRGTGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"
  )
  data.frame(
    # Level order is the reverse of row order, so the integer codes are 2:1.
    sname = factor(record_names, levels = rev(record_names)),
    sence = factor(record_seqs, levels = record_seqs)
  )
})
我想把第二列中的每个字符串按固定的字母数切分成若干段,但没有成功
library("plyr")
# OP's attempt (reported as not working): nchar() and substr() are given the
# whole df$sence column, which is a factor, rather than one string at a time.
laply(seq(1, nchar(df$sence), 2), function(i) substr(df$sence, i, i+1))
我想得到的结果大致如下,例如每 10 个字母切成一段:
sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1
MGSSNYATTA
TVHAVRTSTN
SNCWCHDNAV
VASASTWWTY
SGWMYRVWYA
VNHSTSSYRK
VTWHWASMAG
SAVRAKVGDW
RSWGYVVCVY
VRVRKSRRSN
SNASSAVSTS
CVSNRAMKGT
THYDTS
sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2
MVKTTYYDVG
VKNATKKAYR
KAKYHDKNNG
KKSAYVSDAK
KRYDKGGAKG
GAGGGGSMDD
MGGGGRMRRR
GKNVVHSVTD
YNGATRKAKN
VCDKCGRGGK
KGAVCCNCRG
TGMRHGGMVS
VCMCVVDDNR
RRHYNGAYDD
HHRGGVCTS
答案 0 :(得分:1)
对于每个 sence,我们可以创建一个从 1 到 nchar 的序列(步长为 10),并用 substring 逐段截取
# Split every sequence into consecutive chunks of 10 characters.
# df$sence is a factor, so it is converted with as.character() first:
# nchar() does not accept factors.
lapply(as.character(df$sence), function(x) {
  # max(..., 1L) keeps seq() valid for an empty string (which yields "").
  starts <- seq(1L, max(nchar(x), 1L), by = 10L)
  # substring() is vectorised over its start/stop arguments, so no inner
  # loop is needed; the last chunk is simply shorter when nchar(x) is not
  # a multiple of 10.
  substring(x, starts, starts + 9L)
})
#[[1]]
# [1] "MGSSNYATTA" "TVHAVRTSTN" "SNCWCHDNAV" "VASASTWWTY" "SGWMYRVWYA" "VNHSTSSYRK"
# "VTWHWASMAG" "SAVRAKVGDW" "RSWGYVVCVY" "VRVRKSRRSN" "SNASSAVSTS"
# "CVSNRAMKGT" "THYDTS"
#[[2]]
# [1] "MVKTTYYDVG" "VKNATKKAYR" "KAKYHDKNNG" "KKSAYVSDAK" "KRYDKGGAKG" "GAGGGGSMDD"
# "MGGGGRMRRR" "GKNVVHSVTD" "YNGATRKAKN" "VCDKCGRGGK" "KGAVCCNCRG"
# "TGMRHGGMVS" "VCMCVVDDNR" "RRHYNGAYDD" "HHRGGVCTS"
要获取OP格式的输出,我们可以使用mapply
# Build a one-column data frame in which each name is followed by the
# 10-character chunks of its sequence.
# Both columns are factors, so they are converted to character up front:
# nchar() does not accept factors, and c() on a factor would not return the
# label text.
data.frame(V1 = unlist(
  Map(
    function(seq_chr, name_chr) {
      # max(..., 1L) keeps seq() valid for an empty sequence string.
      starts <- seq(1L, max(nchar(seq_chr), 1L), by = 10L)
      c(name_chr, substring(seq_chr, starts, starts + 9L))
    },
    as.character(df$sence),
    as.character(df$sname)
  ),
  # Drop the names Map() derives from its first argument so they do not
  # become row names of the data frame.
  use.names = FALSE
))
# V1
#1 sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1
#2 MGSSNYATTA
#3 TVHAVRTSTN
#4 SNCWCHDNAV
#5 VASASTWWTY
#6 SGWMYRVWYA
#7 VNHSTSSYRK
#8 VTWHWASMAG
#....
答案 1 :(得分:1)
我们可以使用strsplit
# Split each sequence after every 10 characters (zero-width lookbehind) and
# label each resulting character vector with its record name.
setNames(
  object = strsplit(
    as.character(df$sence),
    split = "(?<=.{10})",
    perl = TRUE
  ),
  nm = as.character(df$sname)
)
#$`sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1`
# [1] "MGSSNYATTA" "TVHAVRTSTN" "SNCWCHDNAV" "VASASTWWTY" "SGWMYRVWYA" "VNHSTSSYRK" "VTWHWASMAG" "SAVRAKVGDW" "RSWGYVVCVY"
#[10] "VRVRKSRRSN" "SNASSAVSTS" "CVSNRAMKGT" "THYDTS"
#$`sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2`
# [1] "MVKTTYYDVG" "VKNATKKAYR" "KAKYHDKNNG" "KKSAYVSDAK" "KRYDKGGAKG" "GAGGGGSMDD" "MGGGGRMRRR" "GKNVVHSVTD" "YNGATRKAKN"
#[10] "VCDKCGRGGK" "KGAVCCNCRG" "TGMRHGGMVS" "VCMCVVDDNR" "RRHYNGAYDD" "HHRGGVCTS"
或者,要得到与 OP 期望相同格式的输出:
# Chunk width and the matching zero-width lookbehind pattern.
n <- 10
pat <- sprintf("(?<=.{%s})", n)

# Pair each name with the chunks of its sequence, then flatten everything
# into a single column.
out <- data.frame(
  col = unlist(
    mapply(
      c,
      as.character(df$sname),
      strsplit(as.character(df$sence), pat, perl = TRUE),
      SIMPLIFY = FALSE
    )
  )
)
# Discard the row names inherited from the names of the flattened vector.
rownames(out) <- NULL
head(out)
# col
#1 sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1
#2 MGSSNYATTA
#3 TVHAVRTSTN
#4 SNCWCHDNAV
#5 VASASTWWTY
#6 SGWMYRVWYA
或者使用 tidyverse:
library(tidyverse)
# Split each sequence into chunks of up to 10 characters, prepend the record
# name to its chunks, and expand the list column into one row per element.
# ".{1,10}" is used instead of ".{10}" so that the shorter trailing chunk
# (e.g. "THYDTS") is kept rather than silently dropped.
df %>%
  mutate(sence = str_extract_all(as.character(sence), ".{1,10}")) %>%
  transmute(sence = map2(as.character(sname), sence, c)) %>%
  unnest(sence)