如何将字符串按固定的字符数切分成多段

时间:2019-04-19 02:16:29

标签: r

我有这样的数据

# Example data: two rows, each with a header string (sname) and a letter
# sequence (sence). Both columns are factors, as in the original dput() output.
# sname levels are given explicitly so that row 1 has code 2 and row 2 has
# code 1; sence relies on the default sorted levels ("MG..." before "MV...").
df <- data.frame(
  sname = factor(
    c("sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2"),
    levels = c("sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2",
               "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1")
  ),
  sence = factor(c(
    "MGSSNYATTATVHAVRTSTNSNCWCHDNAVVASASTWWTYSGWMYRVWYAVNHSTSSYRKVTWHWASMAGSAVRAKVGDWRSWGYVVCVYVRVRKSRRSNSNASSAVSTSCVSNRAMKGTTHYDTS",
    "MVKTTYYDVGVKNATKKAYRKAKYHDKNNGKKSAYVSDAKKRYDKGGAKGGAGGGGSMDDMGGGGRMRRRGKNVVHSVTDYNGATRKAKNVCDKCGRGGKKGAVCCNCRGTGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"
  ))
)

我想把第二列中的字符串按固定的字母数切分成若干段,但没有成功

# The asker's original attempt, kept verbatim; it does not produce the wanted
# output for several reasons:
#   * df$sence is a factor (see the dput() data), and nchar() is documented to
#     raise an error when given a factor.
#   * Even as character, nchar() would return one length per row, while seq()
#     requires a single `to` value.
#   * The step of 2 with substr(..., i, i+1) extracts 2-character pieces, not
#     the 10-character pieces shown in the desired output.
library("plyr")
laply(seq(1, nchar(df$sence), 2), function(i) substr(df$sence, i, i+1))

我想要的结果大致如下,例如每10个字母切成一段

sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1  
MGSSNYATTA
TVHAVRTSTN
SNCWCHDNAV
VASASTWWTY
SGWMYRVWYA
VNHSTSSYRK
VTWHWASMAG
SAVRAKVGDW
RSWGYVVCVY
VRVRKSRRSN
SNASSAVSTS
CVSNRAMKGT
THYDTS
sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2  
MVKTTYYDVG
VKNATKKAYR
KAKYHDKNNG
KKSAYVSDAK
KRYDKGGAKG
GAGGGGSMDD
MGGGGRMRRR
GKNVVHSVTD
YNGATRKAKN
VCDKCGRGGK
KGAVCCNCRG
TGMRHGGMVS
VCMCVVDDNR
RRHYNGAYDD
HHRGGVCTS

2 个答案:

答案 0 :(得分:1)

对于每个sence,我们可以创建一个从1到nchar(x)、步长为10的起始位置序列,再用substring逐段截取

# Split every sequence into consecutive 10-character pieces; returns an
# unnamed list with one character vector per row of df.
# df$sence is a factor and nchar() errors on factors, so convert it first.
lapply(as.character(df$sence), function(x) {
  # Start offsets 1, 11, 21, ...; max(., 1) keeps seq() valid for "".
  starts <- seq(1, max(nchar(x), 1), by = 10)
  # substring() is vectorised over start/stop and truncates at the string end,
  # so the last piece may be shorter than 10 characters.
  substring(x, starts, starts + 9)
})


#[[1]]
# [1] "MGSSNYATTA" "TVHAVRTSTN" "SNCWCHDNAV" "VASASTWWTY" "SGWMYRVWYA" "VNHSTSSYRK" 
#     "VTWHWASMAG" "SAVRAKVGDW" "RSWGYVVCVY" "VRVRKSRRSN" "SNASSAVSTS" 
#     "CVSNRAMKGT" "THYDTS"    

#[[2]]
# [1] "MVKTTYYDVG" "VKNATKKAYR" "KAKYHDKNNG" "KKSAYVSDAK" "KRYDKGGAKG" "GAGGGGSMDD" 
#     "MGGGGRMRRR" "GKNVVHSVTD"  "YNGATRKAKN" "VCDKCGRGGK" "KGAVCCNCRG" 
#    "TGMRHGGMVS"   "VCMCVVDDNR" "RRHYNGAYDD" "HHRGGVCTS" 

要获取OP格式的输出,我们可以使用mapply

# Build a one-column data frame in which each header (sname) is followed by
# the 10-character pieces of its sequence (sence).
# Both columns are factors: convert them to character so that nchar() works
# and c() combines the header label rather than its integer code.
data.frame(V1 = unlist(mapply(function(p, q) {
    # Start offsets 1, 11, 21, ...; max(., 1) keeps seq() valid for "".
    starts <- seq(1, max(nchar(p), 1), by = 10)
    c(q, substring(p, starts, starts + 9))
  },
  as.character(df$sence), as.character(df$sname),
  # SIMPLIFY = FALSE always yields a list, even when all rows produce the same
  # number of pieces; use.names = FALSE keeps the default 1..n row names.
  SIMPLIFY = FALSE), use.names = FALSE))


#                                                      V1
#1              sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1
#2                                              MGSSNYATTA
#3                                              TVHAVRTSTN
#4                                              SNCWCHDNAV
#5                                              VASASTWWTY
#6                                              SGWMYRVWYA
#7                                              VNHSTSSYRK
#8                                              VTWHWASMAG
#....

答案 1 :(得分:1)

我们可以使用strsplit

# Split each sequence after every 10th character (zero-width look-behind,
# hence perl = TRUE) and label each list element with its header.
local({
  pieces <- strsplit(as.character(df$sence), "(?<=.{10})", perl = TRUE)
  names(pieces) <- as.character(df$sname)
  pieces
})
#$`sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1`
# [1] "MGSSNYATTA" "TVHAVRTSTN" "SNCWCHDNAV" "VASASTWWTY" "SGWMYRVWYA" "VNHSTSSYRK" "VTWHWASMAG" "SAVRAKVGDW" "RSWGYVVCVY"
#[10] "VRVRKSRRSN" "SNASSAVSTS" "CVSNRAMKGT" "THYDTS"    

#$`sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2`
# [1] "MVKTTYYDVG" "VKNATKKAYR" "KAKYHDKNNG" "KKSAYVSDAK" "KRYDKGGAKG" "GAGGGGSMDD" "MGGGGRMRRR" "GKNVVHSVTD" "YNGATRKAKN"
#[10] "VCDKCGRGGK" "KGAVCCNCRG" "TGMRHGGMVS" "VCMCVVDDNR" "RRHYNGAYDD" "HHRGGVCTS" 

或者得到与OP期望格式相同的输出

# Chunk width, and the look-behind pattern that splits after every n characters.
n <- 10
pat <- sprintf("(?<=.{%d})", n)
# Interleave each header with the pieces of its sequence, then flatten the
# result into a single column.
out <- data.frame(
  col = unlist(mapply(
    c,
    as.character(df$sname),
    strsplit(as.character(df$sence), pat, perl = TRUE),
    SIMPLIFY = FALSE
  ))
)
# Discard the row names derived from the list names; use 1..n instead.
rownames(out) <- NULL
head(out)
#                                         col
#1 sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1
#2                                 MGSSNYATTA
#3                                 TVHAVRTSTN
#4                                 SNCWCHDNAV
#5                                 VASASTWWTY
#6                                 SGWMYRVWYA

或者使用tidyverse得到相同的结果

library(tidyverse)
# Extract the sequence pieces, prepend each row's header, then unnest into one
# long column.
# The pattern is ".{1,10}" rather than ".{10}": the latter only matches full
# 10-character runs and would silently drop a shorter final piece.
df %>%
    mutate(sence = str_extract_all(as.character(sence), ".{1,10}")) %>%
    transmute(sence = map2(as.character(sname), sence, c)) %>%
    # Name the list-column explicitly; current tidyr expects `cols`.
    unnest(cols = sence)