我有这样的数据
# Sample input: a two-row data frame with a label column (`sname`) and a
# sequence column (`sence`); both columns are stored as factors.
df<- structure(list(sname = structure(2:1, .Label = c("Carrot", "Melon"
), class = "factor"), sence = structure(1:2, .Label = c("RSNSNASSAVSTSCVSNRAMKGTTHYDTS",
"TGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
让我们看看第一行
Melon RSNSNASSAVSTSCVSNRAMKGTTHYDTS
我希望能够用不同的窗口宽度和不同的移动步长把字符串切成片段。例如,假设窗口宽度为10、每次移动1个字母,那么第一个输出将像这样
RSNSNASSAV
所以这个是字母1,2,3,4,5,6,7,8,9,10
第二个窗口向前移动1个字母,同样截取10个字母
SNSNASSAVS
所以这是字母2,3,4,5,6,7,8,9,10,11 直到最后。
请求的输出如下所示
# Expected result supplied with the question: a 53-row data frame with two
# factor columns, `position` (comma-separated character indices, or the label
# on header rows) and `name` (the window substring, or the label).
# NOTE(review): the position labels "11,12,13,14,15,16,17,18,20" and
# "6,7,8,9,10,11,12,14,15" each appear to skip an index (19 and 13) --
# likely typos in the hand-written expected output; confirm before using this
# object as a comparison target.
output<- structure(list(position = structure(c(33L, 1L, 12L, 23L, 26L,
27L, 28L, 29L, 30L, 31L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 13L, 32L, 1L, 12L, 23L, 26L, 27L, 28L, 29L, 30L, 31L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 24L, 25L), .Label = c("1,2,3,4,5,6,7,8,9,10",
"10,11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,20",
"12,13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22",
"14,15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24",
"16,17,18,19,20,21,22,23,24,25", "17,18,19,20,21,22,23,24,25,26",
"18,19,20,21,22,23,24,25,26,27", "19,20,21,22,23,24,25,26,27,28",
"2,3,4,5,6,7,8,9,10,11", "20,21,22,23,24,25,26,27,28,29", "21,22,23,24,25,26,27,28,29,30",
"22,23,24,25,26,27,28,29,30,31", "23,24,25,26,27,28,29,30,31,32",
"24,25,26,27,28,29,30,31,32,33", "25,26,27,28,29,30,31,32,33,34",
"26,27,28,29,30,31,32,33,34,35", "27,28,29,30,31,32,33,34,35,36",
"28,29,30,31,32,33,34,35,36,37", "29,30,31,32,33,34,35,36,37,38",
"3,4,5,6,7,8,9,10,11,12", "30,31,32,33,34,35,36,37,38,39", "31,32,33,34,35,36,37,38,39,40",
"4,5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,14,15",
"7,8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17",
"9,10,11,12,13,14,15,16,17,18", "Carrot", "Melon"), class = "factor"),
name = structure(c(20L, 32L, 37L, 26L, 35L, 35L, 2L, 38L,
33L, 3L, 46L, 39L, 42L, 34L, 7L, 45L, 36L, 24L, 27L, 1L,
21L, 5L, 41L, 15L, 22L, 28L, 17L, 14L, 16L, 23L, 47L, 40L,
43L, 6L, 19L, 8L, 19L, 8L, 48L, 44L, 10L, 12L, 25L, 31L,
30L, 29L, 18L, 50L, 13L, 4L, 49L, 9L, 11L), .Label = c("AMKGTTHYDT",
"ASSAVSTSCV", "AVSTSCVSNR", "AYDDHHRGGV", "Carrot", "CMCVVDDNRR",
"CVSNRAMKGT", "CVVDDNRRRH", "DDHHRGGVCT", "DDNRRRHYNG", "DHHRGGVCTS",
"DNRRRHYNGA", "GAYDDHHRGG", "GGMVSVCMCV", "GMRHGGMVSV", "GMVSVCMCVV",
"HGGMVSVCMC", "HYNGAYDDHH", "MCVVDDNRRR", "Melon", "MKGTTHYDTS",
"MRHGGMVSVC", "MVSVCMCVVD", "NRAMKGTTHY", "NRRRHYNGAY", "NSNASSAVST",
"RAMKGTTHYD", "RHGGMVSVCM", "RHYNGAYDDH", "RRHYNGAYDD", "RRRHYNGAYD",
"RSNSNASSAV", "SAVSTSCVSN", "SCVSNRAMKG", "SNASSAVSTS", "SNRAMKGTTH",
"SNSNASSAVS", "SSAVSTSCVS", "STSCVSNRAM", "SVCMCVVDDN", "TGMRHGGMVS",
"TSCVSNRAMK", "VCMCVVDDNR", "VDDNRRRHYN", "VSNRAMKGTT", "VSTSCVSNRA",
"VSVCMCVVDD", "VVDDNRRRHY", "YDDHHRGGVC", "YNGAYDDHHR"), class = "factor")), class = "data.frame", row.names = c(NA,
-53L))
如果每次移动2个字母,则结果如下
RSNSNASSAV
NSNASSAVST
NASSAVSTSC
SSAVSTSCVS
AVSTSCVSNR
STSCVSNRAM
SCVSNRAMKG
VSNRAMKGTT
NRAMKGTTHY
AMKGTTHYDT
KGTTHYDTS
答案 0 :(得分:1)
我们先将factor列转换为character,然后用transmute生成一个包含"position"和"name"两列的tibble:使用map,根据窗口宽度'n'和'sence'的字符数(nchar)用substring截取各个子串,并把'sname'作为第一个(first)元素拼接在前面,最后对得到的list输出执行unnest,得到两列的数据集。
函数定义如下:
library(tidyverse)
# Cut every sequence in `dat` into fixed-width sliding windows.
#
# Arguments:
#   dat  Data frame with a label column `sname` and a sequence column `sence`
#        (factor or character; both are coerced to character).
#   n    Window width in characters.
#   mv   Step between consecutive window starts (default 1).
#
# Returns a two-column table (`position`, `name`). Each input row contributes
# a header row holding its label in both columns, followed by one row per
# full-width window: `position` is the comma-separated character indices and
# `name` is the substring. A trailing window shorter than `n` is not emitted.
# Sequences that are NA or shorter than `n` contribute only their header row
# instead of raising an error.
f1 <- function(dat, n, mv = 1) {
  stopifnot(n >= 1, mv >= 1)

  # Build the header row plus all windows for one (sequence, label) pair.
  windows_for <- function(sequence, label) {
    last_start <- nchar(sequence) - n + 1
    # No full window fits: return the header alone. substring() rejects
    # zero-length index vectors, so this case must not reach it.
    if (is.na(last_start) || last_start < 1) {
      return(tibble(position = label, name = label))
    }
    starts <- seq(1, last_start, by = mv)
    ends <- starts + n - 1
    tibble(
      position = c(label, map2_chr(starts, ends, function(from, to) {
        str_c(seq(from, to), collapse = ",")
      })),
      name = c(label, substring(sequence, starts, ends))
    )
  }

  dat %>%
    transmute(
      out = map2(as.character(sence), as.character(sname), windows_for)
    ) %>%
    # Name the list column explicitly; a bare unnest() warns on tidyr >= 1.0.
    unnest(out)
}
-测试:每次移动1个字母(mv = 1)
# Slide a 10-character window over every sequence in df, one character at a
# time; the printed result is reproduced in the comments below.
f1(df, n = 10, mv = 1)
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 2,3,4,5,6,7,8,9,10,11 SNSNASSAVS
#4 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#5 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#6 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#7 6,7,8,9,10,11,12,13,14,15 ASSAVSTSCV
#8 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#9 8,9,10,11,12,13,14,15,16,17 SAVSTSCVSN
#10 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#11 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#12 11,12,13,14,15,16,17,18,19,20 STSCVSNRAM
#13 12,13,14,15,16,17,18,19,20,21 TSCVSNRAMK
#14 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#15 14,15,16,17,18,19,20,21,22,23 CVSNRAMKGT
#16 15,16,17,18,19,20,21,22,23,24 VSNRAMKGTT
#17 16,17,18,19,20,21,22,23,24,25 SNRAMKGTTH
#18 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
#19 18,19,20,21,22,23,24,25,26,27 RAMKGTTHYD
#20 19,20,21,22,23,24,25,26,27,28 AMKGTTHYDT
#21 20,21,22,23,24,25,26,27,28,29 MKGTTHYDTS
#22 Carrot Carrot
#23 1,2,3,4,5,6,7,8,9,10 TGMRHGGMVS
#24 2,3,4,5,6,7,8,9,10,11 GMRHGGMVSV
#25 3,4,5,6,7,8,9,10,11,12 MRHGGMVSVC
#26 4,5,6,7,8,9,10,11,12,13 RHGGMVSVCM
#27 5,6,7,8,9,10,11,12,13,14 HGGMVSVCMC
#28 6,7,8,9,10,11,12,13,14,15 GGMVSVCMCV
#29 7,8,9,10,11,12,13,14,15,16 GMVSVCMCVV
#30 8,9,10,11,12,13,14,15,16,17 MVSVCMCVVD
#31 9,10,11,12,13,14,15,16,17,18 VSVCMCVVDD
#32 10,11,12,13,14,15,16,17,18,19 SVCMCVVDDN
#33 11,12,13,14,15,16,17,18,19,20 VCMCVVDDNR
#34 12,13,14,15,16,17,18,19,20,21 CMCVVDDNRR
#35 13,14,15,16,17,18,19,20,21,22 MCVVDDNRRR
#36 14,15,16,17,18,19,20,21,22,23 CVVDDNRRRH
#37 15,16,17,18,19,20,21,22,23,24 VVDDNRRRHY
#38 16,17,18,19,20,21,22,23,24,25 VDDNRRRHYN
#39 17,18,19,20,21,22,23,24,25,26 DDNRRRHYNG
#40 18,19,20,21,22,23,24,25,26,27 DNRRRHYNGA
#41 19,20,21,22,23,24,25,26,27,28 NRRRHYNGAY
#42 20,21,22,23,24,25,26,27,28,29 RRRHYNGAYD
#43 21,22,23,24,25,26,27,28,29,30 RRHYNGAYDD
#44 22,23,24,25,26,27,28,29,30,31 RHYNGAYDDH
#45 23,24,25,26,27,28,29,30,31,32 HYNGAYDDHH
#46 24,25,26,27,28,29,30,31,32,33 YNGAYDDHHR
#47 25,26,27,28,29,30,31,32,33,34 NGAYDDHHRG
#48 26,27,28,29,30,31,32,33,34,35 GAYDDHHRGG
#49 27,28,29,30,31,32,33,34,35,36 AYDDHHRGGV
#50 28,29,30,31,32,33,34,35,36,37 YDDHHRGGVC
#51 29,30,31,32,33,34,35,36,37,38 DDHHRGGVCT
#52 30,31,32,33,34,35,36,37,38,39 DHHRGGVCTS
-测试:每次移动2个字母(mv = 2)
# Same 10-character window, but advancing two characters per step; only the
# first six rows are shown.
f1(df, n = 10, mv = 2) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#4 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#5 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#6 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
答案 1 :(得分:1)
library('tidyverse')
# Build the sliding-window table for one labelled string.
#
# Arguments:
#   string  A single character string to cut into windows.
#   len     Window width in characters.
#   label   Value written into both columns of the leading header row.
#
# Returns a tibble with columns `position` (the window's character indices
# joined by ", ") and `name` (the window substring). The first row is the
# header row carrying `label`; one row per full-width window follows, moving
# one character at a time. If `string` is NA or shorter than `len`, only the
# header row is returned.
make_substrings <- function(string, len, label) {
  # Header row carrying the label in both columns.
  header <- tibble(position = label, name = label)

  n_windows <- nchar(string) - len + 1
  # Guard the short-string case: 1:(n_windows) would count backwards and
  # produce bogus rows when n_windows < 1.
  if (is.na(n_windows) || n_windows < 1) {
    return(header)
  }

  # Start and end index of every window.
  starts <- seq_len(n_windows)
  ends <- starts + len - 1L

  # Index list for each window, e.g. "1, 2, 3".
  position <- map2_chr(starts, ends, function(from, to) {
    paste(from:to, collapse = ", ")
  })
  # substring() is vectorized over the start/end indices.
  name <- substring(string, starts, ends)

  bind_rows(header, tibble(position, name))
}
# The sample data stores its columns as factors; coerce every column to
# character in place before slicing.
df[] <- lapply(df, as.character)
# Build one window table per row of df (window width 10) and stack the
# per-row tables into a single result.
output <- bind_rows(
  Map(make_substrings, string = df$sence, label = df$sname, len = 10)
)