我有一个数据集,其中一列是字符串:第1个字符是表示星期几的数字,后面跟着任意数量的10位数字块:
# A tibble: 7 x 3
respid record_type record_data
<chr> <chr> <chr>
1 20163911123050111 6 1000456561200035759120000989800
2 20163911123050111 6 2000405161200031719120000999900
3 20163911123050111 6 30004071212000320212200032832220003545620
4 20163911123050111 6 40004051612000326272200033032220003545620
5 20163911123050111 6 5036803031200040404120004051812000434361200045556120003575910
6 20163911123050111 6 6000411161200031720120003283121000344462100035759120004707410
7 20163911123050111 6 70004111312000314261200043334120004535610
我想通过以下方式将其转换为长格式:
1. 将第3列拆分为一个长度为1的固定块(星期几),以及其后一系列长度为10的字符块
2.从宽到长
例如,上面的第一行将变为3行:
respid record_type dayofweek chunk
<chr> <chr> <chr> <chr>
1 20163911123050111 6 1 0004565612
2 20163911123050111 6 1 0003575912
3 20163911123050111 6 1 0000989800
到目前为止,我用下面的代码完成了第一步,但它用的是循环:
# Build one list entry per row of mydf: the two id columns, the leading
# day-of-week digit, and the rest of record_data cut into 10-character chunks.
# The list is preallocated instead of being grown inside the loop.
my_list <- vector("list", nrow(mydf))
# seq_len() gives an empty sequence for a 0-row input, whereas 1:0 would
# iterate over c(1, 0).
for (i in seq_len(nrow(mydf))) {
  # `[[` extracts a plain vector even when mydf is a tibble; mydf[i, 3] would
  # hand a 1x1 tibble to stringi.
  record <- mydf[[3]][i]
  my_list[[i]] <- list(
    respid = mydf[[1]][i],
    record_type = mydf[[2]][i],
    # First character is the day of the week (read from mydf, the same data
    # frame as every other field).
    dayofweek = stringi::stri_sub(record, 1, 1),
    # Chunks start at positions 2, 12, 22, ... and are 10 characters long.
    chunk = stringi::stri_sub(
      record,
      seq(2, stringi::stri_length(record), by = 10),
      length = 10
    )
  )
}
有没有办法用 purrr::map 和 tidyr::unnest 之类的函数来实现?
答案 0 :(得分:2)
方法是先把 record_data 的第1个字符提取为 dayofweek。然后在剩余部分中每10个字符之后添加一个分隔符(例如 `,`),这样 record_data 就可以交给 tidyr::separate_rows 处理。
library(tidyverse)
df %>%
  mutate(
    # The leading digit encodes the day of the week
    dayofweek = substr(record_data, 1, 1),
    # Drop that digit, then put a comma after each complete 10-digit block
    record_data = gsub("(\\d{10})", "\\1,", substring(record_data, 2)),
    # The final block received a comma as well; strip it
    record_data = sub(",$", "", record_data)
  ) %>%
  # One output row per comma-separated block
  separate_rows(record_data)
# respid record_type dayofweek record_data
# 1 20163911123050112 6 1 0004565612
# 2 20163911123050112 6 1 0003575912
# 3 20163911123050112 6 1 0000989800
# 4 20163911123050112 6 2 0004051612
# 5 20163911123050112 6 2 0003171912
# 6 20163911123050112 6 2 0000999900
# 7 20163911123050112 6 3 0004071212
# 8 20163911123050112 6 3 0003202122
# 9 20163911123050112 6 3 0003283222
# 10 20163911123050112 6 3 0003545620
# 11 20163911123050112 6 4 0004051612
# 12 20163911123050112 6 4 0003262722
# 13 20163911123050112 6 4 0003303222
# 14 20163911123050112 6 4 0003545620
# 15 20163911123050112 6 5 0368030312
# 16 20163911123050112 6 5 0004040412
# 17 20163911123050112 6 5 0004051812
# 18 20163911123050112 6 5 0004343612
# 19 20163911123050112 6 5 0004555612
# 20 20163911123050112 6 5 0003575910
# 21 20163911123050112 6 6 0004111612
# 22 20163911123050112 6 6 0003172012
# 23 20163911123050112 6 6 0003283121
# 24 20163911123050112 6 6 0003444621
# 25 20163911123050112 6 6 0003575912
# 26 20163911123050112 6 6 0004707410
# 27 20163911123050112 6 7 0004111312
# 28 20163911123050112 6 7 0003142612
# 29 20163911123050112 6 7 0004333412
# 30 20163911123050112 6 7 0004535610
数据:
# Sample data. respid is read as character: the 17-digit ids are larger than
# 2^53, so a double cannot hold them exactly (read as "numeric",
# 20163911123050111 prints as 20163911123050112).
df <- read.table(text ="
respid record_type record_data
20163911123050111 6 1000456561200035759120000989800
20163911123050111 6 2000405161200031719120000999900
20163911123050111 6 30004071212000320212200032832220003545620
20163911123050111 6 40004051612000326272200033032220003545620
20163911123050111 6 5036803031200040404120004051812000434361200045556120003575910
20163911123050111 6 6000411161200031720120003283121000344462100035759120004707410
20163911123050111 6 70004111312000314261200043334120004535610",
header = TRUE, colClasses = c("character", "integer", "character"))
答案 1 :(得分:2)
我们可以定义一个函数,把字符串每10位拆分一次并返回一个列表。然后用 separate 函数把星期几和其余部分分开。最后对其余部分应用这个函数,并用 unnest 展开数据框。
# Define a function to split the string in every 10 digits
# Split a single string into consecutive pieces of `width` characters.
#
# Arguments:
#   string  A single, non-missing string (or a value coercible to one).
#   width   Piece length, a single number >= 1 (truncated to an integer).
#           Defaults to 10.
#
# Returns a list with one character scalar per piece, in order. The last
# piece is shorter when the string length is not a multiple of `width`;
# an empty string yields an empty list.
string_split <- function(string, width = 10) {
  # A width below 1 can never consume the string, so reject it up front
  # rather than looping forever.
  stopifnot(
    length(string) == 1L, !is.na(string),
    is.numeric(width), length(width) == 1L, !is.na(width), width >= 1
  )
  width <- as.integer(width)
  n_chars <- nchar(string)
  if (n_chars == 0L) {
    return(list())
  }
  # substring() is vectorised over start/stop, so all pieces are cut in one
  # call; a stop position past the end of the string is simply truncated.
  starts <- seq(1L, n_chars, by = width)
  as.list(substring(string, starts, starts + width - 1L))
}
library(tidyverse)
dat2 <- dat %>%
  # Split record_data after its first character: day of week | remaining digits
  separate(record_data, into = c("dayofweek", "chunk"), sep = 1) %>%
  # Cut each remainder into 10-digit pieces; unlist() flattens the list that
  # string_split() returns so that chunk unnests to a plain character column
  mutate(chunk = map(chunk, ~ unlist(string_split(.x)))) %>%
  # Name the list-column explicitly: calling unnest() without columns is
  # deprecated since tidyr 1.0.0
  unnest(chunk)
head(dat2)
# respid record_type dayofweek chunk
# 1 20163911123050111 6 1 0004565612
# 2 20163911123050111 6 1 0003575912
# 3 20163911123050111 6 1 0000989800
# 4 20163911123050111 6 2 0004051612
# 5 20163911123050111 6 2 0003171912
# 6 20163911123050111 6 2 0000999900
数据
# Sample data. Every column is read as character; the header names three
# columns while each data line has four fields, so read.table() takes the
# first field of each line as the row name.
dat <- read.table(
  header = TRUE,
  colClasses = "character",
  text = "respid record_type record_data
1 20163911123050111 6 1000456561200035759120000989800
2 20163911123050111 6 2000405161200031719120000999900
3 20163911123050111 6 30004071212000320212200032832220003545620
4 20163911123050111 6 40004051612000326272200033032220003545620
5 20163911123050111 6 5036803031200040404120004051812000434361200045556120003575910
6 20163911123050111 6 6000411161200031720120003283121000344462100035759120004707410
7 20163911123050111 6 70004111312000314261200043334120004535610"
)