我想使用tidyr的separate()将一列字符串(例如[1,58,10])拆分成多列。我的问题是有时候其中的向量更短(永远不会更长)。同一数据框中有很多列都存在这个问题。
正在加载软件包
require(tidyr)
require(dplyr)
require(stringr)
数据
在这里,我使用来自真实数据的样本创建数据框。“向量”在col1中长度为10,在col2中长度为9或10。其中的time列只是为了表明还有其他列。
# Sample data: `time` stands in for the other, unrelated columns; col1 and
# col2 each hold one bracketed, comma-separated vector per row, stored as text.
df <- data.frame(
  time = as.POSIXct(1:5, origin = Sys.time()),
  # Every col1 vector has exactly 10 elements.
  col1 = c(
    "[0,355,0,0,0,1227,0,0,382059,116]",
    "[0,31,0,0,0,5,0,0,925,1]",
    "[0,1,0,0,0,471,0,0,130339,3946]",
    "[0,0,0,0,0,223,0,0,37666,12]",
    "[0,19,0,0,0,667,0,0,336956,53]"
  ),
  # The fourth col2 vector has only 9 elements; the others have 10.
  col2 = c(
    "[0,355,0,0,0,1227,0,0,382059,116]",
    "[0,355,0,0,0,1227,0,0,382059,116]",
    "[0,0,0,0,0,223,0,0,37666,12]",
    "[0,19,0,0,0,667,0,0,336956]",
    "[0,355,0,0,0,1227,0,0,382059,116]"
  )
)
我希望得到的结果
对于所有“向量”长度相等的第一列,我可以使用separate()来获得我想要的内容。
# Split col1 (every vector has exactly 10 elements) into col1.1 ... col1.10.
# col2 is dropped up front: it is not part of this result, and coercing its
# bracketed strings to numeric would only produce NA.
a1 <- df %>%
  select(-col2) %>%
  # Strip the surrounding "[" and "]" before splitting on ",".
  mutate(col1 = str_sub(col1, 2, -2)) %>%
  separate(col1, paste("col1", 1:10, sep = "."), sep = ",")
# Making sure the numbers are numeric. `time` is left untouched, so it keeps
# its POSIXct class and does not have to be rebuilt from a plain number.
a1[names(a1) != "time"] <- lapply(a1[names(a1) != "time"], as.numeric)
这导致
> a1
time col1.1 col1.2 col1.3 col1.4 col1.5 col1.6 col1.7 col1.8
1 2014-11-07 12:21:45 0 355 0 0 0 1227 0 0
2 2014-11-07 12:21:46 0 31 0 0 0 5 0 0
3 2014-11-07 12:21:47 0 1 0 0 0 471 0 0
4 2014-11-07 12:21:48 0 0 0 0 0 223 0 0
5 2014-11-07 12:21:49 0 19 0 0 0 667 0 0
col1.9 col1.10
1 382059 116
2 925 1
3 130339 3946
4 37666 12
5 336956 53
这对col2不起作用,因为其中有的元素无法拆分成同样数量的列
解决方法
# Does not work: not every col2 value splits into 10 pieces.
# b1 <- df %>%
#   mutate(col2 = str_sub(col2, 2, -2)) %>%
#   separate(col2, paste("col2", 1:10, sep = "."), ",")

# Workaround: str_split_fixed() always returns exactly n = 10 columns and
# pads short rows with "", which as.numeric() turns into NA.
b2 <- str_split_fixed(str_sub(df$col2, 2, -2), ",", n = 10)
colnames(b2) <- paste("col2", 1:10, sep = ".")
b2 <- as.data.frame(b2, stringsAsFactors = FALSE)
b2[] <- lapply(b2, as.numeric)
# b2 is a data frame at this point, so cbind() keeps `time` as POSIXct
# instead of flattening it into a numeric matrix column.
b2 <- cbind(time = df$time, b2)
结果是
> b2
time col2.1 col2.2 col2.3 col2.4 col2.5 col2.6 col2.7 col2.8
1 2014-11-07 12:21:45 0 355 0 0 0 1227 0 0
2 2014-11-07 12:21:46 0 355 0 0 0 1227 0 0
3 2014-11-07 12:21:47 0 0 0 0 0 223 0 0
4 2014-11-07 12:21:48 0 19 0 0 0 667 0 0
5 2014-11-07 12:21:49 0 355 0 0 0 1227 0 0
col2.9 col2.10
1 382059 116
2 382059 116
3 37666 12
4 336956 NA
5 382059 116
如果向量较短,则最后一个元素应为NA,因此这是正确的。
问题
有没有办法使用separate()(或其他更简单的函数)来代替这个变通方法? 有没有办法同时将它应用于col1和col2(例如通过选择所有以col开头的列)?
谢谢!
答案 0 :(得分:6)
这只回答了问题中关于separate的第一部分。separate有一个extra参数(至少在 tidyr 的开发版本中),如果您将extra设置为"merge"
,则可以执行您想要的操作:
# Strip the surrounding "[" and "]" from col2, then split it on "," into
# col2.1 ... col2.10.
df %>%
  mutate(col2 = str_sub(col2, 2, -2)) %>%
  # extra = "merge" is kept from the original call; it governs rows with
  # too many pieces. Rows with too few pieces are governed by `fill`:
  # fill = "right" pads them with NA on the right without a warning
  # (needs a tidyr release that has the `fill` argument).
  separate(col2, paste("col2", 1:10, sep = "."), sep = ",",
           extra = "merge", fill = "right")
time col1
1 2014-11-07 08:00:59 [0,355,0,0,0,1227,0,0,382059,116]
2 2014-11-07 08:01:00 [0,31,0,0,0,5,0,0,925,1]
3 2014-11-07 08:01:01 [0,1,0,0,0,471,0,0,130339,3946]
4 2014-11-07 08:01:02 [0,0,0,0,0,223,0,0,37666,12]
5 2014-11-07 08:01:03 [0,19,0,0,0,667,0,0,336956,53]
col2.1 col2.2 col2.3 col2.4 col2.5 col2.6 col2.7 col2.8
1 0 355 0 0 0 1227 0 0
2 0 355 0 0 0 1227 0 0
3 0 0 0 0 0 223 0 0
4 0 19 0 0 0 667 0 0
5 0 355 0 0 0 1227 0 0
col2.9 col2.10
1 382059 116
2 382059 116
3 37666 12
4 336956 <NA>
5 382059 116
。
{{1}}
答案 1 :(得分:4)
不使用任何包的解决方案:
# Base-R solution (no packages): expand every bracketed string column of `df`
# into `n_parts` numeric columns named "<column>.1" ... "<column>.<n_parts>".
df <- data.frame(
  time = as.POSIXct(1:5, origin = Sys.time()),
  col1 = c(
    "[0,355,0,0,0,1227,0,0,382059,116]",
    "[0,31,0,0,0,5,0,0,925,1]",
    "[0,1,0,0,0,471,0,0,130339,3946]",
    "[0,0,0,0,0,223,0,0,37666,12]",
    "[0,19,0,0,0,667,0,0,336956,53]"
  ),
  col2 = c(
    "[0,355,0,0,0,1227,0,0,382059,116]",
    "[0,355,0,0,0,1227,0,0,382059,116]",
    "[0,0,0,0,0,223,0,0,37666,12]",
    "[0,19,0,0,0,667,0,0,336956]",
    "[0,355,0,0,0,1227,0,0,382059,116]"
  )
)

n_parts <- 10L  # longest vector length; shorter vectors are padded with NA

# Turn one column of "[a,b,...]" strings into a numeric data frame.
# Each column is split on its own rather than pasting a whole row into one
# CSV line: with a row-wise paste, a short vector in col1 would shift the
# col2 values into the wrong columns. `time` is never converted to text,
# so it keeps its POSIXct class.
split_bracketed <- function(x, name) {
  pieces <- strsplit(gsub("\\[|\\]", "", as.character(x)), ",", fixed = TRUE)
  # Indexing past the end of a vector yields NA, which pads the short rows.
  values <- lapply(pieces, function(p) as.numeric(p)[seq_len(n_parts)])
  out <- as.data.frame(do.call(rbind, values))
  names(out) <- paste0(name, ".", seq_len(n_parts))
  out
}

vec_cols <- setdiff(names(df), "time")
# unname() stops cbind() from prefixing the new column names with the list
# names a second time (which would give "col1.col1.1").
df <- do.call(
  cbind,
  c(list(df["time"]), unname(Map(split_bracketed, df[vec_cols], vec_cols)))
)
# time col1.1 col1.2 col1.3 col1.4 col1.5 col1.6 col1.7 col1.8
# 1 2014-11-07 10:53:22 0 355 0 0 0 1227 0 0
# 2 2014-11-07 10:53:23 0 31 0 0 0 5 0 0
# 3 2014-11-07 10:53:24 0 1 0 0 0 471 0 0
# 4 2014-11-07 10:53:25 0 0 0 0 0 223 0 0
# 5 2014-11-07 10:53:26 0 19 0 0 0 667 0 0
# col1.9 col1.10 col2.1 col2.2 col2.3 col2.4 col2.5 col2.6 col2.7 col2.8 col2.9
# 1 382059 116 0 355 0 0 0 1227 0 0 382059
# 2 925 1 0 355 0 0 0 1227 0 0 382059
# 3 130339 3946 0 0 0 0 0 223 0 0 37666
# 4 37666 12 0 19 0 0 0 667 0 0 336956
# 5 336956 53 0 355 0 0 0 1227 0 0 382059
# col2.10
# 1 116
# 2 116
# 3 12
# 4 NA
# 5 116
答案 2 :(得分:4)
以下是使用dplyr和splitstackshape的另一种方法。如果您不需要数据框,则不需要最后一行的data.frame(.),此时得到的结果是data.table。
# Strip the brackets from every column whose name contains "col", split col1
# and col2 on "," into one column per element, then coerce everything except
# `time` to numeric. across() replaces the deprecated mutate_each()/funs().
df %>%
  mutate(
    across(contains("col"), function(x) gsub("\\[(.*)\\]", "\\1", x))
  ) %>%
  # Namespace-qualified because this snippet never attaches splitstackshape.
  splitstackshape::cSplit(c("col1", "col2"), sep = ",") %>%
  mutate(across(-time, as.numeric)) %>%
  # cSplit() yields a data.table; drop this last step to keep it as one.
  data.frame(.)
# time col1_01 col1_02 col1_03 col1_04 col1_05 col1_06 col1_07 col1_08 col1_09 col1_10 col2_01 col2_02 col2_03 col2_04 col2_05
#1 2014-11-08 00:48:15 0 355 0 0 0 1227 0 0 382059 116 0 355 0 0 0
#2 2014-11-08 00:48:16 0 31 0 0 0 5 0 0 925 1 0 355 0 0 0
#3 2014-11-08 00:48:17 0 1 0 0 0 471 0 0 130339 3946 0 0 0 0 0
#4 2014-11-08 00:48:18 0 0 0 0 0 223 0 0 37666 12 0 19 0 0 0
#5 2014-11-08 00:48:19 0 19 0 0 0 667 0 0 336956 53 0 355 0 0 0
# col2_06 col2_07 col2_08 col2_09 col2_10
#1 1227 0 0 382059 116
#2 1227 0 0 382059 116
#3 223 0 0 37666 12
#4 667 0 0 336956 NA
#5 1227 0 0 382059 116