我有一个大型列表对象(大约200,000行),其结构为:
x1 <- "1614689:-1,1,-1,-826,-3484,0.00;-1,2,-1,-311,-3450,0.00;-1,3,-1,-3732,-708,0.00;-1,4,-1,137,-3387,0.00;4,5,-1,5550,4400,0.00;4,6,-1,5550,4400,0.00;4,7,-1,5550,4400,0.00;4,8,-1,5550,4400,0.00;-1,9,-1,-1971,-2660,0.00;4,10,-1,5550,4400,0.00;4,11,-1,5550,4400,0.00;4,12,-1,5550,4400,0.00;4,13,-1,5550,4400,0.00;4,14,-1,5550,4400,0.00;4,15,-1,5550,4400,0.00;4,16,-1,5550,4400,0.00;4,17,-1,5550,4400,0.00;4,18,-1,5550,4400,0.00;4,19,-1,5550,4400,0.00;4,20,-1,5550,4400,0.00;-1,21,-1,401,-969,0.00;4,22,-1,5550,4400,0.00;4,23,-1,5550,4400,0.00;4,24,-1,5550,4400,0.00;4,25,-1,5550,4400,0.00;4,26,-1,5550,4400,0.00;4,27,-1,5550,4400,0.00;4,28,-1,5550,4400,0.00;4,29,-1,5550,4400,0.00;:-1971,-2660,0,6.08,A,Dead;"
x2 <- "1614690:-1,1,-1,-825,-3484,0.00;-1,2,-1,-311,-3450,0.00;-1,3,-1,-3726,-706,0.00;-1,4,-1,138,-3382,0.00;4,5,-1,5550,4400,0.00;4,6,-1,5550,4400,0.00;4,7,-1,5550,4400,0.00;4,8,-1,5550,4400,0.00;-1,9,-1,-1970,-2666,0.00;4,10,-1,5550,4400,0.00;4,11,-1,5550,4400,0.00;4,12,-1,5550,4400,0.00;4,13,-1,5550,4400,0.00;4,14,-1,5550,4400,0.00;4,15,-1,5550,4400,0.00;4,16,-1,5550,4400,0.00;4,17,-1,5550,4400,0.00;4,18,-1,5550,4400,0.00;4,19,-1,5550,4400,0.00;4,20,-1,5550,4400,0.00;-1,21,-1,401,-965,0.00;4,22,-1,5550,4400,0.00;4,23,-1,5550,4400,0.00;4,24,-1,5550,4400,0.00;4,25,-1,5550,4400,0.00;4,26,-1,5550,4400,0.00;4,27,-1,5550,4400,0.00;4,28,-1,5550,4400,0.00;4,29,-1,5550,4400,0.00;:-1970,-2666,0,6.08,A,Dead;"
data.2.test <- list(x1,x2)
每个字符串有3个主要部分,以“:”分隔。
第1部分:一个简单的字符串。
第2部分:用“;”分隔的26个块用“,”分隔成多个值。
第3部分:1个块,其值用“,”分隔-该部分的长度各不相同。
我有一个脚本,可以将所有内容拆分并合并在一起,但是计算大约需要45分钟。我需要更快地实现这一目标。期望结果是两个数据框。
Data.frame 1由第2部分组成,每个块均作为新行,每个值均在新列中。第1部分将作为“ id”添加到每一行。
Data.frame 2由第3部分组成,每个值都在新列中。第1部分将作为“ id”添加到每一行。
通过功能的当前解决方案:
Unpack.1.Frame.of.Ball <- function(df){
ball.parts <- unlist(strsplit(unlist(df),","))
return(data.frame(team_HA = 10,
TrackID = 50,
JerseyNo = NA,
x = as.numeric(as.character(ball.parts[1])),
y = as.numeric(as.character(ball.parts[2])),
z = as.numeric(as.character(ball.parts[3])),
speed = as.numeric(as.character(ball.parts[4])),
Ball.Ownership = ifelse(gsub(";","",ball.parts[5])=="A",0,1),
Ball.InPlay = ifelse(gsub(";","",ball.parts[6])=="Dead",0,1),
Ball.Contact.Info1 = ifelse(length(ball.parts[7])>0,ball.parts[7],NA),
Ball.Contact.Info2 = ifelse(length(ball.parts[8])>0,ball.parts[8],NA)))
}
Unpack.1.Player.of.Tracking <- function(r){
return(data.frame(team_HA = as.numeric(as.character(unlist(strsplit(unlist(r),","))[1])),
TrackID = as.numeric(as.character(unlist(strsplit(unlist(r),","))[2])),
JerseyNo = as.numeric(as.character(unlist(strsplit(unlist(r),","))[3])),
x = as.numeric(as.character(unlist(strsplit(unlist(r),","))[4])),
y = as.numeric(as.character(unlist(strsplit(unlist(r),","))[5])),
speed = as.numeric(as.character(unlist(strsplit(unlist(r),","))[6])),
z = 10))
}
Unpack.1.Frame.of.Players <- function(df){
unpack.catch <- unlist(strsplit(df, ";")) %>%
split(1:length(.)) %>%
purrr::map(Unpack.1.Player.of.Tracking) %>%
dplyr::bind_rows()
return(unpack.catch)
}
Unpack.1.Frame.of.Time <- function(frame.to.process){
Parsing.Counter <<- 1
temp.parts <- unlist(strsplit(as.character(frame.to.process), ":", fixed = FALSE, perl = FALSE, useBytes = FALSE))
people.temp <- Unpack.1.Frame.of.Players(temp.parts[2])
ball.temp <- Unpack.1.Frame.of.Ball(temp.parts[3])
people.temp$Ball.Ownership <- ball.temp$Ball.Ownership
people.temp$Ball.InPlay <- ball.temp$Ball.InPlay
people.temp$Ball.Contact.Info1 <- ball.temp$Ball.Contact.Info1
people.temp$Ball.Contact.Info2 <- ball.temp$Ball.Contact.Info2
frame.temp <- bind_rows(people.temp, ball.temp)
frame.temp$frameID <- temp.parts[1]
if((Parsing.Counter/250)%%1==0){cat(".")}else{}
Parsing.Counter <<- Parsing.Counter + 1
return(frame.temp)
}
答案 0 :(得分:2)
您只需要读取数据:
void*
答案 1 :(得分:0)
这里是映射数据框1的字符串的另一种选择
library(tidyverse)
df1 <- data.2.test %>%
map(~gsub("^.*?:|:", "", .x) %>%
str_split(., ";") %>%
unlist() %>%
.[1:(length(.)-2)]) %>%
map(~str_split(.x, ",")) %>%
flatten() %>%
reduce(rbind) %>%
as.data.frame()
head(df1)
#> V1 V2 V3 V4 V5 V6
#> out -1 1 -1 -826 -3484 0.00
#> X -1 2 -1 -311 -3450 0.00
#> X.1 -1 3 -1 -3732 -708 0.00
#> X.2 -1 4 -1 137 -3387 0.00
#> X.3 4 5 -1 5550 4400 0.00
#> X.4 4 6 -1 5550 4400 0.00
由reprex package(v0.2.0)于2018-08-15创建。
编辑 将输出更改为排除第3部分。