快速将字符串列表拆分为数据帧

时间:2018-08-15 23:12:26

标签: r dataframe dplyr

我有一个大型列表对象(大约200,000行),其结构为:

x1 <- "1614689:-1,1,-1,-826,-3484,0.00;-1,2,-1,-311,-3450,0.00;-1,3,-1,-3732,-708,0.00;-1,4,-1,137,-3387,0.00;4,5,-1,5550,4400,0.00;4,6,-1,5550,4400,0.00;4,7,-1,5550,4400,0.00;4,8,-1,5550,4400,0.00;-1,9,-1,-1971,-2660,0.00;4,10,-1,5550,4400,0.00;4,11,-1,5550,4400,0.00;4,12,-1,5550,4400,0.00;4,13,-1,5550,4400,0.00;4,14,-1,5550,4400,0.00;4,15,-1,5550,4400,0.00;4,16,-1,5550,4400,0.00;4,17,-1,5550,4400,0.00;4,18,-1,5550,4400,0.00;4,19,-1,5550,4400,0.00;4,20,-1,5550,4400,0.00;-1,21,-1,401,-969,0.00;4,22,-1,5550,4400,0.00;4,23,-1,5550,4400,0.00;4,24,-1,5550,4400,0.00;4,25,-1,5550,4400,0.00;4,26,-1,5550,4400,0.00;4,27,-1,5550,4400,0.00;4,28,-1,5550,4400,0.00;4,29,-1,5550,4400,0.00;:-1971,-2660,0,6.08,A,Dead;"

x2 <- "1614690:-1,1,-1,-825,-3484,0.00;-1,2,-1,-311,-3450,0.00;-1,3,-1,-3726,-706,0.00;-1,4,-1,138,-3382,0.00;4,5,-1,5550,4400,0.00;4,6,-1,5550,4400,0.00;4,7,-1,5550,4400,0.00;4,8,-1,5550,4400,0.00;-1,9,-1,-1970,-2666,0.00;4,10,-1,5550,4400,0.00;4,11,-1,5550,4400,0.00;4,12,-1,5550,4400,0.00;4,13,-1,5550,4400,0.00;4,14,-1,5550,4400,0.00;4,15,-1,5550,4400,0.00;4,16,-1,5550,4400,0.00;4,17,-1,5550,4400,0.00;4,18,-1,5550,4400,0.00;4,19,-1,5550,4400,0.00;4,20,-1,5550,4400,0.00;-1,21,-1,401,-965,0.00;4,22,-1,5550,4400,0.00;4,23,-1,5550,4400,0.00;4,24,-1,5550,4400,0.00;4,25,-1,5550,4400,0.00;4,26,-1,5550,4400,0.00;4,27,-1,5550,4400,0.00;4,28,-1,5550,4400,0.00;4,29,-1,5550,4400,0.00;:-1970,-2666,0,6.08,A,Dead;"

data.2.test <- list(x1,x2)

每个字符串有3个主要部分,以“:”分隔。

第1部分:一个简单的字符串。

第2部分:用“;”分隔的26个块用“,”分隔成多个值。

第3部分:1个块,其值用“,”分隔-该部分的长度各不相同。

我有一个脚本,可以将所有内容拆分并合并在一起,但是计算大约需要45分钟。我需要更快地实现这一目标。期望结果是两个数据框。

  1. Data.frame 1由第2部分组成,每个块均作为新行,每个值均在新列中。第1部分将作为“ id”添加到每一行。

  2. Data.frame 2由第3部分组成,每个值都在新列中。第1部分将作为“ id”添加到每一行。

通过功能的当前解决方案:

    Unpack.1.Frame.of.Ball <- function(df){

     ball.parts <- unlist(strsplit(unlist(df),","))
     return(data.frame(team_HA = 10, 
                        TrackID = 50, 
                        JerseyNo = NA, 
                        x = as.numeric(as.character(ball.parts[1])), 
                        y = as.numeric(as.character(ball.parts[2])), 
                        z = as.numeric(as.character(ball.parts[3])),
                        speed = as.numeric(as.character(ball.parts[4])),
                        Ball.Ownership = ifelse(gsub(";","",ball.parts[5])=="A",0,1),
                        Ball.InPlay = ifelse(gsub(";","",ball.parts[6])=="Dead",0,1),
                        Ball.Contact.Info1 = ifelse(length(ball.parts[7])>0,ball.parts[7],NA),
                        Ball.Contact.Info2 = ifelse(length(ball.parts[8])>0,ball.parts[8],NA)))  
}


    Unpack.1.Player.of.Tracking <- function(r){
    return(data.frame(team_HA = as.numeric(as.character(unlist(strsplit(unlist(r),","))[1])), 
                       TrackID = as.numeric(as.character(unlist(strsplit(unlist(r),","))[2])), 
                       JerseyNo = as.numeric(as.character(unlist(strsplit(unlist(r),","))[3])),
                       x = as.numeric(as.character(unlist(strsplit(unlist(r),","))[4])), 
                       y = as.numeric(as.character(unlist(strsplit(unlist(r),","))[5])),
                       speed = as.numeric(as.character(unlist(strsplit(unlist(r),","))[6])),
                       z = 10))
}

    Unpack.1.Frame.of.Players <- function(df){

    unpack.catch <- unlist(strsplit(df, ";")) %>% 
    split(1:length(.)) %>% 
    purrr::map(Unpack.1.Player.of.Tracking) %>% 
    dplyr::bind_rows() 

    return(unpack.catch)
}

    Unpack.1.Frame.of.Time <- function(frame.to.process){

      Parsing.Counter <<- 1
      temp.parts <- unlist(strsplit(as.character(frame.to.process), ":", fixed = FALSE, perl = FALSE, useBytes = FALSE))

      people.temp <- Unpack.1.Frame.of.Players(temp.parts[2])
      ball.temp <- Unpack.1.Frame.of.Ball(temp.parts[3])

      people.temp$Ball.Ownership <- ball.temp$Ball.Ownership
      people.temp$Ball.InPlay <- ball.temp$Ball.InPlay
      people.temp$Ball.Contact.Info1 <- ball.temp$Ball.Contact.Info1
      people.temp$Ball.Contact.Info2 <- ball.temp$Ball.Contact.Info2

      frame.temp <- bind_rows(people.temp, ball.temp)

      frame.temp$frameID <- temp.parts[1]

      if((Parsing.Counter/250)%%1==0){cat(".")}else{}
      Parsing.Counter <<- Parsing.Counter + 1

      return(frame.temp)

}

2 个答案:

答案 0 :(得分:2)

您只需要读取数据:

void*

答案 1 :(得分:0)

这里是映射数据框1的字符串的另一种选择

library(tidyverse)

df1 <- data.2.test %>% 
  map(~gsub("^.*?:|:", "", .x) %>% 
        str_split(., ";") %>% 
        unlist() %>% 
        .[1:(length(.)-2)]) %>% 
   map(~str_split(.x, ",")) %>%
   flatten() %>% 
   reduce(rbind) %>%
   as.data.frame()

head(df1)
#>     V1 V2 V3    V4    V5   V6
#> out -1  1 -1  -826 -3484 0.00
#> X   -1  2 -1  -311 -3450 0.00
#> X.1 -1  3 -1 -3732  -708 0.00
#> X.2 -1  4 -1   137 -3387 0.00
#> X.3  4  5 -1  5550  4400 0.00
#> X.4  4  6 -1  5550  4400 0.00

reprex package(v0.2.0)于2018-08-15创建。

编辑 将输出更改为排除第3部分。