使用R

时间:2017-03-13 19:49:40

标签: r

我正在尝试阅读下面的数据示例:

  

1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1,6,-1,856,-2341,0.00;-1,7,-1,758,-2408,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-963,-2807,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1645,-1773,0.00;-1,12,-1,1487,-518,0.00;-1,13,-1,685,-3113,0.00;-1,14,-1,-935,-3217,0.00;-1,15,-1,-1101,-2430,0.00;-1,16,-1,754,-2946,0.00;-1,17,-1,823,-2497,0.00;-1,18,-1,-948,-2431,0.00;-1,19,-1,774,-2242,0.00;-1,20,-1,861,-2192,0.00;-1,21,-1,433,-3391,0.00;-1,22,-1,133,-2190,0.00;-1,23,-1,-977,-2585,0.00;-1,24,-1,-968,-2107,0.00;-1,25,-1,175,-3062,0.00;-1,26,-1,265,-2736,0.00;-1,27,-1,67,-2735,0.00;-1,28,-1,-281,-2752,0.00;4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway;: 1344429:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-885,-3169,0.00;-1,4,-1,-626,1527,0.00;-1,5,-1,-852,-2887,0.00;-1,6,-1,854,-2340,0.00;-1,7,-1,761,-2411,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-967,-2808,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1647,-1777,0.00;-1,12,-1,1485,-518,0.00;-1,13,-1,687,-3118,0.00;-1,14,-1,-938,-3222,0.00;-1,15,-1,-1100,-2430,0.00;-1,16,-1,744,-2946,0.00;-1,17,-1,815,-2505,0.00;-1,18,-1,-950,-2429,0.00;-1,19,-1,773,-2237,0.00;-1,20,-1,861,-2190,0.00;-1,21,-1,433,-3392,0.00;-1,22,-1,133,-2189,0.00;-1,23,-1,-980,-2593,0.00;-1,24,-1,-961,-2109,0.00;-1,25,-1,176,-3056,0.00;-1,26,-1,265,-2731,0.00;-1,27,-1,67,-2736,0.00;-1,28,-1,-283,-2746,0.00;4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway;:

数据分为3个块:

  1. 第一个是以“:”结尾的时间戳,我们可以将其保留为数字
  2. 然后是以“;:”结尾的多组数字(每组六个数)
  3. 最后是第三个块(7个元素,字符串和数字混合),同样以“;:”结尾

    有没有一种优雅的方法将这些数据读入R数据框?我试过了

    # Asker's attempt: read the raw file, treating ";" as the field separator.
    read.table("855360.dat", header = FALSE, sep = ";")
    

    但这样读入之后还需要大量额外处理,才能把各元素整理成上述 3 个块,以便我之后对它们进行连接和操作。

2 个答案:

答案 0 :(得分:2)

如果您可以接受把结果放在单个数据框中,则只需用逗号替换冒号和分号,然后将其读入:

# Read the raw records, one element per line of the file.
L <- readLines("myfile")
# Collapse every run of ":" / ";" delimiters into a single comma so that each
# line becomes plain comma-separated text, then parse it as a table.
flattened <- gsub("[:;]+", ",", L)
read.table(text = flattened, sep = ",", as.is = TRUE)

或者,如果要生成嵌套列表结构,请沿用上面的 L:

# Split in three stages: each line on ":" (blocks), each block on ";"
# (groups), and each group on "," (individual fields).
blocks <- strsplit(L, ":")
groups <- lapply(blocks, strsplit, ";")
lapply(groups, lapply, strsplit, ",")

答案 1 :(得分:0)

如果将其转换为多遍过程,可能需要更长的时间,但从长远来看可能更容易修改和维护。

如果你首先用 ";:" 拆分字符串,你会注意到奇数索引的元素是主要数据(包括时间戳),偶数索引的元素是你所说的“第三块”,即数字/字符混合的条目。拆开之后,剩下的仍然是一个解析问题,但要简单一些。

# Sample input: two records, each "timestamp:groups;:trailing-block;:".
txt <- "1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1,6,-1,856,-2341,0.00;-1,7,-1,758,-2408,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-963,-2807,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1645,-1773,0.00;-1,12,-1,1487,-518,0.00;-1,13,-1,685,-3113,0.00;-1,14,-1,-935,-3217,0.00;-1,15,-1,-1101,-2430,0.00;-1,16,-1,754,-2946,0.00;-1,17,-1,823,-2497,0.00;-1,18,-1,-948,-2431,0.00;-1,19,-1,774,-2242,0.00;-1,20,-1,861,-2192,0.00;-1,21,-1,433,-3391,0.00;-1,22,-1,133,-2190,0.00;-1,23,-1,-977,-2585,0.00;-1,24,-1,-968,-2107,0.00;-1,25,-1,175,-3062,0.00;-1,26,-1,265,-2736,0.00;-1,27,-1,67,-2735,0.00;-1,28,-1,-281,-2752,0.00;4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway;: 1344429:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-885,-3169,0.00;-1,4,-1,-626,1527,0.00;-1,5,-1,-852,-2887,0.00;-1,6,-1,854,-2340,0.00;-1,7,-1,761,-2411,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-967,-2808,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1647,-1777,0.00;-1,12,-1,1485,-518,0.00;-1,13,-1,687,-3118,0.00;-1,14,-1,-938,-3222,0.00;-1,15,-1,-1100,-2430,0.00;-1,16,-1,744,-2946,0.00;-1,17,-1,815,-2505,0.00;-1,18,-1,-950,-2429,0.00;-1,19,-1,773,-2237,0.00;-1,20,-1,861,-2190,0.00;-1,21,-1,433,-3392,0.00;-1,22,-1,133,-2189,0.00;-1,23,-1,-980,-2593,0.00;-1,24,-1,-961,-2109,0.00;-1,25,-1,176,-3056,0.00;-1,26,-1,265,-2731,0.00;-1,27,-1,67,-2736,0.00;-1,28,-1,-283,-2746,0.00;4,29,-1,5550,4400,0.00;:174,-2563,11,28.67,A,Dead,SetAway;:"

# Break the text at every literal ";:" terminator. The delimiter is a plain
# string, not a pattern, hence fixed = TRUE.
x <- strsplit(txt, ";:", fixed = TRUE)[[1]]
# trimws() is vectorized, so it is applied to the whole vector directly; this
# always yields a character vector (sapply() would return a list on empty
# input). It removes the blank left between consecutive records.
x <- trimws(x)
x[1]
# [1] "1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1,6,-1,856,-2341,0.00;-1,7,-1,758,-2408,0.00;-1,8,-1,-201,-2307,0.00;-1,9,-1,-963,-2807,0.00;-1,10,-1,-460,-3309,0.00;-1,11,-1,-1645,-1773,0.00;-1,12,-1,1487,-518,0.00;-1,13,-1,685,-3113,0.00;-1,14,-1,-935,-3217,0.00;-1,15,-1,-1101,-2430,0.00;-1,16,-1,754,-2946,0.00;-1,17,-1,823,-2497,0.00;-1,18,-1,-948,-2431,0.00;-1,19,-1,774,-2242,0.00;-1,20,-1,861,-2192,0.00;-1,21,-1,433,-3391,0.00;-1,22,-1,133,-2190,0.00;-1,23,-1,-977,-2585,0.00;-1,24,-1,-968,-2107,0.00;-1,25,-1,175,-3062,0.00;-1,26,-1,265,-2736,0.00;-1,27,-1,67,-2735,0.00;-1,28,-1,-281,-2752,0.00;4,29,-1,5550,4400,0.00"
x[2]
# [1] "174,-2563,11,28.67,A,Dead,SetAway"

这里一个重要的假设是我们总是会有成对的时间戳/数据和后续块:

# The pieces must come in (timestamp + groups, trailing block) pairs, so an
# odd number of pieces means the input does not have the expected layout.
if (length(x) %% 2 != 0) {
  stop("oops, uneven pairs")
}
# Odd positions hold the "timestamp:groups" strings; the remaining (even)
# positions hold the trailing mixed blocks.
odds <- seq(from = 1, to = length(x), by = 2)
str(x[odds])
#  chr [1:2] "1344428:-1,1,-1,415,-649,0.00;-1,2,-1,1090,-2167,0.00;-1,3,-1,-881,-3164,0.00;-1,4,-1,-624,1529,0.00;-1,5,-1,-849,-2875,0.00;-1"| __truncated__ ...
x[-odds]
# [1] "174,-2563,11,28.67,A,Dead,SetAway" "174,-2563,11,28.67,A,Dead,SetAway"

从这里开始,我们意识到可以轻松地再用一次 strsplit 提取时间戳,然后把 ";" 替换为换行符,将其余内容转换为 read.csv 可以直接读取的格式(您的第三块也可以同样处理):

# Split every "timestamp:groups" string at the colon: element 1 is the
# timestamp, element 2 the ";"-separated groups.
firstsplit <- strsplit(x[odds], ":")
# The even-indexed pieces are the trailing mixed numeric/character blocks.
secondsplit <- x[-odds]

# One single-row data.frame per record holding its numeric timestamp.
timestamps <- lapply(
  firstsplit,
  function(z) data.frame(timestamp = as.numeric(z[1]))
)
# Turn each ";" into a newline so every group becomes one CSV row. Passing the
# string via `text =` lets read.csv() manage (and close) its own connection
# instead of leaving an unclosed textConnection behind.
data1 <- lapply(
  firstsplit,
  function(lst) read.csv(text = gsub(";", "\n", lst[[2]]), header = FALSE)
)
# Each trailing block is already a single CSV row.
data2 <- lapply(secondsplit, function(z) read.csv(text = z, header = FALSE))

查看其中一对数据:

# Zip the three parallel lists: element i bundles record i's timestamp, its
# parsed groups and its trailing block.
bothlst <- Map(list, timestamps, data1, data2)
str(bothlst[[1]])
# List of 3
#  $ :'data.frame': 1 obs. of  1 variable:
#   ..$ timestamp: num 1344428
#  $ :'data.frame': 29 obs. of  6 variables:
#   ..$ V1: int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
#   ..$ V2: int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
#   ..$ V3: int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
#   ..$ V4: int [1:29] 415 1090 -881 -624 -849 856 758 -201 -963 -460 ...
#   ..$ V5: int [1:29] -649 -2167 -3164 1529 -2875 -2341 -2408 -2307 -2807 -3309 ...
#   ..$ V6: num [1:29] 0 0 0 0 0 0 0 0 0 0 ...
#  $ :'data.frame': 1 obs. of  7 variables:
#   ..$ V1: int 174
#   ..$ V2: int -2563
#   ..$ V3: int 11
#   ..$ V4: num 28.7
#   ..$ V5: Factor w/ 1 level "A": 1
#   ..$ V6: Factor w/ 1 level "Dead": 1
#   ..$ V7: Factor w/ 1 level "SetAway": 1

这是一个很好的嵌套列表描述您的数据。我故意让timestamp成为data.frame以简化后续步骤,但这当然不是必需的。

如果您希望在单个data.frame中描述所有数据,请注意以下两点:

  1. 您的 timestamp 和“第三块”数据将在数据框的所有行上重复。这可能不是问题,具体取决于您打算如何使用数据。如果“第三块”只包含单行数据这一假设不成立,此方法就会失效。
  2. 两个数据元素中有相同的列名。如果您有预定义的列名(总是 6 列和 7 列),或者数据本身定义了列名(您的示例中没有),就可以轻松避免这个问题。如果两者都不适用,则需要您自己决定合适的命名约定。在这个示例中,我把第二个 data.frame 的 V1 式列名改成了 X1 式列名。
考虑到上面的第 2 点,先重命名列:

    # Rename the trailing-block columns from V1..Vn to X1..Xn so they cannot
    # clash with the V* columns of data1 once everything is bound together.
    data2mod <- lapply(data2, function(block) {
      names(block) <- paste0("X", seq_along(block))
      block
    })
    bothlst2 <- Map(list, timestamps, data1, data2mod)
    

    现在,对于每个元素,我们可以把其中的各部分按列绑定(column-bind)成一个 data.frame:

    # Column-bind the three data.frames of every record into one data.frame.
    # The single-row timestamp and trailing-block frames are recycled to the
    # number of rows in the groups frame. do.call() spreads the list elements
    # as separate arguments to cbind.data.frame().
    bothdf <- lapply(bothlst2, function(parts) do.call(cbind.data.frame, parts))
    str(bothdf)
    # List of 2
    #  $ :'data.frame': 29 obs. of  14 variables:
    #   ..$ timestamp: num [1:29] 1344428 1344428 1344428 1344428 1344428 ...
    #   ..$ V1       : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
    #   ..$ V2       : int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
    #   ..$ V3       : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
    #   ..$ V4       : int [1:29] 415 1090 -881 -624 -849 856 758 -201 -963 -460 ...
    #   ..$ V5       : int [1:29] -649 -2167 -3164 1529 -2875 -2341 -2408 -2307 -2807 -3309 ...
    #   ..$ V6       : num [1:29] 0 0 0 0 0 0 0 0 0 0 ...
    #   ..$ X1       : int [1:29] 174 174 174 174 174 174 174 174 174 174 ...
    #   ..$ X2       : int [1:29] -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 ...
    #   ..$ X3       : int [1:29] 11 11 11 11 11 11 11 11 11 11 ...
    #   ..$ X4       : num [1:29] 28.7 28.7 28.7 28.7 28.7 ...
    #   ..$ X5       : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
    #   ..$ X6       : Factor w/ 1 level "Dead": 1 1 1 1 1 1 1 1 1 1 ...
    #   ..$ X7       : Factor w/ 1 level "SetAway": 1 1 1 1 1 1 1 1 1 1 ...
    #  $ :'data.frame': 29 obs. of  14 variables:
    #   ..$ timestamp: num [1:29] 1344429 1344429 1344429 1344429 1344429 ...
    #   ..$ V1       : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
    #   ..$ V2       : int [1:29] 1 2 3 4 5 6 7 8 9 10 ...
    #   ..$ V3       : int [1:29] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
    #   ..$ V4       : int [1:29] 415 1090 -885 -626 -852 854 761 -201 -967 -460 ...
    #   ..$ V5       : int [1:29] -649 -2167 -3169 1527 -2887 -2340 -2411 -2307 -2808 -3309 ...
    #   ..$ V6       : num [1:29] 0 0 0 0 0 0 0 0 0 0 ...
    #   ..$ X1       : int [1:29] 174 174 174 174 174 174 174 174 174 174 ...
    #   ..$ X2       : int [1:29] -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 -2563 ...
    #   ..$ X3       : int [1:29] 11 11 11 11 11 11 11 11 11 11 ...
    #   ..$ X4       : num [1:29] 28.7 28.7 28.7 28.7 28.7 ...
    #   ..$ X5       : Factor w/ 1 level "A": 1 1 1 1 1 1 1 1 1 1 ...
    #   ..$ X6       : Factor w/ 1 level "Dead": 1 1 1 1 1 1 1 1 1 1 ...
    #   ..$ X7       : Factor w/ 1 level "SetAway": 1 1 1 1 1 1 1 1 1 1 ...
    

    从这里开始,您可以分别处理这些 data.frame,或者以类似的方式(按行)将它们组合起来:

    # Stack the per-record data.frames row-wise and show the first six rows.
    head(do.call(rbind, bothdf), n = 6L)
    #   timestamp V1 V2 V3   V4    V5 V6  X1    X2 X3    X4 X5   X6      X7
    # 1   1344428 -1  1 -1  415  -649  0 174 -2563 11 28.67  A Dead SetAway
    # 2   1344428 -1  2 -1 1090 -2167  0 174 -2563 11 28.67  A Dead SetAway
    # 3   1344428 -1  3 -1 -881 -3164  0 174 -2563 11 28.67  A Dead SetAway
    # 4   1344428 -1  4 -1 -624  1529  0 174 -2563 11 28.67  A Dead SetAway
    # 5   1344428 -1  5 -1 -849 -2875  0 174 -2563 11 28.67  A Dead SetAway
    # 6   1344428 -1  6 -1  856 -2341  0 174 -2563 11 28.67  A Dead SetAway
    

    正如我在上面第 1 点中提到的,您会注意到 timestamp 列和所有 X* 列的值在各行中都是重复(冗余)的,类似于表连接的结果。