R:如何访问数据框元素中加载的数组成员

时间:2015-06-15 22:28:14

标签: r dataframe

我从csv文件中将数据加载到R数据框中,如下所示:

> head(mydata)
  row lengthArray                         sports num_runs percent_runs
1   0           4               [24, 18, 24, 18]        0            0
2   1          10 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2]        0            0
3   2           4                   [0, 0, 0, 0]        0            0
4   3           2                         [0, 0]        0            0
5   4           2                       [18, 18]        0            0
6   5           1                            [0]        0            0

我可以毫无问题地访问整数列并获取它们的类型,但我不知道如何访问sports列中的数组元素:

> class(mydata[4,3])
[1] "factor" 
>  string_factor = mydata[1,3]
> string_factor
[1] [24, 18, 24, 18]
6378 Levels: [0] [0, 0] [0, 0, 0] [0, 0, 0, 0] ... [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
> class(string_factor)
[1] "factor"
> string_factor_numeric = as.numeric(string_factor)
> string_factor_numeric
[1] 5181

我猜最地道的R回答是“不要这样做”,但数据本来就是这种格式,所以我想知道如何从数组中取出这些数字以便使用它们。

我还应该提一下,按照“Convert data.frame columns from factors to characters”中的方法操作没有给出任何错误信息,但也没有效果,因为数组列仍然被归类为因子(factor)。

更新:从评论中可以看到,下面的做法可以前进一步:
# Replace the third column (sports) with its character representation.
mydata[[3]] <- as.character(mydata[[3]])

但是,这仍然无法得到一个可以单独访问各个元素的数组。

3 个答案:

答案 0 :(得分:4)

这是使用splitstackshape的另一个想法:

library(splitstackshape)
library(dplyr)
# Drop the surrounding square brackets, then spread the comma-separated
# values into one column per array element.
sports_unbracketed <- mutate(mydata, sports = gsub("\\[|\\]", "", sports))
cSplit(sports_unbracketed, "sports", sep = ",", direction = "wide")

给出了:

   row lengthArray num_runs percent_runs sports_01 sports_02 sports_03 sports_04 sports_05 sports_06 sports_07 sports_08 sports_09 sports_10
1:   0           4        0            0        24        18        24        18        NA        NA        NA        NA        NA        NA
2:   1          10        0            0         2         2         2         2         2         2         2         2         2         2
3:   2           4        0            0         0         0         0         0        NA        NA        NA        NA        NA        NA
4:   3           2        0            0         0         0        NA        NA        NA        NA        NA        NA        NA        NA
5:   4           2        0            0        18        18        NA        NA        NA        NA        NA        NA        NA        NA
6:   5           1        0            0         0        NA        NA        NA        NA        NA        NA        NA        NA        NA

或者根据@thelatemail评论,您还可以将列表存储为列:

library(stringi)
# Extract every run of digits from each bracketed string and store the pieces
# as a list column. The "+" quantifier keeps multi-digit values such as "24"
# whole; matching a single digit at a time would split it into "2" and "4".
df <- mydata %>%
  mutate(sports = as.list(stri_extract_all(sports, regex = "[0-9]+")))

这将为您提供以下数据结构:

> str(df)
#'data.frame':  6 obs. of  5 variables:
# $ row         : int  0 1 2 3 4 5
# $ lengthArray : int  4 10 4 2 2 1
# $ sports      :List of 6
#  ..$ : chr  "24" "18" "24" "18"
#  ..$ : chr  "2" "2" "2" "2" ...
#  ..$ : chr  "0" "0" "0" "0"
#  ..$ : chr  "0" "0"
#  ..$ : chr  "18" "18"
#  ..$ : chr "0"
# $ num_runs    : int  0 0 0 0 0 0
# $ percent_runs: int  0 0 0 0 0 0 

然后,您可以像这样访问列表中的元素:

> df$sports[[1]][1] #first element of first list
#[1] "24"

答案 1 :(得分:1)

这是您使用dput的数据:

# dput() reconstruction of the six rows shown in the question.
# sports is stored as a factor: its integer codes index into .Label, whose
# entries are the bracketed array strings.
mydata = structure(list(row = 0:5, lengthArray = c(4L, 10L, 4L, 2L, 2L, 
1L), sports = structure(c(6L, 5L, 1L, 2L, 4L, 3L), .Label = c("[0, 0, 0, 0]", 
"[0, 0]", "[0]", "[18, 18]", "[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]", 
"[24, 18, 24, 18]"), class = "factor"), num_runs = c(0L, 0L, 
0L, 0L, 0L, 0L), percent_runs = c(0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("row", 
"lengthArray", "sports", "num_runs", "percent_runs"), class = "data.frame", row.names = c(NA, 
-6L))

首先我们将sports列转换为字符型(character)

# Turn the factor column into plain strings so it can be edited as text.
mydata[["sports"]] = as.character(mydata[["sports"]])

现在去掉方括号和空格(保留逗号)

library(stringr)
# Delete every "[", "]" and space; the commas stay as field separators.
mydata$sports = str_replace_all(
  string = mydata$sports,
  pattern = "\\[|\\]| ",
  replacement = ""
)

最后将sports列拆分成多列

library(tidyr)
# Split sports into one column per array position (as many columns as the
# longest array). Shorter arrays have fewer pieces than columns, so
# fill = "right" pads them with NA on the right without emitting the
# "Missing pieces filled with NA" warning. The new columns are character;
# add convert = TRUE if integer columns are wanted instead.
mydata = separate(
  mydata, sports,
  into = paste0("sport", seq_len(max(mydata$lengthArray))),
  sep = ",", extra = "drop", fill = "right"
)

mydata
#  row lengthArray sport1 sport2 sport3 sport4 sport5 sport6 sport7 sport8 sport9 sport10 num_runs percent_runs
#1   0           4     24     18     24     18   <NA>   <NA>   <NA>   <NA>   <NA>    <NA>        0            0
#2   1          10      2      2      2      2      2      2      2      2      2       2        0            0
#3   2           4      0      0      0      0   <NA>   <NA>   <NA>   <NA>   <NA>    <NA>        0            0
#4   3           2      0      0   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>    <NA>        0            0
#5   4           2     18     18   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>    <NA>        0            0
#6   5           1      0   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>   <NA>    <NA>        0            0

答案 2 :(得分:0)

重新创建数据:

# Rebuild the question's table from inline text. The single quotes keep each
# bracketed array together as one field even though it contains spaces.
raw_table <- "
row lengthArray                            sports num_runs percent_runs
   0           4               '[24, 18, 24, 18]'        0            0
   1          10 '[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]'        0            0
   2           4                   '[0, 0, 0, 0]'        0            0
   3           2                         '[0, 0]'        0            0
   4           2                       '[18, 18]'        0            0
   5           1                            '[0]'        0            0"

data <- read.table(header = TRUE, text = raw_table)

您可以把sports中的值取出来并创建新列……但是,如果想让sports列的每个元素都是一个向量,实际上可以这样做:

# Turn each "[a, b, c]" string into a numeric vector stored in a list column.
# The text is parsed directly (strip brackets, split on commas, convert)
# rather than through eval(parse()), so nothing read from the file is ever
# executed as R code. as.numeric() ignores the space left after each comma.
data$sports <- as.character(data$sports)
data$sports <- lapply(
  strsplit(gsub("\\[|\\]", "", data$sports), ",", fixed = TRUE),
  as.numeric
)

现在,例如,如果您想获得sports第一行的第三个值:

data$sports[[1]][[3]]
[1] 24