我从csv文件中将数据加载到R数据框中,如下所示:
> head(mydata)
row lengthArray sports num_runs percent_runs
1 0 4 [24, 18, 24, 18] 0 0
2 1 10 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2] 0 0
3 2 4 [0, 0, 0, 0] 0 0
4 3 2 [0, 0] 0 0
5 4 2 [18, 18] 0 0
6 5 1 [0] 0 0
我可以访问整数列并获取它们的类型,这没有问题,但我无法弄清楚如何访问sports列:
> class(mydata[4,3])
[1] "factor"
> string_factor = mydata[1,3]
> string_factor
[1] [24, 18, 24, 18]
6378 Levels: [0] [0, 0] [0, 0, 0] [0, 0, 0, 0] ... [9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9]
> class(string_factor)
[1] "factor"
> string_factor_numeric = as.numeric(string_factor)
> string_factor_numeric
[1] 5181
我想最好的R回答是“不要这样做”,但数据就是以这种形式提供的,所以我想知道如何从数组中取出这些数字,以便我可以使用它们。
我还应该提一下,“Convert data.frame columns from factors to characters”中的方法没有给出任何错误信息,但也没有效果,因为数组列仍然被归类为因子(factor)。
更新:从评论中可以看到,下面的做法可以让您前进一步:
# Replace the third column (the factor `sports`) with its character labels.
mydata[[3]] <- as.character(mydata[[3]])
但是,这样仍然得不到一个其元素可以单独访问的数组。
答案 0 :(得分:4)
这是使用splitstackshape的另一个想法:
library(splitstackshape)
library(dplyr)
# Strip the square brackets from `sports`, then let cSplit() spread the
# comma-separated values into one column per element (wide layout).
cSplit(
  mutate(mydata, sports = gsub("\\[|\\]", "", sports)),
  "sports",
  sep = ",",
  direction = "wide"
)
给出了:
row lengthArray num_runs percent_runs sports_01 sports_02 sports_03 sports_04 sports_05 sports_06 sports_07 sports_08 sports_09 sports_10
1: 0 4 0 0 24 18 24 18 NA NA NA NA NA NA
2: 1 10 0 0 2 2 2 2 2 2 2 2 2 2
3: 2 4 0 0 0 0 0 0 NA NA NA NA NA NA
4: 3 2 0 0 0 0 NA NA NA NA NA NA NA NA
5: 4 2 0 0 18 18 NA NA NA NA NA NA NA NA
6: 5 1 0 0 0 NA NA NA NA NA NA NA NA NA
或者根据@thelatemail评论,您还可以将列表存储为列:
library(stringi)
# Store each row's numbers as an integer vector inside a list column.
# The pattern "[0-9]+" matches whole runs of digits; a single-character class
# such as "[:digit:]" would split "24" into "2" and "4".
df <- mydata %>%
  mutate(sports = lapply(stri_extract_all(sports, regex = "[0-9]+"), as.integer))
这将为您提供以下数据结构:
> str(df)
#'data.frame': 6 obs. of 5 variables:
# $ row : int 0 1 2 3 4 5
# $ lengthArray : int 4 10 4 2 2 1
# $ sports :List of 6
# ..$ : int [1:4] 24 18 24 18
# ..$ : int [1:10] 2 2 2 2 2 2 2 2 2 2
# ..$ : int [1:4] 0 0 0 0
# ..$ : int [1:2] 0 0
# ..$ : int [1:2] 18 18
# ..$ : int 0
# $ num_runs : int 0 0 0 0 0 0
# $ percent_runs: int 0 0 0 0 0 0
然后,您可以像这样访问列表中的元素:
> df$sports[[1]][1] #first element of first list
#[1] 24
答案 1 :(得分:1)
这是用dput导出的您的数据:
# Rebuild the six-row example data frame. `sports` is a factor whose levels are
# listed explicitly so their order (and the integer codes) match the dput dump.
mydata <- data.frame(
  row = 0:5,
  lengthArray = c(4L, 10L, 4L, 2L, 2L, 1L),
  sports = factor(
    c(
      "[24, 18, 24, 18]",
      "[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",
      "[0, 0, 0, 0]",
      "[0, 0]",
      "[18, 18]",
      "[0]"
    ),
    levels = c(
      "[0, 0, 0, 0]",
      "[0, 0]",
      "[0]",
      "[18, 18]",
      "[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]",
      "[24, 18, 24, 18]"
    )
  ),
  num_runs = rep(0L, 6L),
  percent_runs = rep(0L, 6L)
)
首先我们将sports列转换为字符型(character):
# Turn the factor column into plain character strings.
mydata[["sports"]] <- as.character(mydata[["sports"]])
现在我将去掉括号和空格(保留逗号):
library(stringr)
# Remove "[", "]" and spaces from every entry, leaving digits and commas only.
mydata$sports <- str_replace_all(
  mydata$sports,
  pattern = "\\[|\\]| ",
  replacement = ""
)
最后将sports列拆分成多个列:
library(tidyr)
# Split `sports` on commas into sport1..sportN, where N is the longest array
# length recorded in `lengthArray`.
# fill = "right" pads shorter rows with NA on the right and avoids the
# "missing pieces" warning that the default fill = "warn" raises for them.
# seq_len() is used instead of 1:max(...) so a zero maximum yields no columns
# rather than the sequence c(1, 0).
mydata <- separate(
  mydata, sports,
  into = paste0("sport", seq_len(max(mydata$lengthArray))),
  sep = ",", extra = "drop", fill = "right"
)
mydata
# row lengthArray sport1 sport2 sport3 sport4 sport5 sport6 sport7 sport8 sport9 sport10 num_runs percent_runs
#1 0 4 24 18 24 18 <NA> <NA> <NA> <NA> <NA> <NA> 0 0
#2 1 10 2 2 2 2 2 2 2 2 2 2 0 0
#3 2 4 0 0 0 0 <NA> <NA> <NA> <NA> <NA> <NA> 0 0
#4 3 2 0 0 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 0 0
#5 4 2 18 18 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 0 0
#6 5 1 0 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> 0 0
答案 2 :(得分:0)
重新创建数据:
# Rebuild the sample table as whitespace-separated text. The single quotes keep
# each bracketed array together as one field when read.table() parses it.
text <- paste(
  c(
    "",
    "row lengthArray sports num_runs percent_runs",
    "0 4 '[24, 18, 24, 18]' 0 0",
    "1 10 '[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]' 0 0",
    "2 4 '[0, 0, 0, 0]' 0 0",
    "3 2 '[0, 0]' 0 0",
    "4 2 '[18, 18]' 0 0",
    "5 1 '[0]' 0 0"
  ),
  collapse = "\n"
)
data <- read.table(text = text, header = TRUE)
您可以把sports中的值取出来并创建新列……但是,如果要在sports列中直接存放向量,您实际上可以这样做:
# Convert each "[a, b, c]" string in `sports` into a numeric vector, stored as
# a list column. The text is parsed as data (strip brackets and whitespace,
# split on commas, convert to numeric) rather than evaluated as R code, so
# unexpected content in the column cannot be executed.
data$sports <- as.character(data$sports)
data$sports <- lapply(
  strsplit(gsub("\\[|\\]|\\s", "", data$sports), ",", fixed = TRUE),
  as.numeric
)
现在,例如,如果您想获得第一行sports的第三个值:
data$sports[[1]][[3]]
[1] 24