我的数据有一列,我正在尝试使用行中每个“/”之后的内容创建其他列。以下是数据的前几行:
> dput(mydata)
structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c("/
ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.g
r/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,
-5L))
如果变量出现在行中,结果应该如此(数据框)在新列中带有“1”,否则为“0”:
> dput(Result)
structure(list(ALL = structure(c(1L, 4L, 5L, 3L, 2L), .Label = c("/ca
/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL", "/ca/put/sent_1/fe.gr/
eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov", "/ca/put/sent_1fe.
gr/eq2_on/eq2_off/hi.on/hi.ov"
), class = "factor"), ca = c(1L, 1L, 1L, 1L, 1L), put = c(1L,
1L, 1L, 1L, 1L), sent_1 = c(1L, 1L, 1L, 1L, 1L), fe.gr = c(1L,
1L, 1L, 1L, 1L), eq2_on = c(1L, 1L, 1L, 1L, 1L), eq2_off = c(1L,
1L, 1L, 1L, 1L), hi.on = c(0L, 1L, 1L, 0L, 0L), hi.ov = c(0L,
1L, 1L, 0L, 0L), cni_at.p3x.4 = c(0L, 0L, 0L, 1L, 0L), cbr_LBL = c(0L
,
0L, 0L, 0L, 1L)), .Names = c("ALL", "ca", "put", "sent_1", "fe.gr",
"eq2_on", "eq2_off", "hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"
), class = "data.frame", row.names = c(NA, -5L))
我尝试过很多功能,包括strsplit和sapply:
sapply(strsplit(as.character(mydata$ALL), “\\/”), “[[“, 2) #returns "ca"s only
sapply(strsplit(as.character(mydata$ALL), "\\/"), "[[", 3) #returns "put"s only
有数百万行,我非常感谢任何快速有效的行。
答案 0 :(得分:4)
使用我维护的 qdapTools 包中的mtabuate
:
library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))
## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1 1 1 0 0 1 1 1 0 0 1 1 0
## 2 1 1 0 0 1 1 1 1 1 1 1 0
## 3 1 1 0 0 1 1 0 1 1 1 0 1
## 4 1 1 0 1 1 1 1 0 0 1 1 0
## 5 1 1 1 0 1 1 1 0 0 1 1 0
答案 1 :(得分:4)
您可以使用我的“splitstackshape”软件包中的cSplit_e
:
library(splitstackshape)
cSplit_e(mydata, "ALL", "/", type = "character", fill = 0)
# ALL ALL_ca ALL_cbr_LBL
# 1 /ca/put/sent_1/fe.gr/eq2_on/eq2_off 1 0
# 2 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 3 /ca/put/sent_1fe.gr/eq2_on/eq2_off/hi.on/hi.ov 1 0
# 4 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4 1 0
# 5 /ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL 1 1
# ALL_cni_at.p3x.4 ALL_eq2_off ALL_eq2_on ALL_fe.gr ALL_hi.on ALL_hi.ov ALL_put
# 1 0 1 1 1 0 0 1
# 2 0 1 1 1 1 1 1
# 3 0 1 1 0 1 1 1
# 4 1 1 1 1 0 0 1
# 5 0 1 1 1 0 0 1
# ALL_sent_1 ALL_sent_1fe.gr
# 1 1 0
# 2 1 0
# 3 0 1
# 4 1 0
# 5 1 0
(注意:我认为dput
的第3行存在问题,这就是为什么它与您想要的输出不匹配。请注意,第3行中的第三项是“sent_1fe.gr”而没有“ /“他们之间。”
答案 2 :(得分:2)
这样的事情
spt <- strsplit(as.character(mydata$ALL),"/", fixed=T)
do.call(rbind, lapply(lapply(spt, factor, levels=unique(unlist(spt))), table))
返回
ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov sent_1fe.gr cni_at.p3x.4 cbr_LBL
[1,] 1 1 1 1 1 1 1 0 0 0 0 0
[2,] 1 1 1 1 1 1 1 1 1 0 0 0
[3,] 1 1 1 1 0 1 1 1 1 1 0 0
[4,] 1 1 1 1 1 1 1 0 0 0 1 0
[5,] 1 1 1 1 1 1 1 0 0 0 0 1
答案 3 :(得分:1)
其他选项是在melt
到split
表单中list
long
字符串,然后使用table
library(reshape2)
as.data.frame.matrix(table(melt(strsplit(as.character(
mydata[[1]]), "/"))[2:1]))[,-1]
# ca eq2_off eq2_on fe.gr put sent_1 hi.on hi.ov sent_1fe.gr cni_at.p3x.4
#1 1 1 1 1 1 1 0 0 0 0
#2 1 1 1 1 1 1 1 1 0 0
#3 1 1 1 0 1 0 1 1 1 0
#4 1 1 1 1 1 1 0 0 0 1
#5 1 1 1 1 1 1 0 0 0 0
# cbr_LBL
#1 0
#2 0
#3 0
#4 0
#5 1
答案 4 :(得分:1)
tidyverse
解决方案
library(tidyverse)
mydata %>%
rownames_to_column() %>%
mutate(key = strsplit(levels(ALL)[ALL],"/"),value=1) %>%
unnest %>%
spread(key,value,0) %>%
select(-rowname)
# ALL ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1
# 1 1 1 1 0 0 1 1 1 0 0 1 1
# 2 4 1 1 0 0 1 1 1 1 1 1 1
# 3 4 1 1 0 0 1 1 1 1 1 1 1
# 4 3 1 1 0 1 1 1 1 0 0 1 1
# 5 2 1 1 1 0 1 1 1 0 0 1 1
数据强>
mydata <- structure(list(ALL = structure(c(1L, 4L, 4L, 3L, 2L), .Label = c(
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"), class = "factor
")), .Names = "ALL", class = "data.frame", row.names = c(NA,-5L))
答案 5 :(得分:0)
这是一个使用dplyr和tidyr的解决方案(注意:我清理了似乎是什么 你的样本数据中缺少/第三行):
## Input
input <- structure(
list(ALL = structure(c(1L, 4L, 5L, 3L, 2L),
.Label = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov"),
class = "factor")),
.Names = "ALL", class = "data.frame", row.names = c(NA, -5L))
## Solution
require(dplyr)
require(tidyr)
solution <- input %>%
mutate(temp = sub("^/", "", ALL)) %>%
separate(temp,
c("ca", "put", "sent_1", "fe.gr", "eq2_on", "eq2_off",
"hi.on", "hi.ov", "cni_at.p3x.4", "cbr_LBL"),
"/", extra="merge") %>%
mutate_each(funs(as.numeric(!is.na(.))), -ALL)
答案 6 :(得分:0)
我自己遇到了类似但更复杂的问题,我想到了以下功能方式,该方式允许以“一种'热门'方式分别转换列的字符串分隔字符值以及分类变量本身:
oneHotOnText <- function(datatable, columns, seperator=", "){ #argument columns is character vector or numeric vector
if(! "data.table" %in% .packages()) if(!require(data.table)) { install.packages("data.table"); library(data.table) }
if(! "data.table" %in% class(datatable)) TempDT <- as.data.table(datatable) else TempDT <- copy(datatable)
for(i in TempDT[, columns, with = F]){
if(class(i) != "character") i <- as.character(i)
uniqueValues <- unique(unlist(strsplit(unique(i), split=seperator)))
if(any(uniqueValues %in% names(TempDT))) { print("Value/s of the selected column/s is/are present as variables name/s. Rename it/them.")
rm(TempDT)
break }
for(j in uniqueValues) TempDT[, (j) := ifelse(grepl(j, i), 1L, 0L)]
}
if(exists("TempDT")) return(TempDT)
}
DF = data.frame(
aColumn=rep(c("f", "b", "c"), 100000),
xColumn=rep(c("N/W", "W", "R"), 100000),
yColumn=rep(c("A/B", "A/V", "B/G"), 100000),
zColumn=rep(20:22, 100000))
str(DF) #factors are present in the data.frame
oneHotOnText(DF, columns = c("aColumn", "xColumn", "yColumn"), seperator="/")[] #applies the function, returns a data.table and prints the result
# aColumn xColumn yColumn zColumn f b c N W R A B V G
# 1: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 2: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# 3: c R B/G 22 0 0 1 0 0 1 0 1 0 1
# 4: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
# 5: b W A/V 21 0 1 0 0 1 0 1 0 1 0
# ---
#299996: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#299997: c R B/G 22 0 0 1 0 0 1 0 1 0 1
#299998: f N/W A/B 20 1 0 0 1 1 0 1 1 0 0
#299999: b W A/V 21 0 1 0 0 1 0 1 0 1 0
#300000: c R B/G 22 0 0 1 0 0 1 0 1 0 1
类似地,它适用于OP的问题:
input <- data.frame(ALL = c("/ca/put/sent_1/fe.gr/eq2_on/eq2_off",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/hi.on/hi.ov",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cni_at.p3x.4",
"/ca/put/sent_1/fe.gr/eq2_on/eq2_off/cbr_LBL"
))
oneHotOnText(input, columns = "ALL", seperator = "/")[]
答案 7 :(得分:0)
这是另外一种library(tidyverse)
nn <- max(str_count(df$ALL, "/"), na.rm = T)+1
df %>% mutate(row_id =row_number()) %>%
select(row_id, everything()) %>%
separate(ALL, into = paste("tag", 1:nn), sep = "/") %>%
pivot_longer(cols = paste("tag", 2:nn), names_to = "name", values_to = "val") %>%
filter(!is.na(val)) %>%
select(-`tag 1`) %>%
mutate(new=1) %>%
#group_by(row_id) %>%
pivot_wider(id_cols = row_id, names_from = val, values_from = new, values_fill =0)
# A tibble: 5 x 11
row_id ca put sent_1 fe.gr eq2_on eq2_off hi.on hi.ov cni_at.p3x.4
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 1 1 1 1 1 0 0 0
2 2 1 1 1 1 1 1 1 1 0
3 3 1 1 1 1 1 1 1 1 0
4 4 1 1 1 1 1 1 0 0 1
5 5 1 1 1 1 1 1 0 0 0
# ... with 1 more variable: cbr_LBL <dbl>
解决方案
{{1}}