我正在尝试将 factor 列转换为多个布尔列,如下图所示。数据来自使用精细weatherData
包检索的气象站。我想要转换为多个布尔列的因子列包含11个因子。其中一些是单一"事件",其中一些是"事件"的组合。
这是一张显示我想要实现的目标的图片: 这是R代码,它将生成具有组合因子的数据框,我想将其转换为几个布尔列:
df <- read.table(text="
date Events
1/8/2013 Rain
1/9/2013 Fog
1/10/2013 ''
1/11/2013 Fog-Rain
1/12/2013 Snow
1/13/2013 Rain-Snow
1/14/2013 Rain-Thunderstorm
1/15/2013 Thunderstorm
1/16/2013 Fog-Rain-Thunderstorm
1/17/2013 Fog-Thunderstorm
1/18/2013 Fog-Rain-Thunderstorm-Snow",
header=T)
df$date <- as.character(as.Date(df$date, "%m/%d/%Y"))
提前致谢。
答案 0 :(得分:11)
你可以尝试:
lst <- strsplit(as.character(df$Events),"-")
lvl <- unique(unlist(lst))
res <- data.frame(date=df$date,
do.call(rbind,lapply(lst, function(x) table(factor(x, levels=lvl)))),
stringsAsFactors=FALSE)
res
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
# 11 2013-01-18 1 1 1 1
或者可能,这可能比上面更快(由@alexis_laz提供)
setNames(data.frame(df$date, do.call(rbind,lapply(lst, function(x) as.integer(lvl %in% x)) )), c("date", lvl))
或者
library(devtools)
library(data.table)
source_gist("11380733")
library(reshape2) #In case it is needed
res1 <- dcast.data.table(cSplit(df, "Events", "-", "long"), date~Events)
res2 <- merge(subset(df, select=1), res1, by="date", all=TRUE)
res2 <- as.data.frame(res2)
res2[,-1] <- (!is.na(res2[,-1]))+0
res2[,c(1,3,2,4,5)]
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
#11 2013-01-18 1 1 1 1
或者
library(qdap)
with(df, termco(Events, date, c("Rain", "Fog", "Snow", "Thunderstorm")))[[1]][,-2]
# date Rain Fog Snow Thunderstorm
#1 2013-01-08 1 0 0 0
#2 2013-01-09 0 1 0 0
#3 2013-01-10 0 0 0 0
#4 2013-01-11 1 1 0 0
#5 2013-01-12 0 0 1 0
#6 2013-01-13 1 0 1 0
#7 2013-01-14 1 0 0 1
#8 2013-01-15 0 0 0 1
#9 2013-01-16 1 1 0 1
#10 2013-01-17 0 1 0 1
#11 2013-01-18 1 1 1 1
答案 1 :(得分:8)
我能想到的最简单的事情是来自我的&#34; splitstackshape&#34; concat.split.expanded
。包(devel version 1.3.0, from GitHub)。
## Get the right version of the package
library(devtools)
install_github("splitstackshape", "mrdwab", ref = "devel")
packageVersion("splitstackshape")
# [1] ‘1.3.0’
## Split up the relevant column
concat.split.expanded(df, "Events", "-", type = "character",
fill = 0, drop = TRUE)
# date Events_Fog Events_Rain Events_Snow Events_Thunderstorm
# 1 2013-01-08 0 1 0 0
# 2 2013-01-09 1 0 0 0
# 3 2013-01-10 0 0 0 0
# 4 2013-01-11 1 1 0 0
# 5 2013-01-12 0 0 1 0
# 6 2013-01-13 0 1 1 0
# 7 2013-01-14 0 1 0 1
# 8 2013-01-15 0 0 0 1
# 9 2013-01-16 1 1 0 1
# 10 2013-01-17 1 0 0 1
# 11 2013-01-18 1 1 1 1
回答这个问题,我意识到我有点愚蠢地硬编码了一个&#34; trim&#34; concat.split.expanded
中的功能可以减慢很多事情。如果您想要更快的方法,请直接在&#34;事件&#34;的分割版本上使用charMat
(由concat.split.expanded
调用的函数)。专栏,像这样:
splitstackshape:::charMat(
strsplit(as.character(indf[, "Events"]), "-", fixed = TRUE), fill = 0)
对于某些基准测试,请查看this Gist。
答案 2 :(得分:4)
可以使用&#39; grep&#39;
来完成基础R.ddf = data.frame(df$date, df$Events, "Rain"=rep(0), "Fog"=rep(0), "Snow"=rep(0), "Thunderstorm"=rep(0))
for(i in 3:6) ddf[grep(names(ddf)[i],ddf[,2]),i]=1
ddf
df.date df.Events Rain Fog Snow Thunderstorm
1 2013-01-08 Rain 1 0 0 0
2 2013-01-09 Fog 0 1 0 0
3 2013-01-10 0 0 0 0
4 2013-01-11 Fog-Rain 1 1 0 0
5 2013-01-12 Snow 0 0 1 0
6 2013-01-13 Rain-Snow 1 0 1 0
7 2013-01-14 Rain-Thunderstorm 1 0 0 1
8 2013-01-15 Thunderstorm 0 0 0 1
9 2013-01-16 Fog-Rain-Thunderstorm 1 1 0 1
10 2013-01-17 Fog-Thunderstorm 0 1 0 1
11 2013-01-18 Fog-Rain-Thunderstorm-Snow 1 1 1 1
答案 3 :(得分:3)
这是qdapTools
的方法:
library(qdapTools)
matrix2df(mtabulate(lapply(split(as.character(df$Events), df$date),
function(x) strsplit(x, "-")[[1]])), "Date")
## Date Fog Rain Snow Thunderstorm
## 1 2013-01-08 0 1 0 0
## 2 2013-01-09 1 0 0 0
## 3 2013-01-10 0 0 0 0
## 4 2013-01-11 1 1 0 0
## 5 2013-01-12 0 0 1 0
## 6 2013-01-13 0 1 1 0
## 7 2013-01-14 0 1 0 1
## 8 2013-01-15 0 0 0 1
## 9 2013-01-16 1 1 0 1
## 10 2013-01-17 1 0 0 1
## 11 2013-01-18 1 1 1 1
以下是与magrittr
相同的答案,因为它使链更清晰:
split(as.character(df$Events), df$date) %>%
lapply(function(x) strsplit(x, "-")[[1]]) %>%
mtabulate() %>%
matrix2df("Date")
答案 4 :(得分:2)
创建具有因子
的向量set.seed(1)
n <- c("Rain", "Fog", "Snow", "Thunderstorm")
v <- sapply(sample(0:3,100,T), function(i) paste0(sample(n,i), collapse = "-"))
v <- as.factor(v)
返回带有所需输出的矩阵的函数,该输出可以cbind
加到初始data.frame
mSplit <- function(vec) {
if (!is.character(vec))
vec <- as.character(vec)
L <- strsplit(vec, "-")
ids <- unlist(lapply(seq_along(L), function(i) rep(i, length(L[[i]])) ))
U <- sort(unique(unlist(L)))
M <- matrix(0, nrow = length(vec),
ncol = length(U),
dimnames = list(NULL, U))
M[cbind(ids, match(unlist(L), U))] <- 1L
M
}
解决方案基于Ananda Mahto对SO question的回答。它应该很快。
res <- mSplit(v)
答案 5 :(得分:2)
我认为在这种情况下,您需要对函数dummy
的简单调用。我们称之为目标列。 target_cat
。
df_target_bin <- data.frame(dummy(target_cat, "<prefix>"))
这将创建一个新数据框,其中每个列target_cat
的值分别为0和1s。
要将列转换为逻辑列,我的意思是将值分别为TRUE
和FALSE
,然后使用函数as.logical
。
df_target_logical <- apply(df_target_bin, as.logical)
答案 6 :(得分:2)
以@rnso的答案为基础
以下内容将识别所有唯一元素,然后动态生成其中包含相关数据的新列。
options = unique(unlist(strsplit(df$Events, '-'), recursive=FALSE))
for(o in options){
df$newcol = rep(0)
df <- rename(df, !!o := newcol)
df[grep(o, df$Events), o] = 1
}
结果:
date Events Rain Fog Snow Thunderstorm
1 2013-01-08 Rain 1 0 0 0
2 2013-01-09 Fog 0 1 0 0
3 2013-01-10 0 0 0 0
4 2013-01-11 Fog-Rain 1 1 0 0
5 2013-01-12 Snow 0 0 1 0
6 2013-01-13 Rain-Snow 1 0 1 0
7 2013-01-14 Rain-Thunderstorm 1 0 0 1
8 2013-01-15 Thunderstorm 0 0 0 1
9 2013-01-16 Fog-Rain-Thunderstorm 1 1 0 1
10 2013-01-17 Fog-Thunderstorm 0 1 0 1
11 2013-01-18 Fog-Rain-Thunderstorm-Snow 1 1 1 1