我的数据框有很多行,但只有一列,每个值都是类似 107,114,142,143,146 的逗号分隔字符串。
我想新建一列,用来标记该行是否包含某个特定值,比如114。
参考另一篇帖子后,我尝试了以下代码:
# Attempt from the question: fill the new flag column with 0, then try to set
# it to "1" on the rows that match 114.
# NOTE(review): nrow() is called on `event_list` while the assignment target is
# `e_list` -- presumably both were meant to be the same data frame; confirm.
e_list$completed_forms <- rep(0, nrow(event_list))
# NOTE(review): the sample data shown below names its column `event_list`, not
# `e_list`. If `e_list$e_list` does not exist, the comparison has length zero,
# no rows are selected, and the assignment fails with the quoted error.
# Also, `==` compares the whole comma-separated string with 114, not the
# individual codes inside it.
e_list[e_list$e_list == 114, ][, "completed_forms"] <- "1"
数据列表
# dput() of the sample data: a data.frame with a character column of
# comma-separated event codes (some entries are empty strings) and a numeric
# column of zeros. The trailing .Names attribute sets the column names to
# "event_list" and "completed_forms".
structure(list(event_list = c("211,202,214,240,104,105,106,107,114,117,118,139,140,141,142,143,146",
"211,202,214,240,104,105,106,107,114,117,118,121,139,140,141,142,143,146",
"211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146",
"211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146",
"211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146",
"211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146",
"211,202,214,240,215,216,104,105,106,107,114,117,118,120,121,139,140,141,142,143,146",
"211,202,214,240,215,216,104,105,106,107,114,117,118,120,121,139,140,141,142,143,146",
"211,114,117,118,146", "211,104,114,117,118,121,146", "211,202,104,114,117,118,121,139,141,142,143,146",
"211,202,214,104,105,106,107,114,117,118,121,139,141,142,143,146",
"211,202,214,215,104,105,106,107,114,117,118,121,139,141,142,143,146",
"211,202,214,215,216,104,105,106,107,114,117,118,120,121,139,141,142,143,146",
"211,202,214,215,216,203,240,104,105,106,107,114,117,118,120,121,139,140,141,142,143,146",
"", "211,114,117,118,146", "211,114,117,118,146", "211,104,114,117,118,121,146",
"211,202,104,114,117,118,121,139,141,142,143,146", "211,202,214,104,105,106,107,114,117,118,121,139,141,142,143,146",
"211,202,214,215,104,105,106,107,114,117,118,121,139,141,142,143,146",
"211,202,214,215,216,104,105,106,107,114,117,118,120,121,139,141,142,143,146",
"211,202,214,215,216,217,240,104,105,106,107,114,117,118,120,121,139,140,141,142,143,146",
"211,202,214,215,216,217,240,203,104,105,106,107,114,117,118,120,121,122,139,140,141,142,143,146",
"211,202,214,215,216,217,240,203,104,105,106,107,114,117,118,120,121,122,139,140,141,142,143,146",
"211,202,214,215,216,217,240,203,104,105,106,107,114,117,118,120,121,122,139,140,141,142,143,146",
"", "211,114,117,118,146", "211,114,117,118,146"), lead_completed_new_forms = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("event_list", "completed_forms"
), row.names = c(NA, 30L), class = "data.frame")
但我收到了下面的错误,而且不明白原因,因为我已经在第一行代码中给每一行都填上了0。
Error in `[<-.data.frame`(`*tmp*`, , "completed_forms", value = "1") : replacement has 1 row, data has 0
答案 0 :(得分:0)
完整代码:
library(stringr)
library(tidyverse)

# Split each comma-separated event string into one column per event.
# The column count must be the largest number of events found in any row,
# i.e. max(lengths(...)). length() of the strsplit() result is only the number
# of rows, so whenever a row held more events than the data has rows the
# surplus events would be lumped together in the last column.
# max(1L, ...) keeps the count valid when every string is empty.
n_events <- max(1L, lengths(strsplit(df$event_list, ",")))
df2 <- as.data.frame(str_split_fixed(df$event_list, ",", n_events))

# seq_len() is the safe index idiom: 1:n yields c(1, 0) when n is 0.
names(df2) <- paste0("event_", seq_len(ncol(df2)))
df2$sequence <- paste0("seq_", seq_len(nrow(df2)))

# NOTE: gather()/spread() are superseded in tidyr by pivot_longer() and
# pivot_wider(); they are kept here to match the explanation that follows.
df3 <- df2 %>%
  # Wide -> long: one row per (sequence, event slot).
  gather(event, event_num, -sequence) %>%
  # Padding cells are "", so turn them into NA and drop them.
  replace(. == "", NA) %>%
  filter(!is.na(event_num)) %>%
  select(-event) %>%
  # spread() stops with an error on duplicate (sequence, event_num) pairs,
  # so collapse events that are repeated within a sequence first.
  distinct() %>%
  mutate(Count = 1) %>%
  # Long -> wide: one 0/1 indicator column per event number.
  spread(event_num, Count) %>%
  replace(is.na(.), 0)
分步说明:
在给数据打标记之前,我会先对这个数据集做一些整理,这样后续操作会更容易。
head(df)
event_list completed_forms
1 211,202,214,240,104,105,106,107,114,117,118,139,140,141,142,143,146 0
2 211,202,214,240,104,105,106,107,114,117,118,121,139,140,141,142,143,146 0
3 211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146 0
4 211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146 0
5 211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146 0
6 211,202,214,240,215,104,105,106,107,114,117,118,121,139,140,141,142,143,146 0
我先使用 stringr 包中的 str_split_fixed。如果尚未安装该包,请运行 install.packages("stringr")。
str_split_fixed 返回的是 matrix 而不是 data frame,因此我们把结果整个包在 as.data.frame() 中。
library(stringr)
# The number of columns must be the largest number of events in any row, i.e.
# max(lengths(...)). length() of the strsplit() result is only the number of
# rows, so a row with more events than the data has rows would have its
# surplus events lumped into the last column. max(1L, ...) keeps the count
# valid when every string is empty.
# NOTE: df2 is now only as wide as the longest row, so it can have fewer
# columns than when the row count was used.
n_events <- max(1L, lengths(strsplit(df$event_list, ",")))
df2 <- as.data.frame(str_split_fixed(df$event_list, ",", n_events))
这一步会把像 "134, 444, 105, 106" 这样的字符串从一列拆分成4列。
拆分出的列数应等于单行事件数的最大值:如果一行有4个事件、一行有14个事件、另一行有44个事件,那么每一行都会被拆成44列,以容纳事件最多的那一行,较短的行用空字符串补齐。
注意:length(strsplit(...)) 返回的是行数而不是每行的事件数;要得到每行事件数的最大值,应使用 max(lengths(strsplit(...)))。
**检查:**
> head(df2)
V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30
1 211 202 214 240 104 105 106 107 114 117 118 139 140 141 142 143 146
2 211 202 214 240 104 105 106 107 114 117 118 121 139 140 141 142 143 146
3 211 202 214 240 215 104 105 106 107 114 117 118 121 139 140 141 142 143 146
4 211 202 214 240 215 104 105 106 107 114 117 118 121 139 140 141 142 143 146
5 211 202 214 240 215 104 105 106 107 114 117 118 121 139 140 141 142 143 146
6 211 202 214 240 215 104 105 106 107 114 117 118 121 139 140 141 142 143 146
接下来把列名改成事件序号,让表头更清晰:
# Name the split columns event_1, event_2, ...; seq_len() is the safe index
# idiom because 1:n yields c(1, 0) when n is 0.
names(df2) <- paste0("event_", seq_len(ncol(df2)))
现在新建一列来标识每一行事件序列,我把这一列命名为 sequence:
# Give every row an id (seq_1, seq_2, ...) so it can be tracked through the
# reshape; seq_len() is the safe index idiom because 1:n yields c(1, 0) when
# n is 0.
df2$sequence <- paste0("seq_", seq_len(nrow(df2)))
现在我们把数据从宽格式转换为长格式,以便更容易处理。可以使用 tidyverse 中的 gather 函数来完成。
我们还要把空字符串转换为 NA,然后把这些行从数据框中删除。此后 event 列也就不再需要了。
最后再加几步,把数据重新展开成一张表:列名是事件编号,单元格的值是你想要的1或0。
library(tidyverse)
# Reshape the one-column-per-slot table into one 0/1 indicator column per
# event number, with one row per sequence.
df3 <- df2 %>%
  # Wide -> long: one row per (sequence, event slot).
  gather(event, event_num, -sequence) %>%
  # Padding cells are "", so turn them into NA and drop them.
  replace(. == "", NA) %>%
  filter(!is.na(event_num)) %>%
  # The slot name is no longer needed once the padding is gone.
  select(-event) %>%
  # spread() stops with an error on duplicate (sequence, event_num) pairs,
  # so collapse events that are repeated within a sequence first.
  distinct() %>%
  mutate(Count = 1) %>%
  # Long -> wide: event numbers become column headers holding Count.
  spread(event_num, Count) %>%
  # Events absent from a sequence come back as NA; report them as 0.
  replace(is.na(.), 0)
希望这能回答你的问题!