多列csv文本到存在缺席matri

时间:2017-08-09 02:01:52

标签: r csv text binary

我有一个像这样的csv文件:

    col1    col2    col3
r1  a,b,c   e,f      g
r2  h,i             j,k
r3  l      m,n,o

有些单元格有多个文本逗号分隔,有些有单个,有些没有。我想将其转换为:

   col1  col2  col3
a   1     0     0
b   1     0     0
c   1     0     0
e   0     1     0
f   0     1     0
g   0     0     1
h   1     0     0
i   1     0     0
j   0     0     1
k   0     0     1
l   1     0     0
m   0     1     0
n   0     1     0
o   0     1     0

有什么建议吗?我在excel中尝试了数据透视表但没有获得所需的输出。 提前谢谢。

最诚挚的问候 Zillur

2 个答案:

答案 0 :(得分:0)

不确定这是否是最短的解决方案(可能不是),但它会产生所需的输出。基本上,我们遍历所有三列并计算字符串的出现次数并获得一个长格式数据框,然后我们将其翻转为您想要的宽格式。

library(tidyr)
library(purrr)

df <- data_frame(col1 = c("a,b,c", "h,i", "l"),
             col2 = c("e,f", "", "m,n,o"),
             col3 = c("g", "j,k", ""))

let_df <- map_df(df, function(col){
  # map_df applies the function to each column of df

  # split strings at "," and unlist to get vector of letters 
  letters <- unlist(str_split(col, ","))
  # delete "" 
  letters <- letters[nchar(letters) > 0]

  # count occurrences for each letter
  tab <- table(letters)

  # replace with 1 if occurs more often
  tab[tab > 1] <- 1 

  # create data frame from table
  df <- data_frame(letter = names(tab), count = tab) 
  return(df)
}, .id = "col") # id adds a column col that contains col1 - col3

# bring data frame into wide format 
let_df %>% 
  spread(col, count, fill = 0)

答案 1 :(得分:0)

要解决这么大的问题。以下是我在基地R中的看法:

col1 <- c("a,b,c","h,i","l")
col2 <- c("e,f","","m,n,o")
col3 <- c("g","j,k","")
data <- data.frame(col1, col2, col3, stringsAsFactors = F)

restructure <- function(df){
df[df==""] <- "missing"
result_rows <- as.character()
l <- list()
for (i in seq_along(colnames(df)) ){
  df_col <- sort(unique(unlist(strsplit(gsub(" ", "",toString(df[[i]])), ","))))
  df_col <- df_col[!df_col %in% "missing"]
  result_rows <- sort(unique(c(result_rows, df_col)))
  l[i] <- list(df_col)
}
result <- data.frame(result_rows)
for (j in seq_along(l)){
  result$temp <- NA
  result$temp[match(l[[j]], result_rows)] <- 1
  colnames(result)[colnames(result)=="temp"] <- colnames(df)[j]
}
result[is.na(result)] <- 0
return(result)
}

> restructure(data)
#  result_rows col1 col2 col3
#1            a    1    0    0
#2            b    1    0    0
#3            c    1    0    0
#4            e    0    1    0
#5            f    0    1    0
#6            g    0    0    1
#7            h    1    0    0
#8            i    1    0    0
#9            j    0    0    1
#10           k    0    0    1
#11           l    1    0    0
#12           m    0    1    0
#13           n    0    1    0
#14           o    0    1    0