我有一个包含许多列的数据。例如,这是三列
df<-structure(list(V1 = structure(c(5L, 1L, 7L, 3L, 2L, 4L, 6L, 6L
), .Label = c("CPSIAAAIAAVNALHGR", "DLNYCFSGMSDHR", "FPEHELIVDPQR",
"IADPDAVKPDDWDEDAPSK", "LWADHGVQACFGR", "WGEAGAEYVVESTGVFTTMEK",
"YYVTIIDAPGHR"), class = "factor"), V2 = structure(c(5L, 2L,
7L, 3L, 4L, 6L, 1L, 1L), .Label = c("", "CPSIAAAIAAVNALHGR",
"GCITIIGGGDTATCCAK", "HVGPGVLSMANAGPNTNGSQFFICTIK", "LLELGPKPEVAQQTR",
"MVCCSAWSEDHPICNLFTCGFDR", "YYVTIIDAPGHR"), class = "factor"),
V3 = structure(c(4L, 3L, 2L, 4L, 3L, 1L, 1L, 1L), .Label = c("",
"AVCMLSNTTAIAEAWAR", "DLNYCFSGMSDHR", "FPEHELIVDPQR"), class = "factor")), .Names = c("V1",
"V2", "V3"), class = "data.frame", row.names = c(NA, -8L))
- 第一栏,我们不看任何其他专栏,我们只计算有多少个字符串并保留唯一的字符串
第二栏,我们保持唯一,我们也删除第一栏中已有的那些
第三列,我们保持唯一,我们删除第一列和第二列中的字符串
这与我们一样多的列继续
例如,对于这些数据,我们将有以下
Column 1 Column 2 Column 3
LWADHGVQACFGR
CPSIAAAIAAVNALHGR LLELGPKPEVAQQTR AVCMLSNTTAIAEAWAR
YYVTIIDAPGHR GCITIIGGGDTATCCAK
FPEHELIVDPQR HVGPGVLSMANAGPNTNGSQFFICTIK
DLNYCFSGMSDHR MVCCSAWSEDHPICNLFTCGFDR
IADPDAVKPDDWDEDAPSK
WGEAGAEYVVESTGVFTTMEK
答案 0 :(得分:1)
以下是tidyverse
,
library(tidyverse)
df1 <- df %>%
gather(var, string) %>%
filter(string != '' & !duplicated(string)) %>%
group_by(var) %>%
mutate(cnt = seq(n())) %>%
spread(var, string) %>%
select(-cnt)
哪个给出了
# A tibble: 7 x 4 cnt V1 V2 V3 * <int> <chr> <chr> <chr> 1 1 LWADHGVQACFGR LLELGPKPEVAQQTR AVCMLSNTTAIAEAWAR 2 2 CPSIAAAIAAVNALHGR GCITIIGGGDTATCCAK <NA> 3 3 YYVTIIDAPGHR HVGPGVLSMANAGPNTNGSQFFICTIK <NA> 4 4 FPEHELIVDPQR MVCCSAWSEDHPICNLFTCGFDR <NA> 5 5 DLNYCFSGMSDHR <NA> <NA> 6 6 IADPDAVKPDDWDEDAPSK <NA> <NA> 7 7 WGEAGAEYVVESTGVFTTMEK <NA> <NA>
您可以使用colSums
来获取字符串数量
colSums(!is.na(df1))
#V1 V2 V3
# 7 4 1
通过基本R的类似方法将保存列表中的字符串,
df[] <- lapply(df, as.character)
d1 <- stack(df)
d1 <- d1[d1$values != '' & !duplicated(d1$values),]
l1 <- unstack(d1, values ~ ind)
lengths(l1)
#V1 V2 V3
# 7 4 1
答案 1 :(得分:1)
基础R解决方案。 df2
是最终输出。
# Convert to character
L1 <- lapply(df, as.character)
# Get unique string
L2 <- lapply(L1, unique)
# Remove ""
L3 <- lapply(L2, function(vec){vec <- vec[!(vec %in% "")]})
# Use for loop to remove non-unique string from previous columns
for (i in 2:length(L3)){
previous_vec <- unlist(L3[1:(i - 1)])
current_vec <- L3[[i]]
L3[[i]] <- current_vec[!(current_vec %in% previous_vec)]
}
# Get the maximum column length
max_num <- max(sapply(L3, length))
# Append "" to each column
L4 <- lapply(L3, function(vec){vec <- c(vec, rep("", max_num - length(vec)))})
# Convert L4 to a data frame
df2 <- as.data.frame(do.call(cbind, L4))