我有一个数据集 DF,其中每一行的标签用 "-" 连接成一个字符串,我想了解每个标签出现的频率。我的实际数据很大,所以需要一种方法来泛化下面给出的代码。在把各个标签拆分成列(步骤 1)、逐列计数(步骤 2)并把各列的计数连接起来(步骤 4)之后,有没有更好的方法来统计每个标签的出现次数?任何提示或帮助都将不胜感激。
library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(splitstackshape)
# Example data: V is a group id ("a" twice, then "b" three times) and V1
# holds the labels of each row joined by "-".
DF <- data.frame(
  V = rep(c("a", "b"), times = c(2L, 3L)),
  V1 = c(
    "Place1-Place2-Place3-Place4-Place5-Place6-Place7",
    "Place2-Place4-Place5-Place7-Place8",
    "Place1-Place2-Place4-Place7-Place8-Place9",
    "Place3-Place4-Place2-Place1",
    "Place5-Place6"
  )
)
> DF
V V1
1 a Place1-Place2-Place3-Place4-Place5-Place6-Place7
2 a Place2-Place4-Place5-Place7-Place8
3 b Place1-Place2-Place4-Place7-Place8-Place9
4 b Place3-Place4-Place2-Place1
5 b Place5-Place6
# 1 - split the "-"-separated labels in V1 into one column per position
#     (V1_1, V1_2, ...); shorter rows are padded with NA
DF2 <- cSplit(DF, "V1", sep = "-", direction = "wide")

# 2 - stack every V1_* column into a single Label column and tally it.
#     starts_with("V1_") selects however many columns cSplit created, so
#     labels in the 6th, 7th, ... position are counted too (tallying only
#     V1_1..V1_5 by hand silently dropped them).
label_counts <- DF2 %>%
  as_tibble() %>%
  # cSplit may type-convert columns; force a common type before stacking
  mutate(across(starts_with("V1_"), as.character)) %>%
  pivot_longer(
    starts_with("V1_"),
    names_to = "position",
    values_to = "Label",
    values_drop_na = TRUE  # drop the NA padding of shorter rows
  ) %>%
  filter(Label != "") %>%
  count(Label, name = "n")

# 3 - Count total number of rows (denominator of the percentage)
Ctally <- DF2 %>% summarise(count = n())

# 4 - convert counts to a percentage of rows and plot, ordered by count
label_counts %>%
  mutate(pct = 100 * n / Ctally$count) %>%
  ggplot(aes(x = reorder(Label, n), y = pct)) +
  geom_col() +
  labs(x = "Label", y = "% of the time Label appears in Row") +
  ggtitle("Labels Associated with Rows") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  ) +
  coord_flip()
答案 0 :(得分:2)
以下是一种通用的 tidyverse 方式:
library(tidyverse)
# Count how often each label occurs and plot it as a percentage of rows.
DF %>%
  # cnt = total number of rows; V1 becomes a list-column holding the
  # individual labels of each row ("-" is matched literally)
  mutate(cnt = n(), V1 = strsplit(as.character(V1), "-", fixed = TRUE)) %>%
  # One row per label. The column is named explicitly because calling
  # unnest() without it is deprecated since tidyr 1.0.
  unnest(V1) %>%
  # cnt is constant, so counting by (V1, cnt) just carries it along
  count(V1, cnt) %>%
  mutate(percentage = 100 * n / cnt) %>%
  ggplot(aes(x = reorder(V1, percentage), y = percentage)) +
  geom_col() +
  coord_flip()
答案 1 :(得分:1)
嗯,我最近对 map() 有点痴迷;@Sotos 的回答是我这种做法更清晰、更快的版本。
下面是一种 'tidyverse' 方法:
library('purrr')
library('stringr')
library('tidyr')
library('ggplot2')
library('dplyr')
# Example data: V is a row id (1..5) and V1 holds the places of each row
# joined by "-".
DF <- data.frame(
  V = seq_len(5),
  V1 = c(
    "Place1-Place2-Place3-Place4-Place5-Place6-Place7",
    "Place2-Place4-Place5-Place7-Place8",
    "Place1-Place2-Place4-Place7-Place8-Place9",
    "Place3-Place4-Place2-Place1",
    "Place5-Place6"
  )
)
# Tally how many times each place occurs across all rows of DF.
# Result: one row per place, with columns V1 (place) and n (count).
DF2 <- DF %>%
  # Split each "-"-separated string into a character vector, giving a
  # list-column. A plain character vector per row is used instead of a
  # transposed one-column matrix so that unnest() yields an ordinary
  # character column rather than a matrix column.
  mutate(V1 = str_split(as.character(V1), pattern = fixed("-"))) %>%
  # One row per place; the column is named explicitly because a bare
  # unnest() is deprecated since tidyr 1.0
  unnest(V1) %>%
  count(V1)
诀窍是把字符列转换成一个嵌套列(list-column),其中每个元素都是只有一列的 matrix,里面存放该行的各个 "Place"。然后只需要把它展开(unnest),再用 tally() 统计每个 "Place" 出现的次数。
以下是结果图:
# Horizontal bar chart of how many times each place occurs.
ggplot(DF2, aes(x = V1, y = n)) +
  geom_col() +
  labs(x = "Places", y = "Place Count") +
  coord_flip()
答案 2 :(得分:0)
这种做法怎么样?
# Example data: V is a row id, V1 holds the places of each row joined by "-".
DF <- data.frame(
  V = 1:5,
  V1 = c(
    "Place1-Place2-Place3-Place4-Place5-Place6-Place7",
    "Place2-Place4-Place5-Place7-Place8",
    "Place1-Place2-Place4-Place7-Place8-Place9",
    "Place3-Place4-Place2-Place1",
    "Place5-Place6"
  )
)

# Split every row into its individual places ("-" is matched literally)
all_places <- unlist(
  strsplit(as.character(DF$V1), "-", fixed = TRUE),
  use.names = FALSE
)
# Remove blanks (e.g. from a doubled or trailing "-") and missing values
all_places <- all_places[!is.na(all_places) & nzchar(all_places)]

# Find all unique places in the DF, sorted into order
Places <- sort(unique(all_places))

# Count occurrences of each place by exact match. Counting on the split
# values avoids substring hits: using a place name as a pattern on the raw
# strings would let "Place1" also match inside "Place10".
Place_Count <- as.vector(table(factor(all_places, levels = Places)))

# Data for the plot: one row per place with its count
DFF <- data.frame(Places, Place_Count)
# Bar chart of the number of occurrences of each place.
ggplot(DFF, aes(x = Places, y = Place_Count)) +
  geom_col()