统计以字符串形式连接的标签

时间:2017-07-28 08:45:13

标签: r dplyr aggregate

我有一个数据集 DF,其中的标签以字符串形式连接在一起(用 "-" 分隔),我想了解每个标签出现的频率。我的实际数据很大,因此需要一种方法把下面给出的代码推广到一般情况。在我把标签拆分成多列(步骤 1)、逐列统计(步骤 2)并把结果合并起来(步骤 4)之后,是否有更好的方法来统计各个标签? 任何提示/帮助将不胜感激。

library(tidyverse)
library(dplyr)
library(ggplot2)
library(reshape2)
library(splitstackshape)

# Example data: V is a group identifier ("a" twice, then "b" three times) and
# V1 holds the labels of each row joined into one string with "-".
DF <- data.frame(
  V = rep(c("a", "b"), times = c(2, 3)),
  V1 = c(
    "Place1-Place2-Place3-Place4-Place5-Place6-Place7",
    "Place2-Place4-Place5-Place7-Place8",
    "Place1-Place2-Place4-Place7-Place8-Place9",
    "Place3-Place4-Place2-Place1",
    "Place5-Place6"
  )
)
> DF
  V                                               V1
1 a Place1-Place2-Place3-Place4-Place5-Place6-Place7
2 a               Place2-Place4-Place5-Place7-Place8
3 b        Place1-Place2-Place4-Place7-Place8-Place9
4 b                      Place3-Place4-Place2-Place1
5 b                                    Place5-Place6

# 1 - split the "-"-separated Labels in V1 into one row per Label ("long"
#     format), so rows with any number of Labels are handled the same way and
#     no Label position is left out of the tally
DF2 <- cSplit(DF, "V1", sep = "-", direction = "long")

# 2 - tally every Label in a single pass; missing/blank entries are dropped so
#     they never show up as a Label of their own
Label_f <- DF2 %>%
  as.data.frame() %>%
  mutate(Label = as.character(V1)) %>%
  filter(!is.na(Label), Label != "") %>%
  group_by(Label) %>%
  tally() %>%
  rename(sum = n)

# 3 - Count total number of rows in the original data (DF2 is long now, so it
#     no longer has one row per original record)
Ctally <- DF %>% summarise(count = n())

# 4 - express each tally as a percentage of the rows and plot the Labels in
#     decreasing order
Label_f %>%
  mutate(pct = 100 * sum / Ctally$count) %>%
  ggplot(aes(x = reorder(Label, sum), y = pct)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(x = "Label", y = "% of the time Label appears in Row", fill = "") +
  ggtitle("Labels Associated with Rows") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

enter image description here

3 个答案:

答案 0 :(得分:2)

以下是一般tidyverse方式,

library(tidyverse)

# Percentage of rows in which each label occurs, drawn as a horizontal bar
# chart ordered by that percentage.
DF %>%
  # cnt = total number of rows; V1 becomes a list column of label vectors
  mutate(cnt = n(), V1 = strsplit(as.character(V1), "-", fixed = TRUE)) %>%
  # one row per label; naming the column avoids the tidyr >= 1.0 deprecation
  # warning raised by a bare unnest()
  unnest(V1) %>%
  # occurrences of each label (cnt is constant, it is only carried along)
  count(V1, cnt) %>%
  mutate(percentage = 100 * n / cnt) %>%
  ggplot(aes(x = reorder(V1, percentage), y = percentage)) +
  geom_col() +
  coord_flip()

答案 1 :(得分:1)

嗯,我最近对 map() 有点痴迷;@Sotos 的回答比我的做法更清晰,也更快。

'tidyverse' 方法

library('purrr')
library('stringr')
library('tidyr')
library('ggplot2')
library('dplyr')

# Example data: V is a row id (1 to 5) and V1 holds the labels of each row
# joined into one string with "-".
DF <- data.frame(
  V = seq_len(5),
  V1 = c(
    "Place1-Place2-Place3-Place4-Place5-Place6-Place7",
    "Place2-Place4-Place5-Place7-Place8",
    "Place1-Place2-Place4-Place7-Place8-Place9",
    "Place3-Place4-Place2-Place1",
    "Place5-Place6"
  )
)
# Count how many times each place occurs across all rows of DF.
DF2 <- DF %>%
  # Turn V1 into a list column: each element is a one-column matrix holding
  # the places of that row (str_split() returns a 1 x k matrix, t() flips it).
  mutate(V1 = map(V1,
                  ~ str_split(.x, pattern = '-', simplify = TRUE) %>%
                     t()
                  )
         ) %>%
  # One row per place; the column is named explicitly because a bare unnest()
  # is deprecated in tidyr >= 1.0.
  unnest(V1) %>%
  group_by(V1) %>%
  tally()

诀窍在于:我们把字符列转换成一个由嵌套 matrix 组成的列,其中每个 matrix 只有一列,包含该行的各个 "Place"。然后只需将其展开(unnest),并用 tally() 统计每个 "Place" 的数量。

以下是结果图:

# Horizontal bar chart of the tally in DF2: one bar per place (V1), bar length
# is the number of occurrences (n).
ggplot(data = DF2, mapping = aes(x = V1, y = n)) +
  geom_col() +
  labs(x = 'Places', y = 'Place Count') +
  coord_flip()

enter image description here

答案 2 :(得分:0)

这种做法怎么样?

# Example data: V is a row id (1 to 5) and V1 holds the places of each row
# joined into one string with "-".
DF <- data.frame(
  V = seq_len(5),
  V1 = c(
    "Place1-Place2-Place3-Place4-Place5-Place6-Place7",
    "Place2-Place4-Place5-Place7-Place8",
    "Place1-Place2-Place4-Place7-Place8-Place9",
    "Place3-Place4-Place2-Place1",
    "Place5-Place6"
  )
)

# Split every row of V1 into its individual places (one flat character vector)
Place_Tokens <- unlist(strsplit(as.character(DF$V1), "-", fixed = TRUE))
# Remove missing values and blanks
Place_Tokens <- Place_Tokens[!is.na(Place_Tokens) & nchar(Place_Tokens) >= 1]
# Find all unique places in the DF, sorted into order
Places <- sort(unique(Place_Tokens))
# Count occurrences of each Place by exact match on the split tokens, so a
# place whose name is a prefix of another (e.g. "Place1" vs "Place10") is not
# counted as a substring hit
Place_Count <- as.vector(table(factor(Place_Tokens, levels = Places)))
# Plot
DFF <- data.frame(Places, Place_Count)
DFF %>%
  ggplot(aes(x = Places, y = Place_Count)) +
  geom_col()

enter image description here 首先我们找到所有独特的地方,然后我们计算它们的出现次数。它应该适用于任意数量的行。