我在R中有一个数据框。数据框有多行和多列。其中一列有不同车辆制造商的逗号分隔值,如丰田,本田等。我想计算逗号分隔值的频率,并根据每行的出现频率输出前三个值。 以下是数据集:
Zip Make
12325 Toyota, Honda, Toyota, Mitsubishi, Mercedes
85271 Toyota,Honda,Toyota,Honda,Toyota,Toyota,Volvo,Nissan,Nissan,Nissan, Nissan
56098 Toyota,Honda,Toyota,Mitsubishi,Chevrolet,Acura,Chevrolet,Chevrolet, Honda
这是我想要的输出
任何人都可以根据示例帮助实际的R代码吗?
答案 0 :(得分:1)
在进行任何操作和求和之前,您需要先整理数据。
您的数据集:
# Example data: one row per zip code; Make holds that zip code's vehicle
# makes as a single comma-separated string.
df <- data.frame(
  Zip = c(12325, 85271, 56098),
  Make = c(
    "Toyota,Honda,Toyota,Mitsubishi,Mercedes",
    "Toyota,Honda,Toyota,Honda,Toyota,Toyota,Volvo,Nissan,Nissan,Nissan,Nissan",
    "Toyota,Honda,Toyota,Mitsubishi,Chevrolet,Acura,Chevrolet,Chevrolet,Honda"
  )
)
将Make列转换为字符型(character),以便对其进行拆分
# Coerce Make to character so the string-splitting step receives plain text
df[["Make"]] <- as.character(df[["Make"]])
导入库
library(stringr)
library(tidyverse)
按逗号拆分Make列,得到一个每个品牌占一格的矩阵,再将其转换为数据框。然后为各列命名,并把原数据框中的Zip列加到这个新的数据框中。
# Widest row: the largest number of comma-separated makes in any row
n_makes <- max(lengths(strsplit(df$Make, ",")))

# Split every Make string into a fixed-width character matrix (shorter rows
# are padded with ""), then convert it to a data frame. The pattern also
# swallows whitespace around each comma so that "Toyota, Honda" yields
# "Honda" rather than " Honda", which would be counted as a different make.
df2 <- as.data.frame(
  str_split_fixed(str_trim(df$Make), "\\s*,\\s*", n_makes)
)

# Name the columns car_1 ... car_k; seq_len() stays correct for zero columns
names(df2) <- paste0("car_", seq_len(ncol(df2)))

# Carry the zip code over from the original data frame
df2$zip <- df$Zip
**检查:**
> df2
car_1 car_2 car_3 car_4 car_5 car_6 car_7 car_8 car_9 car_10 car_11 zip
1 Toyota Honda Toyota Mitsubishi Mercedes 12325
2 Toyota Honda Toyota Honda Toyota Toyota Volvo Nissan Nissan Nissan Nissan 85271
3 Toyota Honda Toyota Mitsubishi Chevrolet Acura Chevrolet Chevrolet Honda 56098
现在我们来整理数据。先用gather把各列收集起来,将数据从宽格式转换为长格式;然后把空字符串替换为NA,并过滤掉这些行;接着删除我们创建的car列。之后按zip和Make分组,用summarise统计每个zip中每个Make出现的次数。最后在每个zip内按次数降序排列。
# Wide -> long: one row per (zip, car slot); the make lands in column Make
make_long <- gather(df2, car, Make, -zip)

# Padding cells are empty strings; mark them NA and drop those rows
make_long <- replace(make_long, make_long == "", NA)
make_long <- filter(make_long, !is.na(Make))

# The slot label is no longer needed once the data is long
make_long <- select(make_long, -car)

# Count occurrences of every make within every zip code, then sort so the
# most frequent make of each zip code comes first
make_counts <- summarise(group_by(make_long, zip, Make), number = n())
df3 <- arrange(make_counts, zip, desc(number))
最后,我们创建一个名为top3的新数据框,用slice只取每个分组的前3行。summarise之后结果仍然按zip分组,而且我们已经按次数降序排列,所以这样就得到了每个邮政编码中出现次数最多的三个品牌。
# Keep the first three rows of each group of df3
top3 <- slice(df3, 1:3)
答案 1 :(得分:1)
如果你坚持截图中显示的输出格式,这里是一个基本的R解决方案(没有额外的R库):
# Example data: one row per zip code; Make is a comma-separated list of makes.
df <- read.table(
  text =
"Zip Make
12325 Toyota,Honda,Toyota,Mitsubishi,Mercedes
85271 Toyota,Honda,Toyota,Honda,Toyota,Toyota,Volvo,Nissan,Nissan,Nissan,Nissan
56098 Toyota,Honda,Toyota,Mitsubishi,Chevrolet,Acura,Chevrolet,Chevrolet,Honda",
  header = TRUE, stringsAsFactors = FALSE
)

# Tabulate the makes of every row once; both columns below reuse the tables.
# trimws() guards against blanks around the commas.
make_counts <- lapply(
  strsplit(df$Make, ","),
  function(makes) table(trimws(makes))
)

# Frequency: "Make(count)" entries joined by commas, in alphabetical order.
# Names and counts are pasted directly instead of going through apply() on a
# data frame, which would pad counts to a common width (e.g. "Honda( 1)").
df$Frequency <- vapply(make_counts, function(tab) {
  if (length(tab) == 0L) {
    return("")
  }
  paste0(names(tab), "(", as.vector(tab), ")", collapse = ",")
}, character(1))

# top: up to three most frequent makes; ties keep alphabetical order.
# head() avoids the "NA" entries that [1:3] produces when a row has fewer
# than three distinct makes.
df$top <- vapply(make_counts, function(tab) {
  ranked <- names(tab)[order(-as.vector(tab))]
  paste(head(ranked, 3L), collapse = ",")
}, character(1))

df
# Zip
#1 12325
#2 85271
#3 56098
# Make
#1 Toyota,Honda,Toyota,Mitsubishi,Mercedes
#2 Toyota,Honda,Toyota,Honda,Toyota,Toyota,Volvo,Nissan,Nissan,Nissan,Nissan
#3 Toyota,Honda,Toyota,Mitsubishi,Chevrolet,Acura,Chevrolet,Chevrolet,Honda
# Frequency top
#1 Honda(1),Mercedes(1),Mitsubishi(1),Toyota(2) Toyota,Honda,Mercedes
#2 Honda(2),Nissan(4),Toyota(4),Volvo(1) Nissan,Toyota,Honda
#3 Acura(1),Chevrolet(3),Honda(2),Mitsubishi(1),Toyota(2) Chevrolet,Honda,Toyota
答案 2 :(得分:1)
使用tidyverse的解决方案。dt_final是最终输出。
library(tidyverse)
# Split each comma-separated Make string into one row per make. The explicit
# separator (a comma with any surrounding whitespace) keeps multi-word or
# hyphenated makes intact; the default separator splits on every
# non-alphanumeric character.
dt2 <- dt %>% separate_rows(Make, sep = "\\s*,\\s*")

# Count how often each make occurs within each zip code (count column: n)
dt3 <- dt2 %>% count(Zip, Make)

# Frequency column: "Make (n)" entries joined per zip code
dt4 <- dt3 %>%
  mutate(n = paste0("(", n, ")")) %>%
  unite(Frequency, Make, n, sep = " ") %>%
  group_by(Zip) %>%
  summarise(Frequency = paste0(Frequency, collapse = ", "))

# Top 3 Make column: sort each zip code's makes by descending count (ties
# alphabetically) and keep the first three rows. Ranking the counts in
# ascending order would surface the least frequent makes, and a dense rank
# can return more than three makes when counts tie.
dt5 <- dt3 %>%
  arrange(Zip, desc(n), Make) %>%
  group_by(Zip) %>%
  slice(1:3) %>%
  summarise(`Top 3 Make (per frequency)` = paste0(Make, collapse = ", "))

# Join both summary columns back onto the original data by zip code
dt_final <- reduce(list(dt, dt4, dt5), left_join, by = "Zip")
dt_final
# Zip Make
# 1 12325 Toyota, Honda, Toyota, Mitsubishi, Mercedes
# 2 85271 Toyota,Honda,Toyota,Honda,Toyota,Toyota,Volvo,Nissan,Nissan,Nissan, Nissan
# 3 56098 Toyota,Honda,Toyota,Mitsubishi,Chevrolet,Acura,Chevrolet,Chevrolet, Honda
# Frequency
# 1 Honda (1), Mercedes (1), Mitsubishi (1), Toyota (2)
# 2 Honda (2), Nissan (4), Toyota (4), Volvo (1)
# 3 Acura (1), Chevrolet (3), Honda (2), Mitsubishi (1), Toyota (2)
# Top 3 Make (per frequency)
# 1 Toyota, Honda, Mercedes
# 2 Nissan, Toyota, Honda
# 3 Chevrolet, Honda, Toyota
**数据**
# Example data. Each Make value is wrapped in single quotes so read.table
# keeps the whole comma-separated list (blanks included) as one field.
dt <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "Zip Make
12325 'Toyota, Honda, Toyota, Mitsubishi, Mercedes'
85271 'Toyota,Honda,Toyota,Honda,Toyota,Toyota,Volvo,Nissan,Nissan,Nissan, Nissan'
56098 'Toyota,Honda,Toyota,Mitsubishi,Chevrolet,Acura,Chevrolet,Chevrolet, Honda'"
)
答案 3 :(得分:0)
首先,请考虑@Matt W.关于以更合适的方式组织数据框的建议:在单元格内存放列表通常是设计不良的表现。
尽管如此,我们假设你的数据框是df,然后是:
y
不需要额外的库。
答案 4 :(得分:0)
谢谢大家分享答案。实际上我也可以用sqldf包来解决这个问题。BB2是包含zip列和拼接后的make列(MakeConcat)的原始数据集。代码如下:
# NOTE(review): reconstructed from a snippet mangled by HTML escaping and
# machine translation; confirm the SQL text against the original answer.
library(sqldf)

# Split the concatenated makes of every row into a list of character vectors
D1 <- strsplit(BB2$MakeConcat, split = ",")
View(BB2)

# One row per (zip, make): repeat each zip once per make found in its row
D2 <- data.frame(zip = rep(BB2$zip, lengths(D1)), MakeConcat = unlist(D1))
View(D2)

# Count how often each make occurs within each zip code
D3 <- sqldf("select zip, count(MakeConcat) as count2, MakeConcat from D2 group by zip, MakeConcat order by zip desc")
View(D3)

# Sort by descending count (then zip) so frequent makes come first
D4 <- D3[order(D3$count2, D3$zip, decreasing = TRUE), ]

# Keep the first three rows of every zip code; count2 is spelled out rather
# than relying on partial matching of `$count`
D4_x <- D4[ave(D4$count2, D4$zip, FUN = seq_along) <= 3, ]
View(D4_x)

# Collapse the remaining makes of each zip code back into one upper-cased,
# comma-separated string
Final <- sqldf("SELECT zip, upper(GROUP_CONCAT(MakeConcat)) as MakeConcat FROM D4_x group by zip")