我是这个网站的新手,也是编码的新手。我想知道你们中是否有人可以帮助我
我需要通过评分分配来计算前5部电影,计算每部4星或更高的电影的评分百分比。
到目前为止,我只能使用dplyr计算出现次数。
是否可以使用dplyr计算它(类似于我的编码)?
我不确定是否需要变异以提出解决方案,或者是否有其他方法可以这样做。
到目前为止我的代码:
dfAux1 <- na.omit(dfAux)
dfAux1 %>%
group_by(movie) %>%
summarise(tot = n()) %>%
arrange(desc(tot))%>%
head(5)
结果应该是这样的:
**Expected result**:
0.7000000, 'The Shawshank Redemption'
0.5333333, 'Star Wars IV - A New Hope'
0.5000000, 'Gladiator'
0.4444444, 'Blade Runner'
0.4375000, 'The Silence of the Lambs'
到目前为止这是我的结果:
# A tibble: 5 x 2
movie tot
<fctr> <int>
1 Toy Story 17
2 The Silence of the Lambs 16
3 Star Wars IV - A New Hope 15
4 Star Wars VI - Return of the Jedi 14
5 Independence Day 13
编辑:
str(dfAux1)
'data.frame': 241 obs. of 2 variables:
$ Rating: int 1 5 4 2 4 5 4 2 3 2 ...
$ movie : Factor w/ 20 levels "Star Wars IV - A New Hope",..: 1 1 1 1 1 1 1 1 1 1 ...
- attr(*, "na.action")=Class 'omit' Named int [1:159] 3 4 7 16 17 23 27 28 34 36 ...
.. ..- attr(*, "names")= chr [1:159] "3" "4" "7" "16" ...
dput(dfAux1)
structure(list(Rating = c(1L, 5L, 4L, 2L, 4L, 5L, 4L, 2L, 3L,
2L, 3L, 4L, 4L, 5L, 1L, 5L, 3L, 3L, 3L, 4L, 1L, 2L, 1L, 5L, 3L,
4L, 5L, 1L, 2L, 2L, 4L, 4L, 3L, 5L, 2L, 3L, 1L, 1L, 2L, 2L, 5L,
1L, 4L, 1L, 4L, 5L, 5L, 5L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 3L,
2L, 4L, 2L, 5L, 3L, 4L, 1L, 5L, 4L, 2L, 1L, 1L, 4L, 2L, 4L, 5L,
5L, 2L, 1L, 4L, 2L, 1L, 4L, 2L, 3L, 2L, 4L, 4L, 5L, 2L, 4L, 3L,
2L, 2L, 4L, 2L, 2L, 2L, 3L, 4L, 1L, 5L, 4L, 3L, 5L, 2L, 1L, 3L,
4L, 4L, 2L, 3L, 4L, 1L, 3L, 2L, 5L, 3L, 2L, 3L, 4L, 1L, 1L, 4L,
1L, 4L, 5L, 1L, 3L, 2L, 2L, 3L, 5L, 5L, 1L, 2L, 3L, 5L, 2L, 3L,
1L, 2L, 1L, 4L, 1L, 2L, 2L, 3L, 3L, 2L, 1L, 1L, 1L, 5L, 2L, 4L,
1L, 4L, 3L, 1L, 2L, 2L, 3L, 4L, 2L, 3L, 2L, 4L, 3L, 4L, 3L, 2L,
2L, 4L, 5L, 2L, 1L, 5L, 1L, 4L, 5L, 2L, 3L, 3L, 2L, 5L, 5L, 4L,
1L, 3L, 1L, 2L, 1L, 5L, 5L, 2L, 4L, 2L, 4L, 2L, 5L, 2L, 5L, 5L,
1L, 5L, 1L, 3L, 2L, 2L, 3L, 5L, 1L, 3L, 1L, 5L, 3L, 3L, 1L, 2L,
4L, 1L, 5L, 3L, 1L, 1L, 5L, 5L, 1L, 5L, 3L, 3L, 2L, 3L, 3L, 2L,
2L, 2L, 5L, 4L, 2L, 1L, 4L, 5L), movie = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L,
7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L,
11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L, 19L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L), .Label = c("Star Wars IV - A New Hope",
"Star Wars VI - Return of the Jedi", "Forrest Gump", "The Shawshank Redemption",
"The Silence of the Lambs", "Gladiator", "Toy Story", "Saving Private Ryan",
"Pulp Fiction", "Stand by Me", "Shakespeare in Love", "Total Recall",
"Independence Day", "Blade Runner", "Groundhog Day", "The Matrix",
"Schindler's List", "The Sixth Sense", "Raiders of the Lost Ark",
"Babe"), class = "factor")), .Names = c("Rating", "movie"), row.names = c(1L,
2L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 18L, 19L, 20L,
21L, 22L, 24L, 25L, 26L, 29L, 30L, 31L, 32L, 33L, 35L, 38L, 39L,
40L, 41L, 45L, 46L, 47L, 51L, 52L, 54L, 56L, 58L, 60L, 62L, 63L,
65L, 66L, 67L, 69L, 70L, 73L, 78L, 80L, 81L, 82L, 83L, 85L, 87L,
88L, 89L, 90L, 92L, 93L, 94L, 95L, 96L, 97L, 98L, 100L, 101L,
102L, 104L, 105L, 107L, 108L, 109L, 111L, 115L, 116L, 118L, 119L,
121L, 122L, 123L, 124L, 126L, 128L, 129L, 130L, 131L, 132L, 133L,
134L, 135L, 137L, 138L, 139L, 140L, 141L, 144L, 145L, 146L, 147L,
149L, 150L, 153L, 156L, 159L, 160L, 164L, 166L, 167L, 168L, 170L,
172L, 175L, 177L, 178L, 179L, 180L, 181L, 182L, 183L, 185L, 186L,
189L, 194L, 195L, 196L, 199L, 200L, 201L, 202L, 205L, 206L, 207L,
209L, 212L, 216L, 217L, 219L, 220L, 222L, 223L, 224L, 225L, 226L,
228L, 229L, 231L, 233L, 234L, 235L, 239L, 241L, 242L, 243L, 244L,
246L, 248L, 249L, 250L, 251L, 252L, 253L, 254L, 255L, 261L, 263L,
264L, 265L, 267L, 268L, 274L, 278L, 280L, 282L, 283L, 284L, 286L,
288L, 289L, 292L, 293L, 294L, 295L, 296L, 300L, 301L, 303L, 305L,
307L, 310L, 311L, 312L, 314L, 316L, 317L, 319L, 320L, 321L, 322L,
323L, 324L, 325L, 328L, 330L, 334L, 335L, 336L, 338L, 340L, 341L,
342L, 343L, 344L, 345L, 346L, 348L, 350L, 351L, 356L, 358L, 360L,
362L, 363L, 364L, 367L, 368L, 371L, 373L, 375L, 376L, 378L, 380L,
383L, 384L, 386L, 387L, 389L, 391L, 392L, 395L, 396L, 398L), class = "data.frame", na.action = structure(c(3L,
4L, 7L, 16L, 17L, 23L, 27L, 28L, 34L, 36L, 37L, 42L, 43L, 44L,
48L, 49L, 50L, 53L, 55L, 57L, 59L, 61L, 64L, 68L, 71L, 72L, 74L,
75L, 76L, 77L, 79L, 84L, 86L, 91L, 99L, 103L, 106L, 110L, 112L,
113L, 114L, 117L, 120L, 125L, 127L, 136L, 142L, 143L, 148L, 151L,
152L, 154L, 155L, 157L, 158L, 161L, 162L, 163L, 165L, 169L, 171L,
173L, 174L, 176L, 184L, 187L, 188L, 190L, 191L, 192L, 193L, 197L,
198L, 203L, 204L, 208L, 210L, 211L, 213L, 214L, 215L, 218L, 221L,
227L, 230L, 232L, 236L, 237L, 238L, 240L, 245L, 247L, 256L, 257L,
258L, 259L, 260L, 262L, 266L, 269L, 270L, 271L, 272L, 273L, 275L,
276L, 277L, 279L, 281L, 285L, 287L, 290L, 291L, 297L, 298L, 299L,
302L, 304L, 306L, 308L, 309L, 313L, 315L, 318L, 326L, 327L, 329L,
331L, 332L, 333L, 337L, 339L, 347L, 349L, 352L, 353L, 354L, 355L,
357L, 359L, 361L, 365L, 366L, 369L, 370L, 372L, 374L, 377L, 379L,
381L, 382L, 385L, 388L, 390L, 393L, 394L, 397L, 399L, 400L), .Names = c("3",
"4", "7", "16", "17", "23", "27", "28", "34", "36", "37", "42",
"43", "44", "48", "49", "50", "53", "55", "57", "59", "61", "64",
"68", "71", "72", "74", "75", "76", "77", "79", "84", "86", "91",
"99", "103", "106", "110", "112", "113", "114", "117", "120",
"125", "127", "136", "142", "143", "148", "151", "152", "154",
"155", "157", "158", "161", "162", "163", "165", "169", "171",
"173", "174", "176", "184", "187", "188", "190", "191", "192",
"193", "197", "198", "203", "204", "208", "210", "211", "213",
"214", "215", "218", "221", "227", "230", "232", "236", "237",
"238", "240", "245", "247", "256", "257", "258", "259", "260",
"262", "266", "269", "270", "271", "272", "273", "275", "276",
"277", "279", "281", "285", "287", "290", "291", "297", "298",
"299", "302", "304", "306", "308", "309", "313", "315", "318",
"326", "327", "329", "331", "332", "333", "337", "339", "347",
"349", "352", "353", "354", "355", "357", "359", "361", "365",
"366", "369", "370", "372", "374", "377", "379", "381", "382",
"385", "388", "390", "393", "394", "397", "399", "400"), class = "omit"))
答案 0 :(得分:3)
我使用data.table
代替dplyr
library(data.table)
setDT(dfAux1) # make dfAux1 as data table by reference
# calculate total number by movies, then compute percent for `Rating >= 4` by movies and then sort `tot` by descending order and also eliminating duplicates in movies using `.SD[1]` which gives the first row in each movie.
dfAux1[, .(Rating, tot = .N), by = movie ][Rating >= 4, .(percent = .N/tot, tot), by = movie ][order(-tot), .SD[1], by = movie]
# movie percent tot
# 1: Toy Story 0.35294118 17
# 2: The Silence of the Lambs 0.43750000 16
# 3: Star Wars IV - A New Hope 0.53333333 15
# 4: Star Wars VI - Return of the Jedi 0.35714286 14
# 5: Independence Day 0.30769231 13
# 6: Gladiator 0.50000000 12
# 7: Total Recall 0.08333333 12
# 8: Groundhog Day 0.41666667 12
# 9: The Matrix 0.41666667 12
# 10: Schindler's List 0.33333333 12
# 11: The Sixth Sense 0.33333333 12
# 12: Saving Private Ryan 0.36363636 11
# 13: Pulp Fiction 0.36363636 11
# 14: Stand by Me 0.36363636 11
# 15: Shakespeare in Love 0.27272727 11
# 16: Raiders of the Lost Ark 0.27272727 11
# 17: Forrest Gump 0.30000000 10
# 18: The Shawshank Redemption 0.70000000 10
# 19: Babe 0.40000000 10
# 20: Blade Runner 0.44444444 9
答案 1 :(得分:2)
使用data.table
的单行解决方案和来自OP的数据可以是:
library(data.table)
setDT(dfAux1)[, .(pct = sum(Rating>=4)/.N), by=movie][order(-pct)][1:5]
movie pct
1: The Shawshank Redemption 0.7000000
2: Star Wars IV - A New Hope 0.5333333
3: Gladiator 0.5000000
4: Blade Runner 0.4444444
5: The Silence of the Lambs 0.4375000
答案 2 :(得分:1)
我使用dplyr包按movie
列对您的数据进行分组,并根据rating
列执行计算。
在summarise()
中,我创建了三个新列:
Total_Review
:计算每movie
的评价总数。FourPlus_Rating
:计算Rating
值为4或更高的评论子集。Per_FourPlus_Rating
:将FourPlus_Rating
除以Total_Review
。然后我根据Per_FourPlus_Rating
按降序排列日期。最后,我调用head()
来指定我只希望tibble
返回前5行。
# install necessary package
install.packages( pkgs = "dplyr" )
# load necessary package
library( dplyr )
# view first six rows
head( x = df )
# Rating movie
# 1 1 Star Wars IV - A New Hope
# 2 5 Star Wars IV - A New Hope
# 5 4 Star Wars IV - A New Hope
# 6 2 Star Wars IV - A New Hope
# 8 4 Star Wars IV - A New Hope
# 9 5 Star Wars IV - A New Hope
# perform calculations using
# dplyr functions
df %>%
group_by( movie ) %>%
summarise( Total_Review = n()
, FourPlus_Rating = length( Rating[ which( Rating >= 4 ) ] )
, Per_FourPlus_Rating = length( Rating[ which( Rating >= 4 ) ] ) / n() ) %>%
arrange( desc( Per_FourPlus_Rating ) ) %>%
head( n = 5 )
# A tibble: 5 x 4
# movie Total_Review FourPlus_Rating Per_FourPlus_Rati…
# <fct> <int> <int> <dbl>
# 1 The Shawshank Rede… 10 7 0.700
# 2 Star Wars IV - A N… 15 8 0.533
# 3 Gladiator 12 6 0.500
# 4 Blade Runner 9 4 0.444
# 5 The Silence of the… 16 7 0.438
# end of script #
答案 3 :(得分:0)
这是一个dplyr解决方案:
dfAuxhigh=filter(dfAux1,Rating>=4)%>%group_by(movie)%>%summarize(percentHigh=n())
dfAux=dfAux1%>%group_by(movie)%>%summarize(percentAll=n())
result<-merge(dfAuxhigh,dfAux,by="movie")%>%mutate(percentage=percentHigh/percentAll)
result<-result[order(result$percentage,decreasing = T)[1:5],c(1,4)]
答案 4 :(得分:0)
library(tidyverse)
df %>%
group_by(movie, Rating) %>%
summarise(n = n()) %>% #< get freq of movies
mutate(freq = n/sum(n)) %>% #< find perc for each rating, by movie
filter(Rating >=4) %>% #< filter for desired rating (4 or above)
summarise(freq = sum(freq)) %>% #< summarize again
top_n(5) %>%
arrange(desc(freq)) %>%
mutate(freq = paste0(round(freq*100, 2), "%"))
#> movie freq
#> 1 The Shawshank Redemption 70%
#> 2 Star Wars IV - A New Hope 53.33%
#> 3 Gladiator 50%
#> 4 Blade Runner 44.44%
#> 5 The Silence of the Lambs 43.75%