我需要计算电影数据集中每部电影的平均评分,但有些电影只有一条或少数几条评论。
我希望数据集只保留评论数至少为10条的电影所对应的行。
我该怎么做?
当我运行下面的代码时,得到的全是5星评级的电影,但这只是因为它们各自只有一条评论:
# Mean rating per movie, sorted from highest to lowest, keeping the first ten.
# No minimum number of ratings is required here, so a movie rated only once
# can reach the top of the list.
rt %>%
  group_by(movieId) %>%
  summarise(AVG = mean(rating)) %>%
  arrange(desc(AVG)) %>%
  head(n = 10)
示例dput()
# Dump a sample of the ratings data in a form that can be pasted back into R.
dput(rt_sample)
# Output of the call above: a 100-row tibble with integer userId, character
# movieId, numeric rating and integer timestamp columns. Only a few movieId
# values occur more than once in this sample.
structure(list(userId = c(431L, 624L, 564L, 353L, 250L, 345L,
138L, 288L, 529L, 351L, 423L, 544L, 240L, 627L, 363L, 150L, 306L,
329L, 56L, 379L, 475L, 294L, 30L, 452L, 547L, 564L, 662L, 172L,
19L, 564L, 575L, 30L, 659L, 262L, 38L, 579L, 254L, 86L, 175L,
197L, 619L, 615L, 564L, 544L, 22L, 195L, 615L, 452L, 587L, 547L,
452L, 452L, 380L, 311L, 56L, 475L, 247L, 519L, 30L, 580L, 15L,
118L, 609L, 461L, 472L, 120L, 241L, 584L, 384L, 85L, 624L, 150L,
625L, 158L, 534L, 570L, 57L, 481L, 243L, 380L, 295L, 73L, 460L,
311L, 461L, 605L, 21L, 637L, 624L, 481L, 391L, 447L, 452L, 587L,
509L, 179L, 111L, 518L, 92L, 165L), movieId = c("587", "143255",
"2793", "4701", "64034", "7034", "4886", "160", "3347", "1286",
"35836", "92535", "2599", "527", "2083", "7569", "2028", "3",
"83976", "3113", "55721", "4228", "594", "3988", "6662", "2059",
"186", "527", "913", "1726", "953", "4617", "493", "1285", "1078",
"2139", "208", "186", "5839", "780", "296", "7361", "3325", "593",
"3535", "1201", "1193", "4239", "3683", "5015", "2231", "280",
"51091", "3791", "1256", "31221", "410", "1213", "1226", "3254",
"122902", "1221", "5241", "5377", "3133", "4623", "377", "5418",
"39446", "415", "103339", "6059", "44191", "481", "1894", "1213",
"1028", "1527", "2174", "56788", "3252", "95510", "3317", "597",
"553", "3088", "1372", "2231", "136654", "4628", "46", "477",
"2852", "2917", "1834", "86882", "6502", "2080", "454", "8807"
), rating = c(4, 1.5, 1, 1.5, 4, 4.5, 0.5, 2, 4, 5, 3.5, 5, 4.5,
4, 3, 3, 4, 3, 4, 4, 4, 3.5, 3, 1, 5, 1, 3, 4, 5, 1, 5, 5, 3,
3.5, 4.5, 0.5, 3, 1, 4, 5, 5, 4, 4, 4, 4.5, 3, 3, 3, 3.5, 3,
3, 3, 4, 4.5, 4, 0.5, 3, 5, 4, 3, 0.5, 5, 1, 3.5, 5, 4.5, 3,
5, 3.5, 3, 1, 3, 5, 3.5, 4, 2, 4, 4, 3.5, 4, 4.5, 4, 4, 5, 4,
5, 3, 5, 1.5, 4, 4, 2, 3.5, 4.5, 4, 5, 4, 3, 2, 3.5), timestamp = c(1140455281L,
1474309547L, 974709135L, 1142770526L, 1469807741L, 1109289694L,
1440379171L, 845862398L, 959966090L, 975636664L, 1356120891L,
1435787488L, 1098940438L, 1201378616L, 942345331L, 1114306250L,
939715834L, 867072039L, 1467004653L, 1378179814L, 1447328235L,
1062536768L, 945116219L, 978404955L, 1064696846L, 974839862L,
839022731L, 843290844L, 855191654L, 974843527L, 1012598971L,
994456213L, 834694644L, 1434333103L, 1389721750L, 1325551004L,
845157280L, 848159406L, 1052884893L, 975429453L, 831921736L,
1408779622L, 974838545L, 1435785913L, 1131662694L, 975416922L,
1425505047L, 1008975762L, 1111362162L, 1039012334L, 976420234L,
976420133L, 1220050534L, 1076968462L, 1467009159L, 1447327944L,
953362225L, 1471150621L, 945296019L, 1156127585L, 1443384352L,
950153998L, 1029869675L, 1096527586L, 953095219L, 1167420969L,
847339539L, 1271884897L, 1153573696L, 837512420L, 1408799647L,
1116309023L, 1452848602L, 1231502746L, 973375257L, 1475784311L,
907765600L, 1437107001L, 1094261196L, 1243140203L, 1112544768L,
1345799054L, 1072836551L, 898526635L, 1090907347L, 980194797L,
853851257L, 1231346568L, 1459164933L, 1437107118L, 891534104L,
832493451L, 1067731810L, 1112034877L, 939341767L, 1436670432L,
1097431764L, 945362206L, 848525694L, 1111479419L)), .Names = c("userId",
"movieId", "rating", "timestamp"), row.names = c(NA, -100L), class = c("tbl_df",
"tbl", "data.frame"))
答案 0 :(得分:1)
如评论中所示,您可以在 summarise 之前先对数据使用 filter(YOUR_FILTER_CONDITION)。
您的样本数据中重复出现的“movieId”值太少,无法演示这一点,因此这里提供一些可重现的示例数据:
# Reproducible toy ratings data in which movies receive between 1 and 12
# ratings each, so that some of them reach the 10-rating threshold.
set.seed(1)
# NOTE: despite its name, this vector holds the number of ratings drawn for
# each of 100 movies; the movie ids themselves are its positions (1..100).
movieId <- sample(x = 12, size = 100, replace = TRUE)
rt <- data.frame(
  userId    = sample(x = 1000, size = sum(movieId), replace = TRUE),
  # Repeat movie id i as many times as the i-th count.
  movieId   = rep(seq_along(movieId), times = movieId),
  # Uniform draws on [0, 5], rounded to one decimal place.
  rating    = round(runif(n = sum(movieId), min = 0, max = 5), digits = 1),
  timestamp = sample(x = 10000, size = sum(movieId), replace = TRUE)
)
filter() 的实现:
# Mean rating per movie, restricted to movies with at least 10 ratings.
rt %>%
  group_by(movieId) %>%
  # Within each movie, n() is the group's row count; groups with fewer than
  # 10 rows are dropped before the mean is computed.
  filter(n() >= 10) %>%
  summarise(AVG = mean(rating)) %>%
  # Highest average first.
  arrange(desc(AVG))
# # A tibble: 27 x 2
# movieId AVG
# <int> <dbl>
# 1 37 3.26
# 2 96 3.00
# 3 61 2.97
# 4 77 2.96
# 5 52 2.94
# 6 6 2.88
# 7 95 2.78
# 8 68 2.76
# 9 72 2.72
# 10 21 2.62
# # ... with 17 more rows