如何删除不相关的行 - R?

时间:2018-02-13 22:46:45

标签: r count dplyr remove-if

我需要计算电影数据集中每部电影的平均评分,但有些电影只有一条或寥寥几条评论。

我需要让数据集只保留至少有10条评论的电影所对应的行。

我该怎么做?

当我运行下面的代码时,得到的全是5星评分的电影,但这只是因为它们都只有一条评论:

# Mean rating per movie, sorted from highest to lowest, keeping the first 10
# rows. No minimum review count is applied, so a movie with a single review
# is ranked on that one rating alone.
rt %>%
  group_by(movieId) %>%
  summarise(AVG = mean(rating)) %>%
  arrange(desc(AVG)) %>%
  head(10)

示例数据的dput()输出:

# Print a text representation of the sample data that can be pasted back
# into R to recreate the object.
dput(rt_sample)

structure(list(userId = c(431L, 624L, 564L, 353L, 250L, 345L, 
138L, 288L, 529L, 351L, 423L, 544L, 240L, 627L, 363L, 150L, 306L, 
329L, 56L, 379L, 475L, 294L, 30L, 452L, 547L, 564L, 662L, 172L, 
19L, 564L, 575L, 30L, 659L, 262L, 38L, 579L, 254L, 86L, 175L, 
197L, 619L, 615L, 564L, 544L, 22L, 195L, 615L, 452L, 587L, 547L, 
452L, 452L, 380L, 311L, 56L, 475L, 247L, 519L, 30L, 580L, 15L, 
118L, 609L, 461L, 472L, 120L, 241L, 584L, 384L, 85L, 624L, 150L, 
625L, 158L, 534L, 570L, 57L, 481L, 243L, 380L, 295L, 73L, 460L, 
311L, 461L, 605L, 21L, 637L, 624L, 481L, 391L, 447L, 452L, 587L, 
509L, 179L, 111L, 518L, 92L, 165L), movieId = c("587", "143255", 
"2793", "4701", "64034", "7034", "4886", "160", "3347", "1286", 
"35836", "92535", "2599", "527", "2083", "7569", "2028", "3", 
"83976", "3113", "55721", "4228", "594", "3988", "6662", "2059", 
"186", "527", "913", "1726", "953", "4617", "493", "1285", "1078", 
"2139", "208", "186", "5839", "780", "296", "7361", "3325", "593", 
"3535", "1201", "1193", "4239", "3683", "5015", "2231", "280", 
"51091", "3791", "1256", "31221", "410", "1213", "1226", "3254", 
"122902", "1221", "5241", "5377", "3133", "4623", "377", "5418", 
"39446", "415", "103339", "6059", "44191", "481", "1894", "1213", 
"1028", "1527", "2174", "56788", "3252", "95510", "3317", "597", 
"553", "3088", "1372", "2231", "136654", "4628", "46", "477", 
"2852", "2917", "1834", "86882", "6502", "2080", "454", "8807"
), rating = c(4, 1.5, 1, 1.5, 4, 4.5, 0.5, 2, 4, 5, 3.5, 5, 4.5, 
4, 3, 3, 4, 3, 4, 4, 4, 3.5, 3, 1, 5, 1, 3, 4, 5, 1, 5, 5, 3, 
3.5, 4.5, 0.5, 3, 1, 4, 5, 5, 4, 4, 4, 4.5, 3, 3, 3, 3.5, 3, 
3, 3, 4, 4.5, 4, 0.5, 3, 5, 4, 3, 0.5, 5, 1, 3.5, 5, 4.5, 3, 
5, 3.5, 3, 1, 3, 5, 3.5, 4, 2, 4, 4, 3.5, 4, 4.5, 4, 4, 5, 4, 
5, 3, 5, 1.5, 4, 4, 2, 3.5, 4.5, 4, 5, 4, 3, 2, 3.5), timestamp = c(1140455281L, 
1474309547L, 974709135L, 1142770526L, 1469807741L, 1109289694L, 
1440379171L, 845862398L, 959966090L, 975636664L, 1356120891L, 
1435787488L, 1098940438L, 1201378616L, 942345331L, 1114306250L, 
939715834L, 867072039L, 1467004653L, 1378179814L, 1447328235L, 
1062536768L, 945116219L, 978404955L, 1064696846L, 974839862L, 
839022731L, 843290844L, 855191654L, 974843527L, 1012598971L, 
994456213L, 834694644L, 1434333103L, 1389721750L, 1325551004L, 
845157280L, 848159406L, 1052884893L, 975429453L, 831921736L, 
1408779622L, 974838545L, 1435785913L, 1131662694L, 975416922L, 
1425505047L, 1008975762L, 1111362162L, 1039012334L, 976420234L, 
976420133L, 1220050534L, 1076968462L, 1467009159L, 1447327944L, 
953362225L, 1471150621L, 945296019L, 1156127585L, 1443384352L, 
950153998L, 1029869675L, 1096527586L, 953095219L, 1167420969L, 
847339539L, 1271884897L, 1153573696L, 837512420L, 1408799647L, 
1116309023L, 1452848602L, 1231502746L, 973375257L, 1475784311L, 
907765600L, 1437107001L, 1094261196L, 1243140203L, 1112544768L, 
1345799054L, 1072836551L, 898526635L, 1090907347L, 980194797L, 
853851257L, 1231346568L, 1459164933L, 1437107118L, 891534104L, 
832493451L, 1067731810L, 1112034877L, 939341767L, 1436670432L, 
1097431764L, 945362206L, 848525694L, 1111479419L)), .Names = c("userId", 
"movieId", "rating", "timestamp"), row.names = c(NA, -100L), class = c("tbl_df", 
"tbl", "data.frame"))

1 个答案:

答案 0 :(得分:1)

如评论中所示,您可以在summarise之前先用filter(YOUR_FILTER_CONDITION)筛选数据。

您的样本数据中重复的“movieId”值不够多,无法演示这一点,因此这里生成一些可重现的样本数据:

# Build a reproducible toy ratings table in which each movie has a known
# number of reviews.
set.seed(1)

# One draw per movie (100 movies): the number of reviews (1 to 12) it gets.
movieId <- sample(12, size = 100, replace = TRUE)

# One row per review. rep() repeats each movie index as many times as its
# review count, so the table has sum(movieId) rows in total.
rt <- data.frame(
  userId    = sample(1000, size = sum(movieId), replace = TRUE),
  movieId   = rep(seq_along(movieId), times = movieId),
  rating    = round(runif(sum(movieId), min = 0, max = 5), digits = 1),
  timestamp = sample(10000, size = sum(movieId), replace = TRUE)
)

使用filter()的实现:

# Mean rating per movie, restricted to movies with at least 10 reviews and
# sorted from highest to lowest average.
rt %>%
  group_by(movieId) %>%
  # Within a grouped filter, n() is the row count of the current movieId
  # group, so whole groups with fewer than 10 rows are dropped before the
  # averages are computed.
  filter(n() >= 10) %>%
  summarise(AVG = mean(rating)) %>%
  arrange(desc(AVG))
# # A tibble: 27 x 2
#    movieId   AVG
#      <int> <dbl>
#  1      37  3.26
#  2      96  3.00
#  3      61  2.97
#  4      77  2.96
#  5      52  2.94
#  6       6  2.88
#  7      95  2.78
#  8      68  2.76
#  9      72  2.72
# 10      21  2.62
# # ... with 17 more rows