Question

作为作业，我需要先使用dplyr进行分析，然后将其转换为常规的“base R”代码。该数据集由奥运会参赛者组成。我需要根据奖牌数绘制表现最好的10个国家/地区的条形图。

我的dplyr代码可正常运行，如下所示：

medalwinners <- filter(olympics, grepl("Bronze|Silver|Gold", Medal))
medals_perteam <- sapply(split(medalwinners, medalwinners$Team), nrow)
medals_perteam <- sort(medals_perteam, decreasing = TRUE)
topmedals <- medalwinners[medalwinners$Team %in% names(medals_perteam)[1:10],]

我现在需要用base R做同样的事情。但是，我得到的结果虽然相似，却并不相同。下面这段代码给出以下警告：Warning message: In Medal == c("Bronze", "Silver", "Gold") : longer object length is not a multiple of shorter object length（较长的对象长度不是较短的对象长度的整数倍）

medalwinners_baseR <- subset(olympics, Medal == c("Bronze", "Silver", "Gold"))

我最终得到两个大小不同的数据框。dplyr版本的对象有5000行，而base R版本的对象只有1651行。查看数据时，我似乎看不出差别在哪里。

对象的数据如下：

    Medalwinners object
    structure(list(Name = c("Juhamatti Tapio Aaltonen", "Giovanni Abagnale", 
    "Patimat Abakarova", "Luc Abalo", "Luc Abalo", "Jeremy Abbott"
    ), Sex = c("M", "M", "F", "M", "M", "M"), Age = c(28L, 21L, 21L, 
    27L, 31L, 28L), Height = c(184L, 198L, 165L, 182L, 182L, 175L
    ), Weight = c(85, 90, 49, 86, 86, 70), Team = c("Finland", "Italy", 
    "Azerbaijan", "France", "France", "United States"), NOC = c("FIN", 
    "ITA", "AZE", "FRA", "FRA", "USA"), Games = c("2014 Winter", 
    "2016 Summer", "2016 Summer", "2012 Summer", "2016 Summer", "2014 Winter"
    ), Year = c(2014L, 2016L, 2016L, 2012L, 2016L, 2014L), Season = c("Winter", 
    "Summer", "Summer", "Summer", "Summer", "Winter"), City = c("Sochi", 
    "Rio de Janeiro", "Rio de Janeiro", "London", "Rio de Janeiro", 
    "Sochi"), Sport = c("Ice Hockey", "Rowing", "Taekwondo", "Handball", 
    "Handball", "Figure Skating"), Event = c("Ice Hockey Men's Ice Hockey", 
    "Rowing Men's Coxless Pairs", "Taekwondo Women's Flyweight", 
    "Handball Men's Handball", "Handball Men's Handball", "Figure Skating Mixed Team"
    ), Medal = c("Bronze", "Bronze", "Bronze", "Gold", "Silver", 
    "Bronze"), BMI = c(25.1063327032136, 22.9568411386593, 17.9981634527089, 
    25.9630479410699, 25.9630479410699, 22.8571428571429), weightcategories = structure(c(6L, 
    6L, 2L, 6L, 6L, 4L), .Label = c("31-40", "41-50", "51-60", "61-70", 
    "71-80", "81-90", "91-100", "101-110", "111-120", "121-130", 
    "131-140", "141-150", "151-160"), class = "factor")), .Names = c("Name", 
    "Sex", "Age", "Height", "Weight", "Team", "NOC", "Games", "Year", 
    "Season", "City", "Sport", "Event", "Medal", "BMI", "weightcategories"
    ), row.names = c(NA, 6L), class = "data.frame")

Medalwinners_BaseR object
structure(list(Name = c("Patimat Abakarova", "Luc Abalo", "Jeremy Abbott", 
"Denis Mikhaylovich Ablyazin", "Denis Mikhaylovich Ablyazin", 
"Denis Mikhaylovich Ablyazin"), Sex = c("F", "M", "M", "M", "M", 
"M"), Age = c(21L, 27L, 28L, 19L, 19L, 24L), Height = c(165L, 
182L, 175L, 161L, 161L, 161L), Weight = c(49, 86, 70, 62, 62, 
62), Team = c("Azerbaijan", "France", "United States", "Russia", 
"Russia", "Russia"), NOC = c("AZE", "FRA", "USA", "RUS", "RUS", 
"RUS"), Games = c("2016 Summer", "2012 Summer", "2014 Winter", 
"2012 Summer", "2012 Summer", "2016 Summer"), Year = c(2016L, 
2012L, 2014L, 2012L, 2012L, 2016L), Season = c("Summer", "Summer", 
"Winter", "Summer", "Summer", "Summer"), City = c("Rio de Janeiro", 
"London", "Sochi", "London", "London", "Rio de Janeiro"), Sport = c("Taekwondo", 
"Handball", "Figure Skating", "Gymnastics", "Gymnastics", "Gymnastics"
), Event = c("Taekwondo Women's Flyweight", "Handball Men's Handball", 
"Figure Skating Mixed Team", "Gymnastics Men's Floor Exercise", 
"Gymnastics Men's Horse Vault", "Gymnastics Men's Horse Vault"
), Medal = c("Bronze", "Gold", "Bronze", "Bronze", "Silver", 
"Silver"), BMI = c(17.9981634527089, 25.9630479410699, 22.8571428571429, 
23.9188302920412, 23.9188302920412, 23.9188302920412), weightcategories = structure(c(2L, 
6L, 4L, 4L, 4L, 4L), .Label = c("31-40", "41-50", "51-60", "61-70", 
"71-80", "81-90", "91-100", "101-110", "111-120", "121-130", 
"131-140", "141-150", "151-160"), class = "factor")), .Names = c("Name", 
"Sex", "Age", "Height", "Weight", "Team", "NOC", "Games", "Year", 
"Season", "City", "Sport", "Event", "Medal", "BMI", "weightcategories"
), row.names = c(13L, 15L, 34L, 109L, 110L, 116L), class = "data.frame")

Original Dataset
structure(list(Name = c("A Lamusi", "Juhamatti Tapio Aaltonen", 
"Andreea Aanei", "Jamale (Djamel-) Aarrass (Ahrass-)", "Nstor Abad Sanjun", 
"Nstor Abad Sanjun"), Sex = c("M", "M", "F", "M", "M", "M"), 
    Age = c(23L, 28L, 22L, 30L, 23L, 23L), Height = c(170L, 184L, 
    170L, 187L, 167L, 167L), Weight = c(60, 85, 125, 76, 64, 
    64), Team = c("China", "Finland", "Romania", "France", "Spain", 
    "Spain"), NOC = c("CHN", "FIN", "ROU", "FRA", "ESP", "ESP"
    ), Games = c("2012 Summer", "2014 Winter", "2016 Summer", 
    "2012 Summer", "2016 Summer", "2016 Summer"), Year = c(2012L, 
    2014L, 2016L, 2012L, 2016L, 2016L), Season = c("Summer", 
    "Winter", "Summer", "Summer", "Summer", "Summer"), City = c("London", 
    "Sochi", "Rio de Janeiro", "London", "Rio de Janeiro", "Rio de Janeiro"
    ), Sport = c("Judo", "Ice Hockey", "Weightlifting", "Athletics", 
    "Gymnastics", "Gymnastics"), Event = c("Judo Men's Extra-Lightweight", 
    "Ice Hockey Men's Ice Hockey", "Weightlifting Women's Super-Heavyweight", 
    "Athletics Men's 1,500 metres", "Gymnastics Men's Individual All-Around", 
    "Gymnastics Men's Floor Exercise"), Medal = c(NA, "Bronze", 
    NA, NA, NA, NA), BMI = c(20.7612456747405, 25.1063327032136, 
    43.2525951557093, 21.7335354170837, 22.9481157445588, 22.9481157445588
    ), weightcategories = structure(c(3L, 6L, 10L, 5L, 4L, 4L
    ), .Label = c("31-40", "41-50", "51-60", "61-70", "71-80", 
    "81-90", "91-100", "101-110", "111-120", "121-130", "131-140", 
    "141-150", "151-160"), class = "factor")), .Names = c("Name", 
"Sex", "Age", "Height", "Weight", "Team", "NOC", "Games", "Year", 
"Season", "City", "Sport", "Event", "Medal", "BMI", "weightcategories"
), row.names = c(NA, 6L), class = "data.frame")

Answer 1

您在Medal上的过滤条件不同：`==` 会把 c("Bronze", "Silver", "Gold") 循环补齐后按位置逐元素比较，而不是判断每个值是否属于这个集合，因此只有一部分奖牌行被保留，并产生了上面的警告。

两种写法的过滤条件不必不同；您可以使用

grepl("Bronze|Silver|Gold", Medal)

（如您的tidyverse解决方案中一样）或

Medal %in% c("Bronze", "Silver", "Gold")

使用dplyr和base

1 个答案: