我拿到了一个数据集,其中包含不同品牌糖果的成分信息,以及价格(百分比)、含糖量和胜率(winpercent,百分比)的信息。成分信息是虚拟变量:0 表示不具备该特性,1 表示具备该特性。其他变量是数值型,表示糖果的含糖量和产品的价格。每个产品的胜率(winpercent)被读成了字符变量。目标是选择一种统计方法来确定消费者的偏好,并对新产品进行预测。我想在 R 中实现这个解决方案。
我的目标是根据变量“winpercent”把数据分成两组:值大于 43,078,911 的为赢家,值小于 43,078,911 的为输家。这在 R 中应该如何实现?
# Print a reproducible sample of the candy data: the first 10 and the last
# 10 rows of Sweets.df, serialised with dput() so it can be pasted into R.
dput(rbind(head(Sweets.df, 10), tail(Sweets.df, 10)))
# Data-frame literal with 20 rows (row names 1-10 and 77-86).
# - competitorname: candy name (character).
# - chocolate ... pluribus: integer 0/1 indicator columns.
# - sugarpercent, pricepercent: numeric values.
# - winpercent: character strings that use "." between digit groups
#   (e.g. "66.971.725"), so it is NOT numeric as stored here.
structure(list(competitorname = c("100 Grand", "3 Musketeers",
"One dime", "One quarter", "Air Heads", "Almond Joy", "Baby Ruth",
"Boston Baked Beans", "Candy Corn", "Caramel Apple Pops", "Tootsie Roll Juniors",
"Tootsie Roll Midgies", "Tootsie Roll Snack Bars", "Trolli Sour Bites",
"Twix", "Twizzlers", "Warheads", "WelchÕs Fruit Snacks", "WertherÕs Original Caramel",
"Whoppers"), chocolate = c(1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L), fruity = c(0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,
0L, 0L), caramel = c(1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), peanutyalmondy = c(0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), nougat = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), crispedricewafer = c(1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L), hard = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L), bar = c(1L, 1L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L), pluribus = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L), sugarpercent = c(0.73199999,
0.60399997, 0.011, 0.011, 0.90600002, 0.465, 0.60399997, 0.31299999,
0.90600002, 0.60399997, 0.31299999, 0.17399999, 0.465, 0.31299999,
0.546, 0.22, 0.093000002, 0.31299999, 0.186, 0.87199998), pricepercent = c(0.86000001,
0.51099998, 0.116, 0.51099998, 0.51099998, 0.76700002, 0.76700002,
0.51099998, 0.32499999, 0.32499999, 0.51099998, 0.011, 0.32499999,
0.255, 0.90600002, 0.116, 0.116, 0.31299999, 0.26699999, 0.84799999
), winpercent = c("66.971.725", "67.602.936", "32.261.086", "46.116.505",
"52.341.465", "50.347.546", "56.914.547", "23.417.824", "38.010.963",
"34.517.681", "43.068.897", "45.736.748", "49.653.503", "47.173.229",
"81.642.914", "45.466.282", "39.011.898", "44.375.519", "41.904.308",
"49.524.113")), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L), class = "data.frame")
# Load the candy data from a semicolon-delimited file with a header row.
# Text columns are kept as character vectors rather than factors.
Sweets.df <- read.csv(
  file = "Sweets.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = FALSE
)
答案 0 :(得分:1)
您的 `winpercent` 不是数值型(请注意数值两侧带有引号,并且使用 `.` 作为分隔符),因此我们先解决这个问题,然后根据其值创建一个新变量:
# winpercent is stored as character with "." between digit groups
# (e.g. "66.971.725"). Strip the literal dots (fixed = TRUE, so "." is not
# treated as a regex wildcard) and convert the result to numeric.
Sweets.df$winpercent <- as.numeric(
  gsub(".", "", Sweets.df$winpercent, fixed = TRUE)
)

# Label each candy against the cut-off 43,078,911: strictly greater values
# are "winner", everything else is "loser". Rows whose winpercent is NA
# stay NA in result.
Sweets.df$result <- ifelse(Sweets.df$winpercent > 43078911, "winner", "loser")