我已经收到有关不同品牌糖果成分的数据集,以及有关价格(百分比),糖和利润(百分比)的信息。成分信息是虚拟变量,其中0表示不存在该特性,而1表示存在该特性。其他变量是数字,代表糖果的糖含量,产品的价格。每个产品的利润是一个字符变量。目标是选择一种统计方法来确定消费者的喜好并预测新产品。我想在R中实现此解决方案。
由于类别虚拟变量和类别变量winpercent引起了问题,我决定将所有变量转换为相同的数据类型
dpt(rbind(head(Candy.df,10),tail(Candy.df,10)))
dput(rbind(head(Candy.df, 10), tail(Candy.df, 10)))
structure(list(competitorname = c("100 Grand", "3 Musketeers",
"One dime", "One quarter", "Air Heads", "Almond Joy", "Baby Ruth",
"Boston Baked Beans", "Candy Corn", "Caramel Apple Pops", "Tootsie Roll Juniors",
"Tootsie Roll Midgies", "Tootsie Roll Snack Bars", "Trolli Sour Bites",
"Twix", "Twizzlers", "Warheads", "WelchÕs Fruit Snacks", "WertherÕs Original Caramel",
"Whoppers"), chocolate = c(1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L), fruity = c(0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L,
0L, 0L), caramel = c(1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L), peanutyalmondy = c(0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), nougat = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), crispedricewafer = c(1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L), hard = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L), bar = c(1L, 1L, 0L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L), pluribus = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L), sugarpercent = c(0.73199999,
0.60399997, 0.011, 0.011, 0.90600002, 0.465, 0.60399997, 0.31299999,
0.90600002, 0.60399997, 0.31299999, 0.17399999, 0.465, 0.31299999,
0.546, 0.22, 0.093000002, 0.31299999, 0.186, 0.87199998), pricepercent = c(0.86000001,
0.51099998, 0.116, 0.51099998, 0.51099998, 0.76700002, 0.76700002,
0.51099998, 0.32499999, 0.32499999, 0.51099998, 0.011, 0.32499999,
0.255, 0.90600002, 0.116, 0.116, 0.31299999, 0.26699999, 0.84799999
), winpercent = c("66.971.725", "67.602.936", "32.261.086", "46.116.505",
"52.341.465", "50.347.546", "56.914.547", "23.417.824", "38.010.963",
"34.517.681", "43.068.897", "45.736.748", "49.653.503", "47.173.229",
"81.642.914", "45.466.282", "39.011.898", "44.375.519", "41.904.308",
"49.524.113")), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 77L, 78L, 79L, 80L, 81L, 82L, 83L, 84L, 85L, 86L), class = "data.frame")
Candy.df <- read.csv("Candy.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)
Candy.df <- sapply(Candy.df, as.factor)
dis <- lda(winpercent~chocolate+fruity+caramel+peanutyalmondy+nougat+crispedricewafer+bar+
pluribus+hard+sugarpercent+pricepercent,data= Candy.df)
Error in eval(predvars, data, env) : Object 'chocolate' not found