Question

我是 R 的新手。我正在尝试使用 lda 对生成的网格中的所有点进行分类。训练集是使用 rmvnorm(n, mean, sigma) 随机生成的两组点。这是我的代码：

 # number of samples
n <- 100

# Parameters of group 1 (G1): mean as a 1 x 2 row matrix and a 2 x 2
# identity covariance matrix (written row by row).
meanG1 <- matrix(c(2, 2), nrow = 1)
sigmaG1 <- rbind(
  c(1, 0),
  c(0, 1)
)

library(mvtnorm)  

# Draw n samples for group 1 from a bivariate normal distribution.
# G1 is an n x 2 matrix of coordinates. No label is assigned here: writing to
# a third column that does not exist in a matrix fails with
# "subscript out of bounds", and the label column is appended further down
# with cbind().
G1 <- rmvnorm(n, mean = meanG1, sigma = sigmaG1)

# Parameters of group 2 (G2): mean as a 1 x 2 row matrix and a 2 x 2
# covariance matrix with off-diagonal entries 0.75 (written row by row).
meanG2 <- matrix(c(0, 0), nrow = 1)
sigmaG2 <- rbind(
  c(1, 0.75),
  c(0.75, 1)
)

# Draw n samples for group 2 from a bivariate normal distribution.
G2 <- rmvnorm(n, mean = meanG2, sigma = sigmaG2)

# Append the class label as an extra column: 1 for G1, 2 for G2.
G1 <- cbind(G1, 1)
G2 <- cbind(G2, 2)

# Stack both groups into one matrix and convert it to a data frame
# (the columns show up as V1, V2, V3 in the printed data).
G <- rbind(G1, G2)
bothGroupsWithLabel <- as.data.frame(G)

# Put the rows in random order.
bothGroupsWithLabel <- bothGroupsWithLabel[
  sample(nrow(bothGroupsWithLabel)),
]

# Plot both generated groups: G1 in red, G2 in blue.
# The axis limits are computed from both groups. plot() on its own sizes the
# axes to G1 only, and points() would then silently clip every G2 point that
# falls outside that range.
plot(c(G1[, 1]), c(G1[, 2]), col = "red",
     xlim = range(G1[, 1], G2[, 1]),
     ylim = range(G1[, 2], G2[, 2]))
points(c(G2[, 1]), c(G2[, 2]), col = "blue")

# Build a K x K grid of points. Each axis runs from the minimum to the
# maximum of the corresponding G1 coordinate, in K equally spaced steps.
K <- 40
seqx1 <- seq(min(G1[, 1]), max(G1[, 1]), length.out = K)
seqx2 <- seq(min(G1[, 2]), max(G1[, 2]), length.out = K)

# All combinations of the two axes, as a data frame with columns z1 and z2.
myGrid <- expand.grid(z1 = seqx1, z2 = seqx2)

# Show the grid points.
plot(myGrid[, 1], myGrid[, 2])

library(MASS)  

# Fit an LDA model: column 3 (label) as response, columns 1 and 2 as
# predictors. The formula indexes the data frame directly instead of using
# column names; this is the call the question is about.
model.lda <- lda(
  bothGroupsWithLabel[, 3] ~ bothGroupsWithLabel[, 1] + bothGroupsWithLabel[, 2],
  data = bothGroupsWithLabel
)

# Classify the grid points and keep the predicted class labels.
# With the model above, this predict() call is the one reported to warn:
# 'newdata' had 1600 rows but variables found have 200 rows
Ypred <- predict(model.lda, newdata = myGrid)
Ypredgrid <- Ypred$class

以下是我的数据 bothGroupsWithLabel 的一部分：

            V1         V2 V3
    69   2.0683949  0.5779272  1
    53   2.1261046  2.0420350  1
    118 -1.4502033 -1.4775360  2
    148  1.1705251  1.5437296  2
    195  0.3100763 -0.2594026  2
    40   1.8573633  3.7717020  1

以及 myGrid 的一部分：

             z1         z2
    1 0.1048024 -0.2034172
    2 0.2227540 -0.2034172
    3 0.3407055 -0.2034172
    4 0.4586571 -0.2034172
    5 0.5766086 -0.2034172
    6 0.6945602 -0.2034172

我的网格由 40 * 40 个点组成，因此 myGrid 数据框有 1600 行和 2 列。数据框 bothGroupsWithLabel 有 200 行和 3 列，前两列是点的坐标，第三列是标签。我的问题是，当我调用 predict(model.lda, newdata=myGrid) 时会收到以下警告消息： Warning message: 'newdata' had 1600 rows but variables found have 200 rows 我在这里遗漏了什么？有人能帮帮我吗？

Answer 1

问题在于您生成模型的方式。使用公式和data=...时，最好只使用变量名称。为了使其正常工作，您还必须在newdata中使变量名称匹配。因此，当您创建myGrid时，请添加以下行：

# Rename the grid columns so they match the predictor names used in the model.
colnames(myGrid) <- c("V1", "V2")

然后把最后几行改为：

# Fit LDA using bare column names, resolved inside `data`, so that predict()
# can find the same names in `newdata`.
model.lda <- lda(V3 ~ V1 + V2, data = bothGroupsWithLabel)

# Classify every grid point and keep the predicted class labels.
Ypred <- predict(model.lda, newdata = myGrid)
Ypredgrid <- Ypred$class

那应该得到你想要的。

使用R中的lda进行预测：警告消息：'newdata'有1600行，但找到的变量有200行

1 个答案: