在SOM包中,所有实体都被预测为同一类

时间:2016-10-06 11:53:08

标签: r machine-learning datagrid unsupervised-learning self-organizing-maps

每当我运行下面的代码时,它都会把所有测试用例预测为同一个类。我在构建网格时添加了更多参数,但预测结果仍然相同。有人可以帮我解决这个问题吗?

library(kohonen)
library(tm) 
library(qdap)
library(SnowballC)
# Load the product table; the first CSV row supplies the column names.
dataSet <- read.csv(file = "E:/TempDataSetWithAttributes.csv", header = TRUE)
# Preview the first rows of the loaded table.
head(dataSet)
##   **nItemId**                **sUnSpsc**
## 1 7440421 26121609 Network cable
## 2 7440442 26121609 Network cable
## 3 7440522 26121609 Network cable
## 4 7440623 26121609 Network cable
## 5 7460893 26121609 Network cable
## 6 7462277 26121609 Network cable
##                                                                                                                                                                                                                                                **ProductDesctiption**
## 1 Copper cable, category 6A F/FTP, low smoke zero halogen (LSZH), 4-pair, conductors are 23 AWG with PE insulation, twisted in pairs, wrapped in foil, surrounded by an overall metallic foil shield and protected by a low smoke, flame retardant LSZH jacket, w
## 2 Category 6A, Low Smoke Zero Halogen (LSZH), 4-pair, F/FTP copper cable. Copper conductors are 23 AWG with PE insulation. Conductors are twisted in pairs, wrapped in foil, surrounded by an overall metallic foil shield and protected by a low smoke, flame re
## 3 Category 6A, Low Smoke Zero Halogen (LSZH), 4-pair, F/FTP copper cable. Copper conductors are 23 AWG with PE insulation. Conductors are twisted in pairs, wrapped in foil, surrounded by an overall metallic foil shield and protected by a low smoke, flame re
## 4 Category 6A, Low Smoke Zero Halogen (LSZH), 4-pair, F/FTP copper cable. Copper conductors are 23 AWG with PE insulation. Conductors are twisted in pairs, wrapped in foil, surrounded by an overall metallic foil shield and protected by a low smoke, flame re
## 5                                Category 6A 4-pair, 23 AWG U/UTP copper cable, LSZH (IEC60332-1), blue.|Length: 1000 FT|Construction: LSZH PVC|Color: Blue|Number Of Pins: 4|Brand Name: Panduit|Outside Diameter: 0.285 IN|Type: NetKey Cable|Sub Brand: NetKey
## 6 Shielded marine MUD-resistant copper cable, category 7 S/FTP, low smoke zero halogen (LSZH), 4-pair, conductors are 22 AWG construction with foamed PE insulation, twisted in pairs, each surrounded by a foil, covered with an overall braided shield, within

**CREATING CORPUS**
# Build a tm corpus with one document per product description.
documents <- Corpus(VectorSource(dataSet[["ProductDesctiption"]]))

**PRE-PROCESSING**
# Normalise the text step by step. Every transformation is registered lazily
# (lazy = TRUE), exactly as in the original pipeline, and whitespace is
# collapsed again after each step that can leave gaps behind.

# Lower-case everything so that "Cable" and "cable" become the same term.
documents <- tm_map(documents, content_transformer(tolower), lazy = TRUE)
# Drop punctuation characters.
documents <- tm_map(documents, removePunctuation, lazy = TRUE)
documents <- tm_map(documents, stripWhitespace, lazy = TRUE)
# Drop digits.
documents <- tm_map(documents, removeNumbers, lazy = TRUE)
documents <- tm_map(documents, stripWhitespace, lazy = TRUE)
# Drop English stop words.
documents <- tm_map(documents, removeWords, stopwords("english"), lazy = TRUE)
documents <- tm_map(documents, stripWhitespace, lazy = TRUE)
# Reduce the remaining words to their stems.
documents <- tm_map(documents, stemDocument, language = "english", lazy = TRUE)
documents <- tm_map(documents, stripWhitespace, lazy = TRUE)

**CREATION OF DOCUMENT-TERM MATRIX**
# Count term occurrences per document (rows = documents, columns = terms).
documentTermMatrix <- DocumentTermMatrix(x = documents)


**Create Data Matrix**
# Convert the document-term matrix into an ordinary numeric matrix.
documentTermMatrixFrame <- as.matrix(documentTermMatrix)
# Peek at the first few term counts of document 40.
head(documentTermMatrixFrame[40, ])
##         aaa        abov         abs    absbrand absmounting      accept 
##           0           0           0           0           0           0


**Create Training set**
# Randomly pick 750 row indices for training; the remaining rows form the
# test set.
training <- sample(x = nrow(documentTermMatrixFrame), size = 750)
**Scaling of Document Term Matrix**

# Standardise every term column of the training rows (zero mean, unit sd).
Scaledtraining <- scale(documentTermMatrixFrame[training, ])
# Keep only the columns that still hold at least one non-NA value after
# scaling; columns that are NA in every row are dropped.
ScaledNonNAtraining <- Scaledtraining[, colSums(!is.na(Scaledtraining)) > 0]
**Unsupervised learning using Self-Organizing Maps**

# Train a self-organising map on the scaled training data using a 5 x 5
# rectangular grid (25 units).
som.wines <- som(
  ScaledNonNAtraining,
  grid = somgrid(xdim = 5, ydim = 5, topo = "rectangular")
)
**Scaling of Test Set**

# Standardise the held-out rows with the TRAINING means and standard
# deviations (stored by scale() as attributes on Scaledtraining). Scaling the
# test set with its own statistics would put it in a different feature space
# from the one the SOM codebook vectors were learned in.
Xtest <- scale(
  documentTermMatrixFrame[-training, , drop = FALSE],
  center = attr(Scaledtraining, "scaled:center"),
  scale = attr(Scaledtraining, "scaled:scale")
)
# Keep exactly the columns that were kept for training, in the same order, so
# the test matrix lines up column-for-column with the data the map was fitted
# on. Deciding the columns from the test set itself could drop a different
# set of terms.
ScaledXtest <- Xtest[
  ,
  colSums(is.na(Scaledtraining)) != nrow(Scaledtraining),
  drop = FALSE
]


# Class label of every row.
x <- dataSet[["sUnSpsc"]]
# Item identifier of every row.
y <- dataSet[["nItemId"]]

**Prediction using SOM trained model**
# Map every test row onto the trained SOM. The training labels are passed as
# a factor through trainY so that predict() can return class predictions.
som.prediction <- predict(
  som.wines,
  newdata = ScaledXtest,
  trainY = as.factor(x[training])
)

# Print the unit.classif component for the test rows
# (presumably the index of the winning map unit per row - confirm against the
# kohonen documentation for the installed version).
som.prediction$unit.classif
##   [1]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
##  [24]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
##  [47]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
##  [70]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5 25
##  [93]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [116]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [139]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [162]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [185]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [208]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
## [231]  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5
**Accuracy calculation**
# Compare predicted CLASS labels with the true labels of the test rows.
# The original tabulated som.prediction$unit.classif, which was printed above
# as map-unit numbers (5, 25, ...), not class labels. That table is not a
# confusion matrix and its diagonal carries no meaning.
# NOTE(review): assumes the kohonen 2.x predict() return value, where
# $prediction holds one predicted class label per test row - confirm for the
# installed version.
actual_class <- as.character(x[-training])
predicted_class <- as.character(som.prediction$prediction)

# Give both factors the same ordered level set so the table is square and the
# diagonal cells really are the correctly classified rows. sort() drops NA, so
# rows without a prediction are left out of the table and count as errors.
class_levels <- sort(unique(c(actual_class, predicted_class)))
confusion.mat <- table(
  "Predictions" = factor(predicted_class, levels = class_levels),
  Actual = factor(actual_class, levels = class_levels)
)
resultinmatrix <- as.data.frame.array(confusion.mat)

# Percentage of test rows whose predicted class equals the actual class.
accuracy <- sum(diag(confusion.mat)) / nrow(Xtest) * 100
accuracy

0 个答案:

没有答案