使用 RTextTools 库进行机器学习

时间:2017-03-12 10:22:13

标签: r svn machine-learning

我有以下训练集:

    Text,y
    MRR 93345,1
    MRR 93434,1
    MRR 93554,1
    MRR 938900,1
    MRR 93970,1
    MRR 937899,1
    MRR 93868,1
    MRR 938769,1
    MRR 93930,1
    MRR 92325,1
    MRR 931932,1
    MRR 933922,1
    MRR 934390,1
    MRR 93204,1
    MRR 93023,1
    MRR 930982,1
    MRR 87678,-1
    MRR 87956,-1
    MRR 87890,-1
    MRR 878770,-1
    MRR 877886,-1
    MRR 87678367,-1
    MRR 8790,-1
    MRR 87345,-1
    MRR 87149,-1
    MRR 873790,-1
    MRR 873493,-1
    MRR 874303,-1
    MRR 874343,-1
    MRR 874304,-1
    MRR 879034,-1
    MRR 879430,-1
    MRR 87943,-1
    MRR 879434,-1
    MRR 871984,-1
    MRR 873949,-1

我的代码如下:

# Build the document-term matrix from the training texts
dtMatrix <- create_matrix(
  data["Text"],
  language = "english",
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Wrap the matrix and its labels in a container; every row is a training row
container <- create_container(
  dtMatrix,
  data$y,
  trainSize = 1:nrow(dtMatrix),
  virgin = FALSE
)

# Fit an SVM with a linear kernel and cost 1
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# The new text to classify
predictionData <- list("MRR 93111")

# Build the prediction document-term matrix, passing the training matrix
# as originalMatrix
predMatrix <- create_matrix(
  predictionData,
  originalMatrix = dtMatrix,
  language = "english",
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Container for the prediction rows; the labels are placeholder zeros
predSize <- length(predictionData)
predictionContainer <- create_container(
  predMatrix,
  labels = rep(0, predSize),
  testSize = 1:predSize,
  virgin = FALSE
)

# Classify the new text with the trained model
results <- classify_model(predictionContainer, model)

现在我想用 train_model 函数训练出的模型来预测:“MRR 93111”的 y 应为 1。也就是说,如果字符串以“MRR 93”开头,输出应为 1;而以“MRR 87”开头,则输出应为 -1。但实际上它并不起作用,我得到的结果是:MRR 93111 → -1(概率 0.5778781)

此外,如果我以不同方式对训练集进行排序,或者如果我针对相同的数据集多次运行该脚本,那么结果似乎会发生变化,这对我来说听起来很奇怪。

UPDATE1:dput(data)

structure(list(Text = structure(c(26L, 28L, 30L, 34L, 36L, 31L, 
32L, 33L, 35L, 21L, 24L, 27L, 29L, 25L, 22L, 23L, 10L, 20L, 14L, 
13L, 12L, 11L, 15L, 3L, 1L, 5L, 4L, 7L, 9L, 8L, 16L, 18L, 17L, 
19L, 2L, 6L), .Label = c("MRR 87149", "MRR 871984", "MRR 87345", 
"MRR 873493", "MRR 873790", "MRR 873949", "MRR 874303", "MRR 874304", 
"MRR 874343", "MRR 87678", "MRR 87678367", "MRR 877886", "MRR 878770", 
"MRR 87890", "MRR 8790", "MRR 879034", "MRR 87943", "MRR 879430", 
"MRR 879434", "MRR 87956", "MRR 92325", "MRR 93023", "MRR 930982", 
"MRR 931932", "MRR 93204", "MRR 93345", "MRR 933922", "MRR 93434", 
"MRR 934390", "MRR 93554", "MRR 937899", "MRR 93868", "MRR 938769", 
"MRR 938900", "MRR 93930", "MRR 93970"), class = "factor"), Y = c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, -1L, 
-1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L, 
-1L, -1L, -1L, -1L, -1L, -1L)), .Names = c("Text", "Y"), class = "data.frame", row.names = c(NA, 
-36L))

1 个答案:

答案 0 :(得分:1)

问题在于,您的代码是在单词(词项)级别上处理训练数据并进行分类的。

> dtMatrix$dimnames$Terms
 [1] "87149"    "871984"   "87345"    "873493"   "873790"   "873949"   "874303"   "874304"   "874343"   "87678"    "87678367"
[12] "877886"   "878770"   "87890"    "8790"     "879034"   "87943"    "879430"   "879434"   "87956"    "92325"    "93023"   
[23] "930982"   "93111"    "931932"   "93204"    "93345"    "933922"   "93434"    "934390"   "93554"    "937899"   "93868"   
[34] "938769"   "938900"   "93930"    "93970"    "mrr"

我并不完全确定 SVM 究竟是如何处理这些数字字符串的,但它似乎并不关心字符串中的“93”部分。将字符串拆分成单个字符(character)会让各个数字产生更大的影响:

# Put a space between every character of each string so that each character
# becomes its own token. as.character() makes this work when Text is still a
# factor (strsplit() only accepts character input); vapply() guarantees a
# character vector result, including for zero-length input.
df$Text <- vapply(
  strsplit(as.character(df$Text), split = ""),
  paste,
  character(1),
  collapse = " "
)

我使用 df 而不是 data 作为变量名,因为 data 已经是 RTextTools 中的一个对象,在运行代码时给我带来了一些问题。另外,在创建矩阵时,必须更改最小词长(minWordLength)选项。

dtMatrix <- create_matrix(
  df$Text,
  language = "english",
  minWordLength = 1, # changed option: allow terms that are one character long
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

现在我们得到:

> dtMatrix$dimnames$Terms

 [1] "0" "1" "2" "3" "4" "5" "6" "7" "8" "9" "M" "R"

更重要的是:

> results 
  SVM_LABEL  SVM_PROB
1         1 0.9144185

我最近参加了一个关于 RTextTools 和 SVM 的研讨会,讲师提到,使用 SVM 时,每次训练模型得到的结果都会略有不同。我并不完全清楚其中的原因,所以不打算尝试解释;不过他们向我们推荐了一本免费的书《An Introduction to Statistical Learning with Applications in R》(统计学习导论:基于 R 的应用),可以通过它来了解支持向量机。

以下是完整代码:

# Training data: 36 labelled strings. Text is stored as a factor (integer
# codes into .Label); Y is the class label, 1 for the first 16 rows and -1
# for the remaining 20.
df <- structure(
  list(
    Text = structure(
      c(
        26L, 28L, 30L, 34L, 36L, 31L, 32L, 33L, 35L, 21L, 24L, 27L,
        29L, 25L, 22L, 23L, 10L, 20L, 14L, 13L, 12L, 11L, 15L, 3L,
        1L, 5L, 4L, 7L, 9L, 8L, 16L, 18L, 17L, 19L, 2L, 6L
      ),
      .Label = c(
        "MRR   87149", "MRR 871984", "MRR 87345", "MRR 873493",
        "MRR 873790", "MRR 873949", "MRR 874303", "MRR 874304",
        "MRR 874343", "MRR 87678", "MRR 87678367", "MRR 877886",
        "MRR 878770", "MRR 87890", "MRR 8790", "MRR 879034",
        "MRR 87943", "MRR 879430", "MRR 879434", "MRR 87956",
        "MRR 92325", "MRR 93023", "MRR 930982", "MRR 931932",
        "MRR 93204", "MRR 93345", "MRR 933922", "MRR 93434",
        "MRR 934390", "MRR 93554", "MRR 937899", "MRR 93868",
        "MRR 938769", "MRR 938900", "MRR 93930", "MRR 93970"
      ),
      class = "factor"
    ),
    Y = rep(c(1L, -1L), times = c(16L, 20L))
  ),
  .Names = c("Text", "Y"),
  class = "data.frame",
  row.names = c(NA, -36L)
)

# strsplit() only accepts character input, so drop the factor encoding first.
df$Text <- as.character(df$Text)

# New data: append the unlabelled document as the last row. Its label is NA
# rather than "" so that Y stays an integer column instead of being silently
# coerced to character.
df[nrow(df) + 1L, ] <- list("MRR    93111", NA_integer_)

# Put a space between every character of each string so that each character
# becomes its own token. vapply() guarantees a character vector result.
df$Text <- vapply(
  strsplit(df$Text, split = ""),
  paste,
  character(1),
  collapse = " "
)

# Build the document-term matrix; minWordLength = 1 allows terms that are
# a single character long
dtMatrix <- create_matrix(
  df$Text,
  language = "english",
  minWordLength = 1,
  removePunctuation = TRUE,
  stripWhitespace = TRUE,
  toLower = TRUE,
  removeStopwords = TRUE,
  stemWords = TRUE,
  removeSparseTerms = .998
)

# Inspect the terms and the document ids of the matrix
dtMatrix$dimnames$Terms
dtMatrix$dimnames$Docs

# Configure the container (built once; the original created it twice with
# identical arguments). Every row of df except the last is labelled training
# data; the last row is the new document appended above, which is the one to
# classify. virgin = TRUE because that row has no known label.
container <- create_container(
  dtMatrix,
  labels = df$Y,
  trainSize = seq_len(nrow(df) - 1L),
  testSize = nrow(df),
  virgin = TRUE
)

# Fix the RNG seed so that repeated runs of the script can be compared.
# NOTE(review): presumably the run-to-run variation in SVM_PROB comes from
# random data partitioning inside the SVM probability estimation -- confirm
# against the e1071/LIBSVM documentation.
set.seed(1234)

# Train an SVM model with a linear kernel and cost 1 on the training rows
model <- train_model(container, "SVM", kernel = "linear", cost = 1)

# Classify the test row(s) of the same container with the trained model
results <- classify_model(container, model)

results