奇怪的韦卡副作用

时间:2013-04-09 18:19:40

标签: scala machine-learning nlp weka side-effects

我有一个成功运行的weka分类器(在scala中)。 我正在尝试实施第二个分类器与它一起工作。

在训练期间,我正在设置第一个分类器的实例,如下所示:

val data = new Instances("Actions",attrs,10)

这很好用。但是,只需添加以下行就会破坏程序。

val tagdata = new Instances("Tags",tagattrs,10)

添加此行时,程序会在更新第一个分类器时崩溃 从字面上看,工作和非工作之间的唯一变化是对该行的补充。从来没有使用Tagdata(对于那些阅读我上一个问题的人来说,使用它的一切都被注释掉了),程序甚至没有在添加的行上崩溃。这使得创建第二组实例看起来有一些奇怪的副作用。分类器不能并排训练吗?

这是它抛出的错误:

java.lang.ArrayIndexOutOfBoundsException: 75
at weka.estimators.DiscreteEstimator.addValue(DiscreteEstimator.java:89)
at weka.classifiers.bayes.NaiveBayes.updateClassifier(NaiveBayes.java:322)
at Parser$.addInstance$1(Parser.scala:253)
...

这是一个工作子样本(只要你在./lib/中有weka.jar,在./resources/中有一个conll格式测试文件)。取消注释val tagdata...行将产生上述问题。

import scala.collection.mutable.Queue
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.HashMap
import scala.collection.mutable.MultiMap
import scala.collection.mutable.MutableList
import scala.collection.mutable.Stack
import scala.actors.Actor
import scala.actors.Actor._
import scala.io.Source
import java.io.File
import scala.util.Random
import weka.core._
import weka.classifiers.bayes.NaiveBayesUpdateable
import java.net.URL

object Parser{
  //TypeDefs
  type Arc = (Int, String) //Index, Label
  type Tag = (String, Double) //Tag, probability
  type Token = (Int, String, Int, String) //Index, Word, Head, Label
  //Running params
  val testing = true
  var training = true
  var parsing = true
  val testFile = "test.conll"
  val chooser = new wekaChooser()
  //Data structs
  val storedTokens = MutableList[Token]()
  val buffer = Queue[Token]()
  var stack = Stack[Token]()
  val arcs = HashMap[Int, Arc]()
  val tags = HashMap[Int, String]()
  val classifier = new NaiveBayesUpdateable()
  val tagclassifier = new NaiveBayesUpdateable()
  val tagFrequencies = HashMap[String, List[Tag]]()
  val supplementalTagFrequencies = Map("eat" -> List(("VB", 1.0)), "beans" ->     List(("NN", 1.0)))
  val contextLists = Map("unknown" -> List(), "confirmation" -> List(("UH", 0.8), "right",     "correct", "right", "yes"))
  val context = ListBuffer[Tag]()
  val attrs = new FastVector(4)
  val tagattrs = new FastVector(4)
  var counter = 1

  def main(args: Array[String]){
    if(training) train(testFile)
  }

  def resetLists()={
    buffer.clear
    stack.clear
    stack.push((0, "ROOT", 0, "ROOT"))
    arcs.clear  
    storedTokens.clear
  }

  def applyAction(action:String, buffer:Queue[Token], stack:Stack[Token]):Unit= {
    action match{    
    case "QUIT" => {
      println("Could not parse :(")
      buffer.clear
      stack.clear
    }
    case "SHIFT" => {
      stack.push(buffer.front)
      buffer.dequeue
    }
    case "REDUCE" => {
     // tags.put(stack.top._1, 
       storedTokens += stack.pop
    }
    case x =>{
      x.split("~")match{
        case Array(action, label) => 
          if(action=="LEFTARC"){
            arcs.put(stack.top._1, (buffer.front._1, label))
            storedTokens += stack.pop
          }
          if(action=="RIGHTARC"){
            arcs.put(buffer.front._1, (stack.top._1, label))
            applyAction("SHIFT", buffer, stack)
          }}}}
      }

def train(testFile:String)={
  var wordsmap = HashMap[Int, String]()

def leftDependents(tokenindex:Int) = {
  arcs.filter(_._2._1==tokenindex).filterKeys(_<tokenindex).keys.toList
}

def addInstance(action:String)={
 val data = new Instances("Actions",attrs,10)
//     val tagdata = new Instances("Tags",tagattrs,10)
 data.setClassIndex(2)
 //tagdata.setClassIndex(2)
 val instance = new Instance(3)
//     val taginstance = new Instance(3)
 instance.setDataset(data)     
 //taginstance.setDataset(tagdata)     

 val wordshead = wordsmap.size match{
   case 0 => "NIL"
   case _ => {
     wordsmap(buffer.front._1)
   }
 }

 List(stack.top._2, buffer.front._2, action).zipWithIndex.foreach{
   case (attrString:String, index)=> attrs.elementAt(index) match{
     case attr:Attribute => {
       instance.setValue(attr,attrString)
        }
     }         
   }

 println(instance)
  //   println(taginstance)

//     tagclassifier.updateClassifier(taginstance)
 classifier.updateClassifier(instance)
 }

def setupClassifier = {
 val acts = List("LEFTARC~", "RIGHTARC~")  
 val List(words:List[String], cpostags:List[String], deprels:List[String]) =   Source.fromURL(getClass.getResource("resources/"+testFile)).getLines.toList.map{
   line => line.split("\t") match{
     case Array(id, word, _, cpostag, _, _, head, deprel, _, _) => (word, cpostag, deprel)
     case _ => ("","","")
   }
 }.unzip3.productIterator.toList.map{
   case x:List[String] => (x.toSet++Set("ROOT", "NIL")).toList
   case _ => List()
 }

 val classWords = new FastVector(words.length)
 for(word <- words) classWords.addElement(word)
 val classTags = new FastVector(cpostags.length)
 for(tag <- cpostags) classTags.addElement(tag)

 val stacktag = new Attribute("stacktag", classTags)
 val buffertag = new Attribute("buffertag", classTags)
 val wordattr = new Attribute("word", classWords)

 val fvaction = new FastVector(deprels.length*2+2)
 fvaction.addElement("SHIFT")
 fvaction.addElement("REDUCE")
 for(act <- acts; deprel <- deprels) fvaction.addElement(act+deprel)     
 val action = new Attribute("action", fvaction)

 attrs.addElement(stacktag)
 attrs.addElement(buffertag)
 attrs.addElement(action)     

 tagattrs.addElement(stacktag)
 tagattrs.addElement(wordattr)
 tagattrs.addElement(buffertag)

 val structure =  new Instances("Actions", attrs, 10)
 structure.setClassIndex(2)
 classifier.buildClassifier(structure)

 val tagstructure =  new Instances("Tags", tagattrs, 10)
 tagstructure.setClassIndex(2)
 tagclassifier.buildClassifier(tagstructure)
 }    
//The Code
println("training")
//Based on Malt's oracle prediction training. Assisted in finding that by Rehj.
val wordtaglist = HashMap[String, ListBuffer[String]]()
resetLists
setupClassifier    
Source.fromURL(getClass.getResource("resources/"+testFile)).getLines.foreach{
 line =>{
   if(!line.isEmpty)
     line.split("\t") match{
       case Array(id, word, _, cpostag, _, _, head, deprel, _, _) =>{
         wordtaglist.getOrElseUpdate(word, ListBuffer[String]())+=cpostag //update tag counts
         buffer+=((id.toInt, cpostag, head.toInt, deprel))
         wordsmap+=(id.toInt -> word)
         while(buffer.nonEmpty){
           val action = {
             if(!(stack.top._1==0) && (stack.top._3 == buffer.front._1))
               "LEFTARC~"+stack.top._4
             else if(buffer.front._3 == stack.top._1)
               "RIGHTARC~"+buffer.front._4
             else if(leftDependents(buffer.front._1).nonEmpty && leftDependents(buffer.front._1)(0) < stack.top._1)
               "REDUCE"
             else if(buffer.front._3 < stack.top._1) //&& buffer.front._3 is not root or handling roots normally
               "REDUCE"
             else "SHIFT"
           }
           addInstance(action)
           applyAction(action, buffer, stack)
         }
       }
       case _ => println("line didn't match")
     }else {
       resetLists
       wordsmap.clear
     }
 }
}
resetLists
tagFrequencies++=wordtaglist.map{ case(k,v) => (k, v.groupBy(x=>x).map(x=>((x._1,  x._2.length*1.0/v.length))).toList)}
training = false
}
}

0 个答案:

没有答案