weka中的自定义批量过滤器

时间:2015-05-22 22:55:59

标签: machine-learning weka data-mining

我正在尝试构建一个扩展SimpleBatchFilter的自定义批处理过滤器。但是,当我第二次运行它以获得反向输出时遇到了问题。以下是完成两次运行后我得到的错误和相关代码:

Exception in thread "main" java.lang.IndexOutOfBoundsException: Index: 79, Size: 79
    at java.util.ArrayList.rangeCheck(ArrayList.java:653)
    at java.util.ArrayList.get(ArrayList.java:429)
    at weka.core.Attribute.addStringValue(Attribute.java:994)
    at weka.core.StringLocator.copyStringValues(StringLocator.java:155)
    at weka.core.StringLocator.copyStringValues(StringLocator.java:91)
    at weka.filters.Filter.copyValues(Filter.java:373)
    at weka.filters.Filter.push(Filter.java:290)
    at weka.filters.SimpleBatchFilter.batchFinished(SimpleBatchFilter.java:266)
    at weka.filters.Filter.useFilter(Filter.java:667)
    at likeability.Main.main(Main.java:30)

以下是相关代码:

/**
 * Batch filter that splits the incoming data by every value of attribute
 * {@code a}, splits each of those pieces by every value of attribute
 * {@code b}, draws a random sample (without replacement) from every piece
 * and merges the samples into one data set.
 *
 * <p>With {@code invert} set, the sampler's selection is inverted, so the
 * same seed yields the complementary rows.
 */
public class TestFilter extends SimpleBatchFilter {

    private Attribute a;
    private Attribute b;
    private int sampleSizePercent = 15;
    private boolean invert = false;
    private int seed = 1;

    /**
     * Partitions the batch by {@code a} and then by {@code b}, and returns the
     * merged per-partition samples.
     *
     * @param inst the complete input batch
     * @return the merged sample
     * @throws Exception if one of the nested filters fails
     */
    @Override
    protected Instances process(Instances inst) throws Exception {
        List<Instances> partitions = new ArrayList<Instances>();
        for (Instances data : partition(inst, a)) {
            partitions.addAll(partition(data, b));
        }

        return getTestSet(partitions);
    }

    /**
     * Partitions the data so that each partition holds only one nominal value
     * of the attribute {@code att}.
     *
     * @param data the instances to split
     * @param att  the attribute whose values define the partitions
     * @return one data set per value of {@code att}, in value order
     * @throws Exception if the value filter cannot be applied
     */
    private List<Instances> partition(Instances data, Attribute att) throws Exception {
        List<Instances> instances = new ArrayList<Instances>();
        for (int i = 0; i < att.numValues(); i++) {
            RemoveWithValues rm = new RemoveWithValues();
            // The filter options take 1-based indices, hence the "+ 1".
            rm.setAttributeIndex(Integer.toString(att.index() + 1));
            rm.setInvertSelection(true);
            rm.setNominalIndices(Integer.toString(i + 1));
            rm.setInputFormat(data);
            instances.add(Filter.useFilter(data, rm));
        }
        return instances;
    }

    /**
     * Samples every partition and merges the samples into a single data set
     * that uses the header of the first partition.
     *
     * @param insts the partitions to sample; must contain at least one element
     * @return the merged sample
     * @throws Exception if the sampling filter cannot be applied
     */
    private Instances getTestSet(List<Instances> insts) throws Exception {
        Instances output = new Instances(insts.get(0), 0);

        for (Instances inst : insts) {
            Resample filter = new Resample();
            filter.setRandomSeed(seed);
            filter.setNoReplacement(true);
            filter.setInvertSelection(invert);
            filter.setSampleSizePercent(sampleSizePercent);
            filter.setInputFormat(inst);
            Instances curr = Filter.useFilter(inst, filter);
            System.out.println(inst.size() + " " + curr.size());
            // Rows of curr index their string values against curr's own
            // header; re-register them against the merged header first.
            // Otherwise the stored indices can exceed the merged header's
            // value list and pushing the result fails with an
            // IndexOutOfBoundsException in Attribute.addStringValue.
            remapStringValues(curr, output);
            output.addAll(curr);
        }
        return output;

    }

    /**
     * Rewrites the string-attribute values of every row in {@code source} so
     * that they index into the string value lists of {@code target}.
     *
     * NOTE(review): relational attributes are not remapped here; confirm
     * whether the data can contain them.
     *
     * @param source the rows to rewrite in place
     * @param target the data set the rows are about to be added to
     */
    private static void remapStringValues(Instances source, Instances target) {
        for (int attIndex = 0; attIndex < target.numAttributes(); attIndex++) {
            if (!target.attribute(attIndex).isString()) {
                continue;
            }
            for (int row = 0; row < source.numInstances(); row++) {
                if (source.instance(row).isMissing(attIndex)) {
                    continue;
                }
                // Read the text through the source header, then store the
                // index the same text has (or gets) in the target header.
                String text = source.instance(row).stringValue(attIndex);
                int newIndex = target.attribute(attIndex).addStringValue(text);
                source.instance(row).setValue(attIndex, (double) newIndex);
            }
        }
    }

    /**
     * The output has the same structure as the input.
     *
     * @param arg the input format
     * @return an empty data set with the header of {@code arg}
     */
    @Override
    protected Instances determineOutputFormat(Instances arg) throws Exception {
        return new Instances(arg, 0);
    }

    @Override
    public String globalInfo() {
        return "A filter which partitions the data so that each partition contains"
                + " only instances with one value of attribute a and b, then takes "
                + "a random subset of values from each partition and merges them to"
                + " produce the final set.";
    }

    /**
     * Accepts every attribute and class type, including data without a class.
     */
    @Override
    public Capabilities getCapabilities() {
        Capabilities result = super.getCapabilities();
        result.enableAllAttributes();
        result.enableAllClasses();
        result.enable(Capability.NO_CLASS);  // filter doesn't need class to be set
        return result;
    }
   //Main and getters and setters

}

这是我调用它的方式:

    // Configure the filter with the two attributes used for partitioning.
    // setA/setB/setInvert are among the setters elided from the class listing.
    TestFilter filter = new TestFilter();
    filter.setA(data.attribute("gender"));
    filter.setB(data.attribute("age"));
    filter.setInputFormat(data);
    // First pass: the sampled rows become the test set.
    Instances test = Filter.useFilter(data, filter);
    // Second pass over the same data with the selection inverted; the input
    // format is set again before reusing the filter instance.
    filter.setInvert(true);
    filter.setInputFormat(data);
    Instances train = Filter.useFilter(data, filter);

在我看来,必须在两次调用之间加上这两行是非常愚蠢的。我怀疑我应该使用isBatchFinished(),这是否意味着我必须扩展BatchFilter而不是SimpleBatchFilter来实现它?如果能看到一些成功的实现也会有所帮助,因为我能找到的唯一实现是WEKA手册中的那个。

1 个答案:

答案 0 :(得分:0)

我通过改为扩展Filter,并把process函数改成batchFinished()解决了这个问题。我发布这个答案,是因为我还没有在其他任何地方找到自定义过滤器的示例。

/**
 * Partitions the buffered input by attribute {@code a} and then by
 * attribute {@code b} when the batch is signalled as finished.
 *
 * NOTE(review): this snippet is truncated. The step that consumes
 * {@code partitions}, the boolean return and the closing brace are not
 * shown here.
 *
 * @throws NullPointerException if no input format has been set
 */
@Override
public boolean batchFinished() throws Exception {
    // Once isFirstBatchDone() reports true, switch to inverted sampling.
    // Nothing in this snippet resets invert to false afterwards.
    if(isFirstBatchDone()) {
        invert = true;
    }
    if (getInputFormat() == null)
        throw new NullPointerException("No input instance format defined");
    // The object returned by getInputFormat() is used as the data to split.
    Instances inst = getInputFormat();
    ArrayList<Instances> partitionsA = partition(inst, a);
    ArrayList<Instances> partitions = new ArrayList<Instances>();
    // Split every a-partition again by the values of b.
    for(Instances data: partitionsA) {
        partitions.addAll(partition(data, b));
    }

/**
 * Samples each partition without replacement and pushes every sampled row
 * onto this filter's output queue.
 *
 * @param insts the partitions to sample
 * @throws Exception if the sampling filter cannot be applied
 */
private void getTestSet(List<Instances> insts) throws Exception {
    for (Instances partition : insts) {
        Resample sampler = new Resample();
        sampler.setSampleSizePercent(sampleSizePercent);
        sampler.setInvertSelection(invert);
        sampler.setNoReplacement(true);
        sampler.setRandomSeed(seed);
        sampler.setInputFormat(partition);

        Instances sampled = Filter.useFilter(partition, sampler);
        System.out.println(partition.size() + " " + sampled.size());

        // Hand the rows over one by one, in their sampled order.
        for (int k = 0; k < sampled.numInstances(); k++) {
            push(sampled.instance(k));
        }
    }
}

/**
 * Registers the input format and immediately declares an output format
 * with the same header and no rows.
 *
 * @param arg the input format
 * @return always {@code true}
 * @throws Exception if the superclass rejects the input format
 */
@Override
public boolean setInputFormat(Instances arg) throws Exception {
    super.setInputFormat(arg);
    setOutputFormat(new Instances(arg, 0));
    return true;
}