慢速文本预处理

时间:2015-11-19 07:51:09

标签: java performance

有人能帮我看看为什么我的程序运行得这么慢吗?处理 650 个文档需要 130 分钟。怎样才能让这个程序的预处理性能更好?下面是主类:

   /**
    * Program entry point: reads the documents, collects the vocabulary, computes
    * the per-document weights and writes the TF and IDF tables to CSV.
    *
    * @param args command-line arguments (not used)
    * @throws IOException   propagated from {@code bacatxt} / {@code doktermlist}
    * @throws BiffException propagated from {@code bacatxt} / {@code doktermlist}
    * @throws Exception     declared by the original signature
    */
   public static void main(String[] args) throws IOException, BiffException, Exception {
    TA_baru pembaca = new TA_baru();

    // bacatxt() clears the list it is given, fills it and hands it back.
    ArrayList<dataset> dokumen = pembaca.bacatxt(new ArrayList<dataset>());

    // Distinct terms found across all documents.
    ArrayList<String> term = pembaca.doktermlist(dokumen);

    // The constructor does the heavy work: it evaluates every (document, term) pair.
    Himpunan_metoda_pembobotan hmp = new Himpunan_metoda_pembobotan(dokumen, term);
    hmp.tulistf();
    hmp.tulisidf();

}

/**
 * Collects the distinct terms of all documents.
 *
 * @param tt documents whose {@code getTerm()} lists are merged
 * @return a new list holding every distinct term once, in the iteration order
 *         of the intermediate {@link HashSet}
 */
public ArrayList<String> doktermlist(ArrayList<dataset> tt) throws IOException, BiffException {
    Set<String> unik = new HashSet<>();
    for (dataset dok : tt) {
        // The set silently drops terms that were already seen.
        unik.addAll(dok.getTerm());
    }
    return new ArrayList<String>(unik);
}

/**
 * Loads every regular file whose name ends in {@code .txt} from the data
 * folder and wraps each one in a {@code dataset} (file name + file content).
 *
 * @param x list to fill; it is emptied first and is also the return value
 * @return {@code x}, containing one {@code dataset} per matching file
 * @throws IOException if the data folder cannot be listed (missing, not a
 *                     directory, or an I/O error) or a file cannot be read
 */
public ArrayList<dataset> bacatxt(ArrayList<dataset> x) throws IOException, BiffException {
    x.clear();
    File folder = new File("C:\\Users\\burhan\\Documents\\NetBeansProjects\\TA_baru\\data");
    File[] listOfFiles = folder.listFiles();
    // listFiles() returns null (not an empty array) when the path is not a
    // directory or cannot be read; fail with a clear message instead of an NPE.
    if (listOfFiles == null) {
        throw new IOException("Cannot list data folder: " + folder.getAbsolutePath());
    }
    // NOTE(review): listFiles() gives no ordering guarantee, so the document
    // indices written to the CSV files may differ between runs/platforms.
    for (File file : listOfFiles) {
        if (file.isFile() && file.getName().endsWith(".txt")) {
            // NOTE(review): decodes with the platform default charset - confirm
            // the encoding of the input files.
            String content = FileUtils.readFileToString(file);
            x.add(new dataset(file.getName(), content));
        }
    }
    return x;
}

下面是 Himpunan_metoda_pembobotan 类:

public class Himpunan_metoda_pembobotan {

// dokumen: the documents handed to the constructor.
// positif / negatif: scratch lists refilled by every abcd() call with the
// documents whose category equals / differs from that of the document passed in.
ArrayList<dataset> dokumen = new ArrayList<>(), positif = new ArrayList<>(), negatif = new ArrayList<>();
// The term list handed to the constructor.
ArrayList<String> term = new ArrayList<>();
// Results: abcd() appends one Nilai per call.
ArrayList<Nilai> nabcd = new ArrayList<>();
// Counters produced by abcd(term, doc):
//   a = positif documents for which carikata(term) is false
//   b = positif documents for which carikata(term) is true
//   c = negatif documents for which carikata(term) is true
//   d = negatif documents for which carikata(term) is false
double a, b, c, d;

public Himpunan_metoda_pembobotan(ArrayList<dataset> dok, ArrayList<String> term) {
    dokumen = dok;
    this.term = term;
    for (int i = 0; i < dok.size(); i++) {
        for (int j = 0; j < term.size(); j++) {
            abcd(term.get(j), dokumen.get(i));
        }//this loop for assign value to nabcd 
        System.out.println(i);
    }

}

/**
 * Counts the distinct categories among the stored documents.
 *
 * @return number of different {@code getKategori()} values in {@code dokumen}
 */
public int jumlah_kategori() {
    Set<Object> kategoriUnik = new HashSet<>();
    for (dataset dok : dokumen) {
        kategoriUnik.add(dok.getKategori());
    }
    return kategoriUnik.size();
}

/**
 * Counts the documents in {@code y} whose {@code getTeks()} contains {@code x}.
 *
 * @param x value looked up with {@code getTeks().contains(x)}
 * @param y documents to inspect
 * @return number of documents for which the lookup succeeds
 */
public int jumlah_dokumen_terdapat_term(String x, ArrayList<dataset> y) {
    int jumlah = 0;
    for (dataset dok : y) {
        if (dok.getTeks().contains(x)) {
            jumlah += 1;
        }
    }
    return jumlah;
}

/**
 * Resets the scratch state used by {@code abcd}: empties the positif/negatif
 * lists and zeroes the four counters.
 */
public void clear() {
    positif.clear();
    negatif.clear();
    a = b = c = d = 0;
}
/**
 * Copies the stored a/b/c/d counts of a (term, category) pair into the
 * instance counters. Every entry of {@code nabcd} is examined, so when
 * several entries match, the last one wins; when none matches the counters
 * are left unchanged.
 *
 * @param tem      term to look up (compared with {@code getKata()})
 * @param kategori category to look up (compared with {@code getKategori()})
 */
public void carinilaiabcd(String tem,String kategori ){
    for (Nilai nilai : nabcd) {
        boolean cocok = nilai.getKata().equals(tem) && nilai.getKategori().equals(kategori);
        if (!cocok) {
            continue;
        }
        a = nilai.getA();
        b = nilai.getB();
        c = nilai.getC();
        d = nilai.getD();
    }
}
/**
 * Computes the contingency counts of term {@code x} with respect to the
 * category of document {@code y}, and appends the result to {@code nabcd}.
 *
 * <p>After the call:
 * <ul>
 *   <li>{@code positif} / {@code negatif} hold the documents whose category
 *       equals / differs from {@code y.getKategori()};</li>
 *   <li>{@code a} = positif documents without the term, {@code b} = positif
 *       documents with the term, {@code c} = negatif documents with the term,
 *       {@code d} = negatif documents without the term (as decided by
 *       {@code carikata});</li>
 *   <li>a new {@code Nilai} carrying these counts plus its idf and tf has been
 *       added to {@code nabcd}.</li>
 * </ul>
 *
 * <p>The corpus is walked once and {@code carikata} is evaluated once per
 * document; the previous version walked it twice and evaluated
 * {@code carikata} twice per document. This assumes {@code carikata} and
 * {@code getKategori} have no side effects.
 *
 * @param x term to count
 * @param y document whose category defines the positive class
 */
public void abcd(String x, dataset y) {
    clear();
    // Loop-invariant: the category every document is compared against.
    Object kategoriTarget = y.getKategori();
    for (int i = 0; i < dokumen.size(); i++) {
        dataset dok = dokumen.get(i);
        boolean adaKata = dok.carikata(x);
        if (dok.getKategori().equals(kategoriTarget)) {
            positif.add(dok);
            if (adaKata) {
                b++;
            } else {
                a++;
            }
        } else {
            negatif.add(dok);
            if (adaKata) {
                c++;
            } else {
                d++;
            }
        }
    }
    Nilai n = new Nilai(a, b, c, d, x, y.getKategori());
    n.setIdf(idf(n));
    n.setTf(tf(x, y));
    nabcd.add(n);
}

/**
 * Term frequency: delegates to {@code data.jumlah_term(term)}.
 *
 * @param term term to count
 * @param data document to count it in
 * @return the value reported by {@code jumlah_term}, as a double
 */
public double tf(String term, dataset data) {
    double frekuensi = data.jumlah_term(term);
    return frekuensi;
}
/**
 * Computes {@code 1 - 1 / (1 + tf)} for the given term and document.
 *
 * @param term term to weight
 * @param data document the term frequency is taken from
 * @return the value of the formula above
 */
public double itf(String term, dataset data) {
    return 1 - (1 / (1 + tf(term, data)));
}
/**
 * Computes {@code ln(2 + b / c)} from the counts stored in {@code n}, using a
 * divisor of 1 when {@code c} is below 1.
 *
 * @param term not used by the computation
 * @param n    counts to read {@code b} and {@code c} from
 * @return the value described above; 0 when neither comparison on
 *         {@code getC()} holds
 */
public double rf(String term, Nilai n) {
    if (n.getC() < 1) {
        // Avoid dividing by a count below 1.
        return Math.log(2 + (n.b / 1));
    }
    if (n.getC() > 0) {
        return Math.log(2 + (n.b / n.c));
    }
    return 0;
}
/**
 * Product of {@code tf(term, data)} and {@code rf(term, n)}.
 *
 * @param term term to weight
 * @param n    counts passed to {@code rf}
 * @param data document passed to {@code tf}
 * @return tf multiplied by rf
 */
public double tfrf(String term, Nilai n,dataset data) {
    return tf(term, data) * rf(term, n);
}
/**
 * Computes {@code log10(dokumen.size() / (b + c))} from the counts in {@code n}.
 *
 * @param n counts to read {@code b} and {@code c} from
 * @return base-10 logarithm of the document count divided by {@code b + c}
 */
public double idf(Nilai n) {
    return Math.log10(dokumen.size() / (n.b + n.c));
}

/**
 * Product of {@code tf(term, data)} and {@code idf(n)}.
 *
 * @param term term to weight
 * @param data document passed to {@code tf}
 * @param n    counts passed to {@code idf}
 * @return tf multiplied by idf
 */
public double tf_idf(String term, dataset data,Nilai n) {
    return tf(term, data) * idf(n);
}

/**
 * Chi-square statistic of {@code term} for the category of {@code data}.
 *
 * <p>With the counters set by {@code abcd} (a = positive without term,
 * b = positive with term, c = negative with term, d = negative without term):
 * <pre>
 *   N * (b*d - a*c)^2 / ((a+d) * (b+c) * (a+b) * (c+d))
 * </pre>
 * The previous numerator was {@code (a*b - c*d)} and was not squared.
 *
 * <p>Side effect: calls {@code abcd}, which overwrites the counters and
 * appends an entry to {@code nabcd}. The result is NaN/Infinity when one of
 * the four marginal sums is zero.
 *
 * @param term term to score
 * @param data document whose category is the positive class
 * @return the chi-square value
 */
public double x2(String term, dataset data) {
    abcd(term, data);
    double selisih = (b * d) - (a * c);
    double penyebut = (a + d) * (b + c) * (a + b) * (c + d);
    return dokumen.size() * (selisih * selisih) / penyebut;
}

/**
 * Log odds ratio of {@code term} for the category of {@code data}.
 *
 * <p>With the counters set by {@code abcd} (a = positive without term,
 * b = positive with term, c = negative with term, d = negative without term)
 * the odds of the term are {@code b/a} in the positive class and {@code c/d}
 * in the negative class, so the ratio is {@code (b*d) / (a*c)}. The previous
 * denominator was {@code a*b}, which cancelled {@code b} out of the formula.
 *
 * <p>Side effect: calls {@code abcd}, which overwrites the counters and
 * appends an entry to {@code nabcd}. The result is infinite or NaN when a
 * count in the ratio is zero.
 *
 * @param term term to score
 * @param data document whose category is the positive class
 * @return natural logarithm of the odds ratio
 */
public double OR(String term, dataset data) {
    abcd(term, data);
    return Math.log((b * d) / (a * c));
}

/**
 * Information gain of {@code term} for the category of {@code data}, computed
 * from the counters set by {@code abcd} (a = positive without term,
 * b = positive with term, c = negative with term, d = negative without term):
 * class entropy minus the class entropy conditioned on term presence.
 *
 * <p>Terms of the form {@code p * ln(q)} with a zero count are taken as 0
 * (the usual 0 * ln 0 = 0 convention); previously any zero count turned the
 * whole result into NaN.
 *
 * <p>Side effect: calls {@code abcd}, which overwrites the counters and
 * appends an entry to {@code nabcd}.
 *
 * @param term term to score
 * @param data document whose category is the positive class
 * @return the information gain (natural-log units)
 */
public double IG(String term, dataset data) {
    abcd(term, data);
    double n = dokumen.size();
    return -weightedLog(b + a, n, n) - weightedLog(c + d, n, n)
            + weightedLog(b, b + c, n) + weightedLog(c, b + c, n)
            + weightedLog(a, a + d, n) + weightedLog(d, d + a, n);
}

/** Returns (count / n) * ln(count / total), or 0 when any argument is not positive. */
private static double weightedLog(double count, double total, double n) {
    if (count <= 0 || total <= 0 || n <= 0) {
        return 0;
    }
    return (count / n) * Math.log(count / total);
}

/**
 * NGL coefficient of {@code term} for the category of {@code data}.
 *
 * <p>With the counters set by {@code abcd} (a = positive without term,
 * b = positive with term, c = negative with term, d = negative without term):
 * <pre>
 *   sqrt(N) * (b*d - a*c) / sqrt((b+c) * (a+d) * |positif| * |negatif|)
 * </pre>
 * The previous numerator {@code (a*d - b*c)} multiplied the two
 * "without term" counts and the two "with term" counts, which does not match
 * how {@code abcd} assigns the counters.
 *
 * <p>Side effect: calls {@code abcd}, which overwrites the counters and
 * appends an entry to {@code nabcd}. The result is NaN/Infinity when the
 * denominator is zero.
 *
 * @param term term to score
 * @param data document whose category is the positive class
 * @return the NGL coefficient
 */
public double ngl(String term, dataset data) {
    abcd(term, data);
    double pembilang = Math.sqrt(dokumen.size()) * ((b * d) - (a * c));
    double penyebut = Math.sqrt((b + c) * (a + d) * positif.size() * negatif.size());
    return pembilang / penyebut;
}

/**
 * GSS coefficient of {@code term} for the category of {@code data}, in its
 * count form.
 *
 * <p>With the counters set by {@code abcd} (a = positive without term,
 * b = positive with term, c = negative with term, d = negative without term)
 * the coefficient is {@code b*d - a*c}. The previous expression
 * {@code a*d - b*c} does not match how {@code abcd} assigns the counters.
 *
 * <p>Side effect: calls {@code abcd}, which overwrites the counters and
 * appends an entry to {@code nabcd}.
 *
 * @param term term to score
 * @param data document whose category is the positive class
 * @return the GSS coefficient
 */
public double gss(String term, dataset data) {
    abcd(term, data);
    return (b * d) - (a * c);
}

/**
 * Computes {@code ln(dokumen.size() / (b + c))} after refreshing the counters
 * with {@code abcd(term, data)}.
 *
 * <p>Side effect: {@code abcd} overwrites the counters and appends an entry
 * to {@code nabcd}.
 *
 * @param term term to score
 * @param data document passed to {@code abcd}
 * @return natural logarithm of the document count divided by {@code b + c}
 */
public double iqf(String term, dataset data) {
    abcd(term, data);
    return Math.log(dokumen.size() / (b + c));
}

/**
 * Computes {@code ln(b + 1)} after refreshing the counters with
 * {@code abcd(term, data)}.
 *
 * <p>Side effect: {@code abcd} overwrites the counters and appends an entry
 * to {@code nabcd}.
 *
 * @param term term to score
 * @param data document passed to {@code abcd}
 * @return natural logarithm of {@code b + 1}
 */
public double qf(String term, dataset data) {
    abcd(term, data);
    return Math.log(b + 1);
}

/**
 * Computes {@code ln(b + 1) / ln(a + 1)} after refreshing the counters with
 * {@code abcd(term, data)}.
 *
 * <p>Side effect: {@code abcd} overwrites the counters and appends an entry
 * to {@code nabcd}.
 *
 * @param term term to score
 * @param data document passed to {@code abcd}
 * @return the ratio of the two logarithms
 */
public double vrf(String term, dataset data) {
    abcd(term, data);
    double pembilang = Math.log(b + 1);
    double penyebut = Math.log(a + 1);
    return pembilang / penyebut;
}

/**
 * Fraction of the strings in {@code dok} whose lower-cased form contains
 * {@code term}. Only the strings are lower-cased, not {@code term}.
 *
 * @param term substring to look for
 * @param dok  strings to search
 * @return matches divided by {@code dok.size()}
 */
private double dia(String term, ArrayList<String> dok) {
    int cocok = 0;
    for (String teks : dok) {
        if (teks.toLowerCase().contains(term)) {
            cocok++;
        }
    }
    return cocok / (double) dok.size();
}



/**
 * Writes the IDF table to {@code hasil_iidf.csv}: a header row with every
 * term plus "Class", then one row per document with the idf of each
 * (document, term) entry and the document's category.
 *
 * <p>Row {@code i} reads {@code nabcd[i * term.size() .. (i + 1) * term.size())},
 * i.e. it relies on {@code nabcd} holding {@code term.size()} consecutive
 * entries per document in document order.
 *
 * <p>The file is opened once and closed by try-with-resources. The previous
 * version opened a second writer on the same file that was never written to
 * and was not closed when an exception occurred.
 * I/O errors are reported with a stack trace and otherwise ignored, as before.
 */
public void tulisidf() {
    try (FileWriter out = new FileWriter(new File("hasil_iidf.csv"))) {
        out.write("dokumen_data");
        for (int i = 0; i < term.size(); i++) {
            out.write("," + term.get(i));
        }
        out.write("," + "Class");
        out.write("\n");

        // Diagnostic output kept from the original.
        System.out.println(nabcd.size());
        System.out.println(term.size());
        if (!nabcd.isEmpty()) {
            System.out.println(nabcd.get(0).getIdf());
        }

        int kolom = term.size();
        for (int i = 0; i < dokumen.size(); i++) {
            out.write("dokumen_" + i);
            int awal = i * kolom;
            for (int j = awal; j < awal + kolom; j++) {
                out.write("," + nabcd.get(j).getIdf());
            }
            out.write("," + dokumen.get(i).getKategori());
            out.write("\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

/**
 * Writes the TF table to {@code hasil_tf.csv}: a header row with every term
 * plus "Class", then one row per document with the tf of each
 * (document, term) entry and the document's category.
 *
 * <p>Row {@code i} reads {@code nabcd[i * term.size() .. (i + 1) * term.size())},
 * i.e. it relies on {@code nabcd} holding {@code term.size()} consecutive
 * entries per document in document order.
 *
 * <p>The file is opened once and closed by try-with-resources. The previous
 * version opened a second writer on the same file that was never written to
 * and was not closed when an exception occurred.
 * I/O errors are reported with a stack trace and otherwise ignored, as before.
 */
public void tulistf() {
    try (FileWriter out = new FileWriter(new File("hasil_tf.csv"))) {
        out.write("dokumen_data");
        for (int i = 0; i < term.size(); i++) {
            out.write("," + term.get(i));
        }
        out.write("," + "Class");
        out.write("\n");

        // Diagnostic output kept from the original (it prints the idf of the
        // first entry here as well).
        System.out.println(nabcd.size());
        System.out.println(term.size());
        if (!nabcd.isEmpty()) {
            System.out.println(nabcd.get(0).getIdf());
        }

        int kolom = term.size();
        for (int i = 0; i < dokumen.size(); i++) {
            out.write("dokumen_" + i);
            int awal = i * kolom;
            for (int j = awal; j < awal + kolom; j++) {
                out.write("," + nabcd.get(j).getTf());
            }
            out.write("," + dokumen.get(i).getKategori());
            out.write("\n");
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

}

0 个答案:

没有答案