文件中单词的n-gram相似度

时间:2014-10-27 09:38:07

标签: java similarity

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

package sim;
import java.io.*;
import java.util.Arrays;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import static jdk.nashorn.internal.objects.NativeMath.max;

/**
 *
 * @author admin
 */
public class Sim {
    public String[][] bigramizedWords = new String[500][100];
    public String[] words = new String[500];
    public File file1 = new File("file1.txt");
    public File file2 = new File("file2.txt");
    public int tracker = 0;
    public double matches = 0;
    public double denominator = 0; //This will hold the sum of the bigrams of the 2 words
    public double res;
    public double results;

    public Scanner a;
    public PrintWriter pw1;
    public Sim(){
        intialize();
       // bigramize();
        results = max(res);
        System.out.println("\n\nThe Bigram Similarity value between " + words[0] + " and " + words[1] + " is " + res + ".");
        pw1.close();

    }


    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) {
        Sim si=new Sim();
        // TODO code application logic here
    }
    public void intialize() {
        int j[]=new int[35];
        try {

            File file1=new File("input.txt");
            File file2=new File("out.txt");
            Scanner a = new Scanner(file1);
            PrintWriter pw1= new PrintWriter(file2);
            int i=0,count = 0;

            while (a.hasNext()) {

                java.lang.String gram = a.next();
                if(gram.startsWith("question")|| gram.endsWith("?"))
                {
                    count=0;
                    count-=1;
                }

                if(gram.startsWith("[")||gram.startsWith("answer")||gram.endsWith(" ") )
                {
                    //pw1.println(count);
                    j[i++]=count;

                    count=0;
                    //pw1.println(gram);
                    //System.out.println(count);

                }
                else
                {
                    // System.out.println(count);
                    count+=1;
//System.out.println(count + " " + gram);

                }
       int line=gram.length();
    int sa_length;
                        //int[] j = null;
            int refans_length=j[1];
            //System.out.println(refans_length);
            for(int k=2;k<=35;k++)
               // System.out.println(j[k]);
            //System.out.println(refans_length);
            for(int m=2;m<=33;m++)

            {
                sa_length=j[2];
                //System.out.println(sa_length);

                for(int s=0;s<=refans_length;s++)
                {
                    for(int l=0;l<=sa_length;l++)
                    {
                        for (int x = 0; x <= line - 2; x++) {
                            int tracker = 0;
                    bigramizedWords[tracker][x] = gram.substring(x, x + 2);
                    System.out.println(gram.substring(x, x + 2) + "");
                    //bigramize(); 
                }
              // bigramize(); 

            }
        }

    }
            bigramize();
         words[tracker] = gram;
                tracker++;

            }
            //pw1.close();

    }


    catch (FileNotFoundException ex) {
            Logger.getLogger(Sim.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
    public void bigramize() {
        //for(int p=0;p<=sa_length;p++)
        denominator = (words[0].length() - 1) + (words[1].length() - 1);
        for (int k = 0; k < bigramizedWords[0].length; k++) {
            if (bigramizedWords[0][k] != null) {
                for (int i = 0; i < bigramizedWords[1].length; i++) {
                    if (bigramizedWords[1][i] != null) {
                        if (bigramizedWords[0][k].equals(bigramizedWords[1][i])) {
                            matches++;
                        }
                    }
                }
            }
        }
        matches *= 2;
        res = matches / denominator;
    }


}

我已经尝试过上面的代码来对文件中的单词进行粗分化&#34; input.txt&#34;我得到了bigram的结果,但我没有得到相似值。 例如: 输入文件包含为

answer:
high
risk
simulate
behaviour
solution
set
rules
[2]
rules
outline
high
source
knowledge
[1]
set
rules
simulate
behaviour

在上面的示例中,我必须将答案中的单词与[2]下的每个单词进行比较为{high,rules} {high,outline} {high,high} {high,source} {high,knowledge} and I必须存储上述比较的最大值,然后再从答案中取出第二个单词,然后进行类似的处理。最后,采用每次迭代的最大值的平均值。

0 个答案:

没有答案