/*
* To change this license header, choose License Headers in Project Properties.
* To change this template file, choose Tools | Templates
* and open the template in the editor.
*/
package sim;
import java.io.*;
import java.util.Arrays;
import java.util.Scanner;
import java.util.logging.Level;
import java.util.logging.Logger;
import static jdk.nashorn.internal.objects.NativeMath.max;
/**
*
* @author admin
*/
public class Sim {
public String[][] bigramizedWords = new String[500][100];
public String[] words = new String[500];
public File file1 = new File("file1.txt");
public File file2 = new File("file2.txt");
public int tracker = 0;
public double matches = 0;
public double denominator = 0; //This will hold the sum of the bigrams of the 2 words
public double res;
public double results;
public Scanner a;
public PrintWriter pw1;
public Sim(){
intialize();
// bigramize();
results = max(res);
System.out.println("\n\nThe Bigram Similarity value between " + words[0] + " and " + words[1] + " is " + res + ".");
pw1.close();
}
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
Sim si=new Sim();
// TODO code application logic here
}
public void intialize() {
int j[]=new int[35];
try {
File file1=new File("input.txt");
File file2=new File("out.txt");
Scanner a = new Scanner(file1);
PrintWriter pw1= new PrintWriter(file2);
int i=0,count = 0;
while (a.hasNext()) {
java.lang.String gram = a.next();
if(gram.startsWith("question")|| gram.endsWith("?"))
{
count=0;
count-=1;
}
if(gram.startsWith("[")||gram.startsWith("answer")||gram.endsWith(" ") )
{
//pw1.println(count);
j[i++]=count;
count=0;
//pw1.println(gram);
//System.out.println(count);
}
else
{
// System.out.println(count);
count+=1;
//System.out.println(count + " " + gram);
}
int line=gram.length();
int sa_length;
//int[] j = null;
int refans_length=j[1];
//System.out.println(refans_length);
for(int k=2;k<=35;k++)
// System.out.println(j[k]);
//System.out.println(refans_length);
for(int m=2;m<=33;m++)
{
sa_length=j[2];
//System.out.println(sa_length);
for(int s=0;s<=refans_length;s++)
{
for(int l=0;l<=sa_length;l++)
{
for (int x = 0; x <= line - 2; x++) {
int tracker = 0;
bigramizedWords[tracker][x] = gram.substring(x, x + 2);
System.out.println(gram.substring(x, x + 2) + "");
//bigramize();
}
// bigramize();
}
}
}
bigramize();
words[tracker] = gram;
tracker++;
}
//pw1.close();
}
catch (FileNotFoundException ex) {
Logger.getLogger(Sim.class.getName()).log(Level.SEVERE, null, ex);
}
}
public void bigramize() {
//for(int p=0;p<=sa_length;p++)
denominator = (words[0].length() - 1) + (words[1].length() - 1);
for (int k = 0; k < bigramizedWords[0].length; k++) {
if (bigramizedWords[0][k] != null) {
for (int i = 0; i < bigramizedWords[1].length; i++) {
if (bigramizedWords[1][i] != null) {
if (bigramizedWords[0][k].equals(bigramizedWords[1][i])) {
matches++;
}
}
}
}
}
matches *= 2;
res = matches / denominator;
}
}
我已经尝试过上面的代码来对文件中的单词进行粗分化&#34; input.txt&#34;我得到了bigram的结果,但我没有得到相似值。 例如: 输入文件包含为
answer:
high
risk
simulate
behaviour
solution
set
rules
[2]
rules
outline
high
source
knowledge
[1]
set
rules
simulate
behaviour
在上面的示例中,我必须将答案中的单词与[2]下的每个单词进行比较为{high,rules} {high,outline} {high,high} {high,source} {high,knowledge} and I必须存储上述比较的最大值,然后再从答案中取出第二个单词,然后进行类似的处理。最后,采用每次迭代的最大值的平均值。