我是 Java 新手。有人能帮我写一段代码并解释一下,用来判断两个文本文件相互匹配的程度吗?假设我有两个文件 'a.txt' 和 'b.txt',我需要知道它们匹配的百分比。谢谢。
答案 0 :(得分:2)
将两个文件读入两个Strings str1,str2。
同时遍历两个字符串,统计相同位置上匹配的字符数。用匹配数除以比较的字符总数,再乘以 100,即可得到百分比。
// Read both files into memory. Scanner.next() returns whitespace-delimited tokens,
// so all whitespace is dropped before the two texts are compared.
Scanner sca = new Scanner(new File("a.txt"));
Scanner scb = new Scanner(new File("b.txt"));
StringBuilder sba = new StringBuilder();
StringBuilder sbb = new StringBuilder();
while (sca.hasNext()) {
    sba.append(sca.next());
}
while (scb.hasNext()) {
    sbb.append(scb.next());
}
// Release the file handles as soon as the contents are buffered.
sca.close();
scb.close();
String a = sba.toString();
String b = sbb.toString();
int maxlen = Math.max(a.length(), b.length());
int minlen = Math.min(a.length(), b.length());
int matches = 0; // a local must be initialised before it can be incremented
// Only positions that exist in both strings can match; the surplus characters of
// the longer text still count against the score through the maxlen denominator.
for (int i = 0; i < minlen; i++) {
    if (a.charAt(i) == b.charAt(i)) {
        matches++;
    }
}
// Two empty texts are identical, so report 100% instead of 0/0 = NaN.
return maxlen == 0 ? 100.0 : ((double) matches / (double) maxlen) * 100.0;
答案 1 :(得分:0)
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.StringTokenizer;
/**
 * Plain holder for the metadata of a file, kept so that scoring can be done.
 * All fields start at their neutral value ({@code null} / {@code 0}) and are
 * filled in by the code that creates the instance.
 */
class File_meta_Data
{
    /** Name of the file this record refers to. */
    String FileName = null;

    /** Line number associated with this record. */
    long lineNumber = 0L;

    /** Position within the line. */
    long Position_In_Line = 0L;

    /** Position within the whole document. */
    long Position_In_Document = 0L;

    /** Creates a record with every field at its neutral value. */
    File_meta_Data()
    {
    }
}
/**
 * Console tool that finds, among a set of resource files, the one most similar to
 * an input file.
 *
 * <p>The resource files are indexed word by word (inverted index). Each query file is
 * then scored against every resource file with
 * {@code score = coord * sum(tf * idf^2) * subsetRatio}, where
 * <ul>
 *   <li>{@code coord} = matched query words / total query words,</li>
 *   <li>{@code tf} = sqrt(occurrences of the word in the resource file),</li>
 *   <li>{@code idf} = 1 + ln(numFiles / (docFreq + 1)),</li>
 *   <li>{@code subsetRatio} = matched query words / words in the resource file.</li>
 * </ul>
 */
public class bluestackv1 {

    /** Prefix of the index entries that hold the total word count of a resource file. */
    private static final String WORD_COUNT_KEY_PREFIX = "@words_in_file:";

    /**
     * Single reader for standard input, shared by every prompt. A Scanner reads ahead
     * into its own buffer, so creating a fresh Scanner per prompt can silently drop
     * input that an earlier instance had already buffered.
     */
    private static final Scanner CONSOLE = new Scanner(System.in);

    /**
     * Asks the user how many resource files will be supplied.
     *
     * @return the number entered on standard input
     */
    static int getNumberofInputFiles()
    {
        System.out.println("enter the number of files");
        return CONSOLE.nextInt();
    }

    /**
     * Asks the user for the path of each resource file.
     *
     * @param Number_of_input_files how many paths to read
     * @return the entered paths (as {@code String}s), in input order
     */
    static List getFiles(int Number_of_input_files)
    {
        List<String> filename = new ArrayList<String>();
        for (int i = 0; i < Number_of_input_files; i++) {
            System.out.println("enter the filename");
            filename.add(CONSOLE.next());
        }
        return filename;
    }

    /**
     * Asks the user for the path of the file that has to be matched against the
     * resource files.
     *
     * @return the entered path
     */
    static String getfile()
    {
        System.out.println("enter the name of file to be matched");
        return CONSOLE.next();
    }

    /**
     * Builds the inverted index over all resource files.
     *
     * <p>The returned map has two kinds of entries:
     * <ul>
     *   <li>{@code word -> Map<fileName, List<File_meta_Data>>}: every occurrence of
     *       the word, grouped by the file it occurs in;</li>
     *   <li>{@code "@words_in_file:" + fileName -> Integer}: total number of words in
     *       that file. The entry is only written when the file was read completely.</li>
     * </ul>
     * Words are the space-separated tokens of each line. A file that cannot be read is
     * reported on stderr and otherwise skipped.
     *
     * @param filename paths of the resource files (elements must be {@code String}s)
     * @return the index described above
     */
    @SuppressWarnings("rawtypes")
    static Map MakeIndex(List filename)
    {
        Map<String, Object> index = new HashMap<String, Object>();
        for (int count = 0; count < filename.size(); count++) {
            String name = (String) filename.get(count);
            // try-with-resources closes the reader on success and on failure alike.
            try (BufferedReader reader = new BufferedReader(new FileReader(name))) {
                long lineNumber = 0;
                int wordsInDocument = 0;
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    lineNumber++;
                    // The delimiter is a single space; tabs etc. stay part of a word.
                    StringTokenizer tokens = new StringTokenizer(line, " ");
                    long wordsInLine = 0;
                    while (tokens.hasMoreTokens()) {
                        String word = tokens.nextToken();
                        wordsInDocument++;
                        wordsInLine++;

                        File_meta_Data metadata = new File_meta_Data();
                        metadata.FileName = name;
                        metadata.lineNumber = lineNumber;
                        metadata.Position_In_Document = wordsInDocument;
                        metadata.Position_In_Line = wordsInLine;
                        addOccurrence(index, word, metadata);
                    }
                }
                index.put(WORD_COUNT_KEY_PREFIX + name, wordsInDocument);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return index;
    }

    /** Appends one word occurrence to the index, creating the nested entries on demand. */
    @SuppressWarnings("unchecked")
    private static void addOccurrence(Map<String, Object> index, String word, File_meta_Data metadata)
    {
        Map<String, List<File_meta_Data>> perFile = (Map<String, List<File_meta_Data>>) index.get(word);
        if (perFile == null) {
            perFile = new HashMap<String, List<File_meta_Data>>();
            index.put(word, perFile);
        }
        List<File_meta_Data> occurrences = perFile.get(metadata.FileName);
        if (occurrences == null) {
            occurrences = new ArrayList<File_meta_Data>();
            perFile.put(metadata.FileName, occurrences);
        }
        occurrences.add(metadata);
    }

    /**
     * Prompts for an input file and scores every resource file against it.
     *
     * @param index    index produced by {@link #MakeIndex(List)}
     * @param filename paths of the resource files, as passed to {@code MakeIndex}
     * @return the path of the best-scoring resource file, or the text
     *         {@code "error: Matching does not took place"} when no file scores above zero
     * @throws IOException if the input file cannot be opened or read
     */
    @SuppressWarnings("rawtypes")
    static String search(Map index, List filename) throws IOException
    {
        int fileCount = filename.size();
        double[] overlap = new double[fileCount]; // query words found in each resource file
        double[] sigma = new double[fileCount];   // running sum of tf * idf^2 per resource file
        double total = 0;                         // number of words in the input file

        File File_to_be_matched = new File(getfile());
        try (BufferedReader reader = new BufferedReader(new FileReader(File_to_be_matched))) {
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                StringTokenizer tokens = new StringTokenizer(line, " ");
                while (tokens.hasMoreTokens()) {
                    String word = tokens.nextToken();
                    total++;

                    Object entry = index.get(word);
                    if (!(entry instanceof Map)) {
                        // Unknown word, or one of the reserved word-count entries.
                        continue;
                    }
                    Map perFile = (Map) entry;
                    int docFreq = perFile.size(); // number of resource files containing the word
                    // Floating-point division is essential: with integer division the
                    // ratio is 0 for common words and log(0) poisons the score.
                    double idf = 1 + Math.log((double) fileCount / (1 + docFreq));

                    for (int i = 0; i < fileCount; i++) {
                        Object occurrences = perFile.get(filename.get(i));
                        if (occurrences == null) {
                            continue; // tf = 0, nothing to add for this file
                        }
                        double tf = Math.sqrt(((List) occurrences).size());
                        overlap[i]++;
                        sigma[i] += tf * idf * idf;
                    }
                }
            }
        }

        int bestIndex = -1;
        double bestScore = 0;
        for (int i = 0; i < fileCount; i++) {
            Object wordCount = index.get(WORD_COUNT_KEY_PREFIX + (String) filename.get(i));
            // The count is missing when the resource file could not be indexed.
            int wordsInFile = (wordCount instanceof Integer) ? (Integer) wordCount : 0;
            if (wordsInFile == 0 || total == 0) {
                continue; // an empty or unreadable file (or an empty query) cannot match
            }
            double coord = overlap[i] / total;
            // Resource files with little content beyond the query rank higher.
            double subsetRatio = overlap[i] / wordsInFile;
            double score = coord * sigma[i] * subsetRatio;
            if (score > bestScore) {
                bestScore = score;
                bestIndex = i;
            }
        }

        if (bestIndex != -1) {
            return (String) filename.get(bestIndex);
        }
        return "error: Matching does not took place";
    }

    /**
     * Reads the resource file list, builds the index, then answers queries until
     * standard input is exhausted or an input file cannot be read (both end the
     * program with an exception).
     *
     * @param args unused
     * @throws IOException if an input file cannot be opened or read
     */
    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws IOException
    {
        int Number_of_input_files = getNumberofInputFiles();
        List filename = getFiles(Number_of_input_files);
        Map index = MakeIndex(filename);
        while (true) {
            String Most_similar_file = search(index, filename);
            System.out.println("the most similar file is : " + Most_similar_file);
        }
    }
}
答案 2 :(得分:0)
问题是在多个资源文件中找到与输入文件最相似的文件。这个问题包含两个子问题:第一,如问题所述,如何找到最相似的文件——做法是综合考虑文件内容的不同方面,为每个文件计算一个分数;第二,如何用输入文件的每个单词去匹配相对较大的资源文件。为了解决第二个问题,在 Java 中用 HashMap 实现了倒排索引。由于我们的问题很简单,并且索引不会被修改,我使用的是嵌套的 Map(原文称 Inherited Maps),而不是基于 Comparator 的 MapReduce。搜索的计算复杂度为 $O(\text{RESOURCE\_FILES} \times \text{TOTAL\_WORDS\_IN\_INPUT\_FILE})$。第一个问题通过以下公式解决:$\mathrm{score}(q,d) = \mathrm{coord}(q,d) \cdot \sum_{t \in q} \bigl( \mathrm{tf}(t \text{ in } d) \cdot \mathrm{idf}(t)^2 \bigr) \cdot \mathrm{subsetRatio}$。 1)$\mathrm{coord}(q,d) = \mathrm{overlap} / \mathrm{maxOverlap}$。含义:包含查询中更多词项的文档将获得更高的分数。理由:这是一个基于在指定文档中找到多少个查询词的得分因子。 2)$\mathrm{tf}(t \text{ in } d) = \sqrt{\mathrm{freq}}$,即词项 $t$ 在文档 $d$ 中的词频因子。含义:词项在文档中出现得越频繁,其得分就越高。理由:包含某个词项次数更多的文档通常更相关。 3)$\mathrm{idf}(t) = \log\bigl(\mathrm{numDocs} / (\mathrm{docFreq} + 1)\bigr) + 1$。含义:词项出现在越多的不同文件中,其得分越低。理由:常见词项不如不常见的词项重要。 4)$\mathrm{subsetRatio} = $ 匹配到的单词数 / 该资源文件的总单词数。含义:假设有 2 个文件都是输入文件的超集,那么多余数据较少的文件将具有更高的相似度。理由:内容更接近的文件必须具有更高的优先级。
**************** 测试用例 ****************
1)输入文件中没有与资源文件相同的单词 2)输入文件的内容与其中某一个文件相似 3)输入文件内容相似但元数据不同(即单词的位置不同) 4)输入文件是某个资源文件的子集 5)输入文件只包含非常常见的单词,例如全是 'a' 或 'and' 6)输入文件在指定位置不存在 7)输入文件无法读取
答案 3 :(得分:-1)
打开这两个文件,并按字符读取。实际上你只需要每次从两个文件中各取一个字符,然后检查它们是否匹配。如果匹配,则同时递增总计数器和匹配计数器;如果不匹配,则只递增总计数器。
在此处阅读有关处理文件和流的更多信息:http://docs.oracle.com/javase/tutorial/essential/io/charstreams.html
一个例子是:
// Both readers are declared before the try so the finally block can release
// whichever of them was successfully opened.
BufferedReader br1 = null;
BufferedReader br2 = null;
try
{
    // Decode both files explicitly as UTF-8.
    br1 = new BufferedReader(new InputStreamReader(new FileInputStream("a.txt"), "UTF-8"));
    br2 = new BufferedReader(new InputStreamReader(new FileInputStream("b.txt"), "UTF-8"));
    //add logic here
}
catch (Exception e)
{
    e.printStackTrace();
}
finally
{
    // Close in opening order; a reader that was never opened is still null.
    for (BufferedReader reader : new BufferedReader[] { br1, br2 })
    {
        if (reader == null)
        {
            continue;
        }
        try
        {
            reader.close();
        }
        catch (Exception ignored)
        {
            // Best-effort close: a failure here is deliberately ignored.
        }
    }
}