我必须编写一个代码来识别推文的语言并打印出某种语言的推文。我已经编写了语言识别部分,但无法仅打印必要的行。
以下是代码:
import java.io.*;
import java.util.*;
import weka.classifiers.bayes.NaiveBayes;
import weka.classifiers.functions.SMO;
import weka.classifiers.trees.RandomForest;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
public class Lang_Detect
{
public static weka.classifiers.Classifier c;
public static HashMap<String,String> trigram=new HashMap<String,String>();
public static void initiate() throws Exception
{
c = loadModel("C:\\Users\\DIV\\ff\\Maithili\\nb.model"); // loads nb model
}
public static NaiveBayes loadModel(String path) throws Exception
{
NaiveBayes classifier;
FileInputStream fis = new FileInputStream(path);
ObjectInputStream ois = new ObjectInputStream(fis);
classifier = (NaiveBayes) ois.readObject();
ois.close();
return classifier;
}
public static void read_trigram()
{
try
{
FileInputStream fis = new FileInputStream("C:\\Users\\DIV\\ff\\Maithili\\Trigram.txt");
BufferedReader br = new BufferedReader(new InputStreamReader(fis,"UTF-8"));
String line;
while((line = br.readLine())!=null)
{
String words[]=line.split(":");
trigram.put(words[0].trim(), "");
}
fis.close();
}catch(IOException f){}
}
public static String feature_vector(String line)
{
String vector="";
String words[]=line.split(" ");
HashMap<String,String> local_word=new HashMap<String,String>();
for(int i=0;i<words.length;i++)
{
char ch[]=words[i].toCharArray();
for(int j=0;j<ch.length-2;j++)
{
local_word.put(ch[j]+""+ch[j+1]+""+ch[j+2], "");
}
}
for (Map.Entry<String, String> entry : trigram.entrySet())
{
if(local_word.containsKey(entry.getKey()))
{
vector+="1,";
}
else
{
vector+="0,";
}
}
return vector;
}
public static String lang_tag(String file) throws Exception
{
String tagged_sentence="";
int l=0,cntr=0;;
//String words[]=sentence.toLowerCase().split(" ");
StringBuffer str=new StringBuffer();
read_trigram();
// TODO Auto-generated method stub
int count=1;
str.append("@relation Language\n");
for (Map.Entry<String, String> entry : trigram.entrySet())
{
str.append("@attribute Trigram"+count+" numeric\n");
count++;
}
str.append("@attribute class {HN,NP,MT}\n");
str.append("@DATA\n");
try
{
FileInputStream fis = new FileInputStream(file);
BufferedReader br = new BufferedReader(new InputStreamReader(fis,"UTF-8"));
String line;
while((line = br.readLine())!=null)
{
str.append(feature_vector(line)+"?\n");
}
fis.close();
}catch(IOException f){}
Global.file_update("C:\\Users\\DIV\\ff\\Maithili\\HN_NP_MT_Unlabelled.arff", str.toString());
Instances unlabeled = new Instances(
new BufferedReader(
new FileReader("HN_NP_MT_Unlabelled.arff")));
// set class attribute
unlabeled.setClassIndex(unlabeled.numAttributes() - 1);
Instances labeled = new Instances(unlabeled);
// label instances
for (int i = 0; i < unlabeled.numInstances(); i++)
{
double clsLabel = c.classifyInstance(unlabeled.instance(i));
String tag="";
if(clsLabel==0.0)
tag="HN";
else if(clsLabel==1.0)
tag="NP";
else if(clsLabel==2.0)
{
tag="MT";
Global.file_append("C:\\Users\\DIV\\ff\\Maithili\\Detected_Maithili_Tweets.txt", tag);
}
System.out.println(tag);
}
return tagged_sentence.trim();
}
public static void main(String[] args) throws Exception
{
initiate();
lang_tag("C:\\Users\\DIV\\ff\\Maithili\\tweets.txt");
}
}
正如你在lang_tag()中看到的,我想打印标记为MT的行,但我无法获取任何特定变量中的行。 有人能帮助我吗?