主方法中的文字似乎需要2秒多才能返回NER。我不是NLP的专家,这段代码根本不具备可扩展性。我在2个地方添加了评论,我已经确定了瓶颈。您能否提出改进建议以改善计划的性能。
感谢。
public class NERSentimentUtil
{
private static final Logger logger = Logger.getLogger(NERSentimentUtil.class);
private static final String serializedClassifier7 = "edu/stanford/nlp/models/ner/english.muc.7class.distsim.crf.ser.gz";
private static final String serializedClassifier4 = "edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz";
private static final String serializedClassifier3 = "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz";
private static NERClassifierCombiner ncc;
private static StanfordCoreNLP pipeline;
static
{
try
{
ncc = new NERClassifierCombiner(serializedClassifier3,serializedClassifier4,serializedClassifier7);
} catch (IOException e) {
e.printStackTrace();
logger.error(e);
}
}
static
{
Properties props = new Properties();
props.setProperty("annotators", "tokenize, ssplit, parse, sentiment, sutime");
/*props.setProperty("ner.useSUTime", "0");*/
String defs_sutime = "/edu/stanford/nlp/models/sutime/defs.sutime.txt";
String holiday_sutime = "/edu/stanford/nlp/models/sutime/english.holidays.sutime.txt";
String _sutime = "/edu/stanford/nlp/models/sutime/english.sutime.txt";
String sutimeRules = defs_sutime + "," + holiday_sutime + "," + _sutime;
props.setProperty("ner.useSUTime", "true");
props.setProperty("-sutime.rules", sutimeRules);
props.setProperty("sutime.binders", "0");
props.setProperty("sutime.markTimeRanges", "false");
props.setProperty("sutime.includeRange", "false");
props.setProperty("customAnnotatorClass.sutime", "edu.stanford.nlp.time.TimeAnnotator");
props.setProperty("parse.maxlen", "20");
//props.setProperty("ner.applyNumericClassifiers", "false");
//props.setProperty("nthreads", "16");
//props.setProperty("threads", "16");
//props.setProperty("parse.nthreads","16");
//props.setProperty("ssplit.eolonly","true");
props.setProperty("-parse.model", "edu/stanford/nlp/models/srparser/englishSR.ser.gz");
RedwoodConfiguration.current().clear().apply();
pipeline = new StanfordCoreNLP(props);
//RedwoodConfiguration.empty().capture(System.err).apply();
}
//A sentiment score of 0 or 1 is negative, 2 neutral and 3 or 4 positive.
private static int getScore(int score)
{
if(score<2)
return -1;
else if(score==2)
return 0;
else
return 1;
}
public static HashMap<String,Object> getStanford(String s, long dateString)//"2013-07-14"
{
int finalScore =0;
HashMap<String,Object> map = new HashMap<String,Object>();
HashMap<String, Integer> dateMap = new HashMap<String, Integer>();
HashMap<String, Integer> dateCountMap = new HashMap<String, Integer>();
HashMap<String, String> dateSentenceMap = new HashMap<String, String>();
HashMap<String, Integer> personMap = new HashMap<String, Integer>();
HashMap<String, Integer> personCountMap = new HashMap<String, Integer>();
HashMap<String, Integer> orgMap = new HashMap<String, Integer>();
HashMap<String, Integer> orgCountMap = new HashMap<String, Integer>();
HashMap<String, Integer> locationMap = new HashMap<String, Integer>();
HashMap<String, Integer> locationCountMap = new HashMap<String, Integer>();
HashMap<String, Article_Location> locationArticleMap = new HashMap<String, Article_Location>();
ArrayList<Articel_Ner> organisationlist = new ArrayList<Articel_Ner>();
ArrayList<Articel_Ner> personlist = new ArrayList<Articel_Ner>();
ArrayList<Artilcle_Ner_Date> datelist = new ArrayList<Artilcle_Ner_Date>();
ArrayList<Article_NerLocation> locationList = new ArrayList<Article_NerLocation>();
try
{
Annotation annotation = pipeline.process(s);//1/3 rd time is taken up by this line
List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap sentence : sentences)
{
String str = sentence.toString();
int score = getSentiment(sentence);
finalScore+=score;
boolean dFlag = true;
List<Triple<String,Integer,Integer>> triples = ncc.classifyToCharacterOffsets(str);
for (Triple<String,Integer,Integer> trip : triples)
{
String ne = trip.first();
String word = str.substring(trip.second(), trip.third).toLowerCase();
switch(ne)
{
case "LOCATION":
extractLocation(locationMap, locationCountMap, locationArticleMap, score, word);
break;
case "ORGANIZATION":
extractOrg(orgMap, orgCountMap, score, word);
break;
case "PERSON":
extractPerson(personMap, personCountMap, score, word);
break;
case "DATE":
if(dFlag)
{
extractSUDate(dateString, dateMap, dateCountMap, dateSentenceMap, str, score);
dFlag = false;
}
break;
default:
break;
}
}
}
//2/3rd of the time taken by these 4 methods:: can be obtimized
mapDate(dateMap, dateCountMap, dateSentenceMap, datelist);
mapLocation(locationMap, locationCountMap, locationArticleMap, locationList);
mapOrg(orgMap, orgCountMap, organisationlist);
mapPerson(personMap, personCountMap, personlist);
//
}
catch(Exception e)
{
logger.error(e);
logger.error(s);
e.printStackTrace();
}
if(finalScore>0)
finalScore = 1;
else if(finalScore<0)
finalScore = -1;
else
finalScore = 0;
map.put("ORGANISATION", organisationlist);
map.put("PERSON", personlist);
map.put("DATE", datelist);
map.put("LOCATION", locationList);
map.put("SENTIMENT", finalScore);
return map;
}
private static void extractPerson(HashMap<String, Integer> personMap, HashMap<String, Integer> personCountMap,
int score, String word)
{
if(personMap.get(word)!=null)
{
personMap.put(word, personMap.get(word)+score);
personCountMap.put(word, personCountMap.get(word)+1);
}
else
{
personMap.put(word, score);
personCountMap.put(word, 1);
//personSentenceMap.put(pname, str);
}
}
private static void extractOrg(HashMap<String, Integer> orgMap, HashMap<String, Integer> orgCountMap,
int score, String word)
{
if(orgMap.get(word)!=null)
{
orgMap.put(word, orgMap.get(word)+score);
orgCountMap.put(word, orgCountMap.get(word)+1);
}
else
{
orgMap.put(word, score);
orgCountMap.put(word, 1);
//orgSentenceMap.put(oname, str);
}
}
private static void extractLocation(HashMap<String, Integer> locationMap,
HashMap<String, Integer> locationCountMap,
HashMap<String, Article_Location> locationArticleMap,
int score,
String word)
{
if(locationMap.get(word)!=null)
{
locationMap.put(word, locationMap.get(word)+score);
locationCountMap.put(word, locationCountMap.get(word)+1);
}
else
{
Article_Location articleLocation = LocationUtil.getLocation(word);
locationMap.put(word, score);
locationCountMap.put(word, 1);
locationArticleMap.put(word, articleLocation);
}
}
private static void extractSUDate(long dateString,
HashMap<String, Integer> dateMap,
HashMap<String, Integer> dateCountMap,
HashMap<String, String> dateSentenceMap,
String str,
int score) {
Annotation dateAnnotation = new Annotation(str);
dateAnnotation.set(CoreAnnotations.DocDateAnnotation.class, FormatUtil.getDate(dateString));
pipeline.annotate(dateAnnotation);
for(CoreMap timex:dateAnnotation.get(TimeAnnotations.TimexAnnotations.class))
{
TimeExpression timeExpression = timex.get(TimeExpression.Annotation.class);
if(timeExpression!=null && timeExpression.getTemporal()!=null &&
timeExpression.getTemporal().getTimexValue()!=null)
{
String word = checkDate(timeExpression.getTemporal().getTimexValue());
if(word!=null)
{
if(dateMap.get(word)!=null)
{
dateMap.put(word, dateMap.get(word)+score);
dateCountMap.put(word, dateCountMap.get(word)+1);
dateSentenceMap.put(word, dateSentenceMap.get(word)+" "+str);
}
else
{
dateMap.put(word, score);
dateCountMap.put(word, 1);
dateSentenceMap.put(word, str);
}
}
}
}
}
private static int getSentiment(CoreMap sentence) {
Tree annotatedTree = sentence.get(SentimentAnnotatedTree.class);
int localScore = RNNCoreAnnotations.getPredictedClass(annotatedTree);
int score = getScore(localScore);
return score;
}
private static void mapLocation(HashMap<String, Integer> locationMap,
HashMap<String, Integer> locationCountMap,
HashMap<String, Article_Location> locationArticleMap,
ArrayList<Article_NerLocation> locationList)
{
for(Map.Entry<String, Integer> entry : locationMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Article_Location articleLocation = locationArticleMap.get(key);
Article_NerLocation l1 = new Article_NerLocation();
if(value>=1)
l1.setNerSentiment(1);
else if(value<=-1)
l1.setNerSentiment(-1);
else
l1.setNerSentiment(0);
l1.setKeyword(key);
l1.setCount(locationCountMap.get(key));
if(articleLocation!=null)
{
l1.setNerCountry(articleLocation.getCountryCode());
l1.setNerLatLong(articleLocation.getLatitude()+","+articleLocation.getLongitude());
l1.setTimeZone(articleLocation.getTimeZone());
l1.setCountryName(articleLocation.getCountryName());
}
locationList.add(l1);
}
}
private static void mapDate(HashMap<String, Integer> dateMap,
HashMap<String, Integer> dateCountMap,
HashMap<String, String> dateSentenceMap,
ArrayList<Artilcle_Ner_Date> datelist)
{
for(Map.Entry<String, Integer> entry : dateMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Artilcle_Ner_Date d1 = new Artilcle_Ner_Date();
if(value>=1)
d1.setNerSentiment(1);
else if(value<=-1)
d1.setNerSentiment(-1);
else
d1.setNerSentiment(0);
d1.setKeyword(key);
d1.setCount(dateCountMap.get(key));
d1.setSentence(dateSentenceMap.get(key));
d1.setNerDateTheme1(SummaryThemeUtil.getSTByDate(dateSentenceMap.get(key)));
datelist.add(d1);
}
}
private static void mapOrg(HashMap<String, Integer> orgMap,
HashMap<String, Integer> orgCountMap,
ArrayList<Articel_Ner> organisationlist)
{
for(Map.Entry<String, Integer> entry : orgMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Articel_Ner o1 = new Articel_Ner();
if(value>=1)
o1.setNerSentiment(1);
else if(value<=-1)
o1.setNerSentiment(-1);
else
o1.setNerSentiment(0);
o1.setKeyword(key);
o1.setCount(orgCountMap.get(key));
organisationlist.add(o1);
}
}
private static void mapPerson(HashMap<String, Integer> personMap,
HashMap<String, Integer> personCountMap,
ArrayList<Articel_Ner> personlist)
{
for(Map.Entry<String, Integer> entry : personMap.entrySet())
{
String key = entry.getKey();
Integer value = entry.getValue();
Articel_Ner p1 = new Articel_Ner();
if(value>=1)
p1.setNerSentiment(1);
else if(value<=-1)
p1.setNerSentiment(-1);
else
p1.setNerSentiment(0);
p1.setKeyword(key);
p1.setCount(personCountMap.get(key));
personlist.add(p1);
}
}
private static String checkDate(String date)
{
if(date.length()<10)
return null;
else if(date.length()>10)
date = date.substring(0,10);
if (date.matches("\\d{4}-\\d{2}-\\d{2}"))
return date;
else
return null;
}
public static void main(String args[])
{
String text = "Lets meet on every 2nd week. Night is young. Happy new Year. The festival will be held on the following dates are 18 Feb 1997, the 20th of july and 4 days from today.";
long pre = System.currentTimeMillis();
HashMap<String, Object> map = getStanford(text, 1508745558);
long post = System.currentTimeMillis();
long diff = post-pre;
System.out.println(diff);
System.out.println(map);
}
}
答案 0 :(得分:1)
经过几天黑眼睛的疼痛。这就是问题所在:
斯坦福&#34; 解析&#34;模型 PCFG 或 SRparser 都是CPU杀手。你永远无法扩展。充其量我做了70个文档/秒。这是我能够在tomcat上管理的15个线程。从RabbitMQ消费的文档。使用15 GB RAM的Intel Xeon 8Core VM。 CPU总是90%。
因此,如果您想要 NER ,情绪, sutime 。最好使用单独的库而不是使用stanford 3.对于NER,您可以使用stanford的 NERClassifierCombiner 。对于情绪,您可以使用 weka 。要提取日期,您可以使用 natty 。
现在我们可以 2,000个文档/秒。
答案 1 :(得分:0)
我在你的示例文本上运行了这个命令:
java -Xmx8g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma,ner,parse,sentiment -parse.model edu/stanford/nlp/models/srparser/englishSR.ser.gz -file example-1.txt -outputFormat text -ner.nthreads 4 -parse.nthreads 4
我得到了这个时间信息:
Annotation pipeline timing information:
TokenizerAnnotator: 0.0 sec.
WordsToSentencesAnnotator: 0.0 sec.
POSTaggerAnnotator: 0.0 sec.
MorphaAnnotator: 0.0 sec.
NERCombinerAnnotator: 0.0 sec.
ParserAnnotator: 0.2 sec.
SentimentAnnotator: 0.0 sec.
TOTAL: 0.4 sec. for 39 tokens at 105.1 tokens/sec.
Pipeline setup: 14.8 sec.
Total time for StanfordCoreNLP pipeline: 15.2 sec.
我看到了0.4 sec
的处理时间。
1。)确保每次都不重建管道。您的代码似乎没有在main()方法
中重建管道 2.。)我的命令对ner
和parse
使用多线程。请注意,我也使用shift-reduce解析器,它比默认解析器快得多。
3.)通过将它们分配给用于构建管道的Properties对象,可以在Java API代码中设置所有管道设置。以下是使用Java API的详尽文档(您必须将其转换为Scala):
Java API:https://stanfordnlp.github.io/CoreNLP/api.html
命令行:https://stanfordnlp.github.io/CoreNLP/cmdline.html
4。)您不需要构建单独的NERClassifierCombiner,您可以使用ner
注释器,它也将运行SUTime。
5.)我应该注意时间将由解析器控制。您可以选择不使用-parse.maxlen N
解析真正长的句子,并将N设置为您喜欢的任何令牌长度。
6。)如果要获取完整实体提及的字符偏移量,请确保将entitymentions
注释器添加到注释器列表的末尾。每个句子都有一个实体提及列表。提到的每个实体都是CoreMap。您可以使用以下代码访问实体提及的开始和结束字符偏移量:
List<CoreMap> mentions = sentence.get(CoreAnnotations.MentionsAnnotation.class);
for (CoreMap mention : mentions) {
int docCharBegin = mention.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
int docCharEnd = mention.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
}
如果您对将其转换为Scala代码有任何疑问,请与我们联系。