我创建了一个使用twitter4j接收推文的管道。使用Redis push命令推送推文。进一步处理推文,即删除“#”,“@”并查找情绪,使用pop命令弹出推文。
我面临的问题是:虽然推文是实时流式接收的,但有些较长的推文内容不完整(被截断)。例如:
原创推文:“莫迪政府的另一场无声革命 - 街道灯被LED灯泡取代,节省能源消耗,并阻止有毒物质排放。”
显示推文:“RT @ BJP4India:Modi govt的另一场无声革命 - 街道灯被LED灯泡取代,节省能源消耗......”
无法理解其发生的原因。这在情绪分析中成为一个问题,因为它也将“......”视为一个单词。
以下是推送和弹出推文的代码: -
import analytics.twitter.filter.Abbreviations;
import analytics.twitter.filter.TwitterFilter;
import redis.clients.jedis.Jedis;
public class RedisJava {

    // Shared connection to the local Redis instance.
    // NOTE(review): rpush + rpop makes this a LIFO stack; if FIFO queue
    // semantics are intended, pop with lpop instead — confirm with callers.
    Jedis jedis = new Jedis("localhost");

    /**
     * Pushes a raw tweet onto the tail of the Redis "list" key.
     *
     * @param text raw tweet text received from the stream
     */
    public void RedisTweets(String text)
    {
        jedis.rpush("list", text);
    }

    /**
     * Pops one tweet from the Redis "list" key, strips Twitter noise
     * (mentions, hashtags, URLs) and expands abbreviations.
     *
     * @param text legacy argument, no longer used for filtering; kept so the
     *             method signature stays compatible with existing callers
     * @return the cleaned tweet, or null when the list is empty
     */
    public String RedisPop(String text)
    {
        Abbreviations abbrv = new Abbreviations();
        TwitterFilter twitterfilter = new TwitterFilter();
        String pop = jedis.rpop("list");
        if (pop == null)
        {
            return null;
        }
        // BUG FIX: the original filtered the "text" parameter instead of the
        // value actually popped from Redis, so the queued tweet was ignored.
        String filtered = twitterfilter.filter(pop);
        String tweet = abbrv.tweetAbbr(filtered);
        System.out.println("Popped Content: " + tweet);
        return tweet;
    }
}
以下是完成过滤的代码: -
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TwitterFilter {

    // Compile the patterns once instead of on every call.
    // Removes @mentions, #hashtags, and a colon followed by a word or spaces.
    private static final Pattern MENTION_HASHTAG_COLON =
            Pattern.compile("(@|#|:)(\\w+|\\s+)");

    // URL pattern kept from the original implementation.
    private static final Pattern URL = Pattern.compile(
            "((https?|ftp|gopher|telnet|file|Unsure|http):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)",
            Pattern.CASE_INSENSITIVE);

    // Trailing "…" / "..." left by Twitter's 140-char truncation of long
    // retweets; these otherwise leak into sentiment analysis as a "word".
    private static final Pattern ELLIPSIS = Pattern.compile("(…|\\.{2,})");

    /**
     * Strips mentions, hashtags, URLs, truncation ellipses, and a leading
     * "RT" retweet marker from a tweet.
     *
     * @param text the raw tweet text
     * @return the cleaned text, trimmed of surrounding whitespace
     */
    public String filter(String text)
    {
        String filtered = MENTION_HASHTAG_COLON.matcher(text).replaceAll("");
        // BUG FIX: the original looped over m.find() calling
        // filtered.replaceAll(m.group(i), "") with an incrementing group
        // index — group(i) quickly goes out of range, and the matched URL is
        // reinterpreted as a regex, so metacharacters like '?' or '+' in a
        // URL could throw PatternSyntaxException. Matcher.replaceAll removes
        // every match literally and safely.
        filtered = URL.matcher(filtered).replaceAll("");
        filtered = ELLIPSIS.matcher(filtered).replaceAll(" ");
        filtered = filtered.trim();
        // BUG FIX: replaceAll("RT", "") deleted *every* "RT" substring
        // (mangling words that merely contain it); only the leading retweet
        // marker should go.
        filtered = filtered.replaceFirst("^RT\\b\\s*", "");
        return filtered.trim();
    }
}
以下代码找到了情绪: -
import java.util.Properties;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
public class TwitterSentiment {

    // PERF FIX: the original built a brand-new StanfordCoreNLP pipeline for
    // every findSentiment call. Constructing the pipeline loads the POS, NER,
    // parse and sentiment models — a multi-second, memory-heavy operation —
    // so it is cached here and built lazily on first use.
    private StanfordCoreNLP pipeline;

    /**
     * Classifies the sentiment of a line of text using the Stanford CoreNLP
     * sentiment annotator.
     *
     * @param line the text to analyse; must not be null or empty
     * @return a label such as "very negative" .. "very positive", or null
     *         when no sentence yielded a valid sentiment class (0-4)
     * @throws IllegalArgumentException if line is null or empty
     */
    public String findSentiment(String line) {
        if (line == null || line.isEmpty()) {
            throw new IllegalArgumentException("The line must not be null or empty.");
        }
        Annotation annotation = processLine(line);
        int mainSentiment = findMainSentiment(annotation);
        if (mainSentiment < 0 || mainSentiment > 4) {
            return null;
        }
        return toCss(mainSentiment);
    }

    // Maps the RNN's 0-4 class index to a human-readable label.
    private String toCss(int sentiment) {
        switch (sentiment) {
            case 0:
                return "very negative";
            case 1:
                return "negative";
            case 2:
                return "neutral";
            case 3:
                return "positive";
            case 4:
                return "very positive";
            default:
                return "default";
        }
    }

    // Uses the sentiment class of the longest sentence as the overall
    // sentiment of the annotated text.
    private int findMainSentiment(Annotation annotation) {
        int mainSentiment = Integer.MIN_VALUE;
        int longest = Integer.MIN_VALUE;
        for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
            // Debug output of per-token annotations (kept from the original;
            // remove or route to a logger once no longer needed).
            for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
                String word = token.get(CoreAnnotations.TextAnnotation.class);
                String pos = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
                String ne = token.get(CoreAnnotations.NamedEntityTagAnnotation.class);
                String lemma = token.get(CoreAnnotations.LemmaAnnotation.class);
                System.out.println("word: " + word);
                System.out.println("pos: " + pos);
                System.out.println("ne: " + ne);
                System.out.println("Lemmas: " + lemma);
            }
            int sentenceLength = sentence.toString().length();
            if (sentenceLength > longest) {
                Tree tree = sentence.get(SentimentCoreAnnotations.SentimentAnnotatedTree.class);
                mainSentiment = RNNCoreAnnotations.getPredictedClass(tree);
                longest = sentenceLength;
            }
        }
        return mainSentiment;
    }

    // Annotates one line of text with the cached pipeline.
    private Annotation processLine(String line) {
        return getPipeline().process(line);
    }

    // Lazily builds and caches the CoreNLP pipeline; synchronized so
    // concurrent first calls do not construct it twice.
    private synchronized StanfordCoreNLP getPipeline() {
        if (pipeline == null) {
            Properties props = new Properties();
            props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner, parse, sentiment");
            pipeline = new StanfordCoreNLP(props);
        }
        return pipeline;
    }
}