我正在尝试使用 Apache Spark 的 Java API 处理 Twitter 流。在将 Twitter 流保存到 Elasticsearch 时,我遇到了异常。我猜测问题出在我直接保存了原始推文上。请告诉我可以尝试哪些方法来解决此异常。
以下是代码:
package com.twitter.streaming;
import com.twitter.util.TwitterStreamUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.twitter.TwitterUtils;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import twitter4j.Status;
/**
* Spark Streaming job that reads tweets and writes them to the Elasticsearch
* resource "spark/tweets" once per batch.
*
* NOTE(review): this is the failing version of the program. Each tweet is converted
* with Status.toString(), which yields a "StatusJSONImpl{...}" debug dump (see the
* printed batch output that follows this listing), not a JSON document, yet the
* result is passed to JavaEsSpark.saveJsonToEs.
*
* Created by Manali on 1/28/2017.
*/
public class TwitterStream {
// NOTE(review): declared but never referenced in this class, so no hashtag filter is applied.
private static final String[] filters = {"#football"};
public static void main(String[] args) throws InterruptedException {
// create the spark configuration and spark context
// hadoop.home.dir is set to a local Windows directory before Spark is configured
System.setProperty("hadoop.home.dir", "C:\\winutil\\");
SparkConf conf = new SparkConf().setAppName("SparkTwitterStreamExample").setMaster("local[2]")
.set("spark.serializer", KryoSerializer.class.getName())
.set("es.nodes", "localhost:9200")
.set("es.index.auto.create", "true");
// create a java streaming context with a 15 second batch interval
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(15));
System.out.println("Initializing Twitter stream...");
// create a DStream (sequence of RDD). The object twitterStream is a DStream of tweet statuses:
// - the Status class contains all information of a tweet
// See http://twitter4j.org/javadoc/twitter4j/Status.html
// and fill the keys and tokens in the TwitterStreamUtils class!
JavaDStream<Status> twitterStream = TwitterUtils.createStream(jssc, TwitterStreamUtils.getAuth());
// Convert every Status to its toString() text.
// NOTE(review): this text is not JSON; it is what later gets sent to Elasticsearch.
JavaDStream<String> statuses = twitterStream.map(
new Function<Status, String>() {
public String call(Status status) { return status.toString(); }
}
);
// print a sample of each batch to stdout
statuses.print();
statuses.foreachRDD(tweets->{
// save tweet to Elasticsearch
// NOTE(review): saveJsonToEs is handed the toString() dumps produced above; the
// stack trace accompanying this listing shows the resulting request being rejected
// with "Invalid UTF-8 start byte".
JavaEsSpark.saveJsonToEs(tweets, "spark/tweets");
return null;
});
// start the streaming computation and block until it terminates
jssc.start();
jssc.awaitTermination();
}
}
堆栈追踪:
-------------------------------------------
Time: 1486397175000 ms
-------------------------------------------
StatusJSONImpl{createdAt=Mon Feb 06 10:06:11 CST 2017, id=828635913144016896, text='夢王國超強大的XDDD
托托大愛( ´▽` )ノ
發棉花糖的執事超高超帥wwwww
#夢100 #CWT45', rel="nofollow">Twitter for Android</a>', isTruncated=false, inReplyToStatusId=-1, inReplyToUserId=-1, isFavorited=false, isRetweeted=false, favoriteCount=0, inReplyToScreenName='null', geoLocation=null, place=null, retweetCount=0, isPossiblySensitive=false, lang='ja', contributorsIDs=[], retweetedStatus=null, userMentionEntities=[], urlEntities=[], hashtagEntities=[HashtagEntityJSONImpl{text='夢100'}, HashtagEntityJSONImpl{text='CWT45'}], mediaEntities=[MediaEntityJSONImpl{id=828635824715505665, symbolEntities=[], currentUserRetweetId=-1, user=UserJSONImpl{id=4298859732, name='草加美燕', screenName='mU7oEb6DVbCda4S', location='臺灣 新北市中和', description='17歲的高
17/02/06 10:06:16 INFO BlockGenerator: Pushed block input-0-1486397175800
17/02/06 10:06:16 ERROR TaskContextImpl: Error in TaskCompletionListener
org.elasticsearch.hadoop.rest.EsHadoopInvalidRequest: Invalid UTF-8 start byte 0x89
at [Source: [B@25c68cc; line: 1, column: 3]
at org.elasticsearch.hadoop.rest.RestClient.checkResponse(RestClient.java:478)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:436)
at org.elasticsearch.hadoop.rest.RestClient.execute(RestClient.java:426)
at org.elasticsearch.hadoop.rest.RestClient.bulk(RestClient.java:153)
at org.elasticsearch.hadoop.rest.RestRepository.tryFlush(RestRepository.java:225)
at org.elasticsearch.hadoop.rest.RestRepository.flush(RestRepository.java:248)
at org.elasticsearch.hadoop.rest.RestRepository.close(RestRepository.java:267)
at org.elasticsearch.hadoop.rest.RestService$PartitionWriter.close(RestService.java:130)
at org.elasticsearch.spark.rdd.EsRDDWriter$$anonfun$write$1.apply$mcV$sp(EsRDDWriter.scala:42)
at org.apache.spark.TaskContextImpl$$anon$2.onTaskCompletion(TaskContextImpl.scala:68)
at org.apache.spark.TaskContextImpl$$anonfun$markTaskCompleted$1.apply(TaskContextImpl.scala:79)
at org.apache.spark.TaskContextImpl$$anonfun$markTaskCompleted$1.apply(TaskContextImpl.scala:77)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:77)
at org.apache.spark.scheduler.Task.run(Task.scala:90)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
17/02/06 10:06:16 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 6)
org.apache.spark.util.TaskCompletionListenerException: Invalid UTF-8 start byte 0x89
at [Source: [B@25c68cc; line: 1, column: 3]
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:90)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
17/02/06 10:06:16 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 7, localhost, NODE_LOCAL, 1943 bytes)
17/02/06 10:06:16 WARN TaskSetManager: Lost task 0.0 in stage 3.0 (TID 6, localhost): org.apache.spark.util.TaskCompletionListenerException: Invalid UTF-8 start byte 0x89
at [Source: [B@25c68cc; line: 1, column: 3]
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:90)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
17/02/06 10:06:16 INFO Executor: Running task 1.0 in stage 3.0 (TID 7)
17/02/06 10:06:16 ERROR TaskSetManager: Task 0 in stage 3.0 failed 1 times; aborting job
17/02/06 10:06:16 INFO BlockManager: Found block input-0-1486397172800 locally
17/02/06 10:06:16 INFO TaskSchedulerImpl: Cancelling stage 3
17/02/06 10:06:16 INFO Executor: Executor is trying to kill task 1.0 in stage 3.0 (TID 7)
17/02/06 10:06:16 INFO TaskSchedulerImpl: Stage 3 was cancelled
17/02/06 10:06:16 INFO DAGScheduler: ResultStage 3 (foreachRDD at TwitterStream.java:47) failed in 0.589 s
17/02/06 10:06:16 INFO DAGScheduler: Job 3 failed: foreachRDD at TwitterStream.java:47, took 0.608443 s
17/02/06 10:06:16 INFO JobScheduler: Finished job streaming job 1486397175000 ms.1 from job set of time 1486397175000 ms
17/02/06 10:06:16 INFO JobScheduler: Total delay: 1.086 s for time 1486397175000 ms (execution: 1.001 s)
17/02/06 10:06:16 ERROR JobScheduler: Error running job streaming job 1486397175000 ms.1
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 3.0 failed 1 times, most recent failure: Lost task 0.0 in stage 3.0 (TID 6, localhost): org.apache.spark.util.TaskCompletionListenerException: Invalid UTF-8 start byte 0x89
at [Source: [B@25c68cc; line: 1, column: 3]
at org.apache.spark.TaskContextImpl.markTaskCompleted(TaskContextImpl.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:90)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
答案 0 :(得分:0)
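问题出在推文的序列化上:status.toString() 返回的是调试用的字符串,而不是合法的 JSON,因此 saveJsonToEs 无法处理。我改用 Jackson 的 ObjectMapper 将 Status 对象序列化为 JSON 字符串。以下是使用 Apache Spark 将 Twitter 流保存到 Elasticsearch 的可运行代码。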
package com.twitter.streaming;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.twitter.util.TwitterStreamUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.serializer.KryoSerializer;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.twitter.TwitterUtils;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import twitter4j.Status;
/**
 * Spark Streaming job that reads tweets matching a set of hashtags and indexes each
 * one into Elasticsearch as a JSON document.
 *
 * <p>Every {@code Status} is serialized to a JSON string with Jackson's
 * {@code ObjectMapper} before being passed to {@code JavaEsSpark.saveJsonToEs}.
 * Using {@code Status.toString()} instead produces a non-JSON debug dump that
 * Elasticsearch rejects.
 *
 * <p>Created by Manali on 1/28/2017.
 */
public class TwitterStream {

    /** Track terms passed to the Twitter stream; only matching tweets are received. */
    private static final String[] filters = {"#trumph", "#happy"};

    /**
     * Configures Spark and Elasticsearch, starts the filtered Twitter stream and
     * blocks until the streaming context terminates.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while awaiting termination
     */
    public static void main(String[] args) throws InterruptedException {
        // Point Hadoop at the local winutils directory before Spark is configured.
        System.setProperty("hadoop.home.dir", "C:\\winutil\\");

        // Spark configuration, including the elasticsearch-hadoop connection settings.
        SparkConf conf = new SparkConf()
                .setAppName("SparkTwitterStreamExample")
                .setMaster("local[2]")
                .set("spark.serializer", KryoSerializer.class.getName())
                .set("es.nodes", "localhost:9200")
                .set("es.index.auto.create", "true");

        // Streaming context with a 15 second batch interval.
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(15));
        System.out.println("Initializing Twitter stream...");

        // DStream of tweet statuses (see http://twitter4j.org/javadoc/twitter4j/Status.html).
        // The keys and tokens must be filled in inside TwitterStreamUtils.
        // The filters are passed here so that the declared hashtags are actually applied;
        // without them the stream is unfiltered.
        JavaDStream<Status> twitterStream =
                TwitterUtils.createStream(jssc, TwitterStreamUtils.getAuth(), filters);

        // Jackson mapper used to turn each Status into a JSON string.
        ObjectMapper mapper = new ObjectMapper();

        // Serialize every tweet to JSON and write each batch to Elasticsearch.
        twitterStream
                .map(t -> mapper.writeValueAsString(t))
                .foreachRDD(tweets -> {
                    JavaEsSpark.saveJsonToEs(tweets, "spark/tweets");
                    return null;
                });

        // Start the streaming computation and block until it terminates.
        jssc.start();
        jssc.awaitTermination();
    }
}