I am trying to simulate streaming traffic from Kafka to Storm. I use a KafkaSpout to read messages from a topic that is fed by a producer, which reads tweets from a file and sends them to that topic. My problem is that after the topology has consumed all the tweets sent to the topic, it goes on and reads the messages from the topic a second time. How can I stop the KafkaSpout from reading twice? (The replication factor is set to 1.)
pom
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>7</source>
                <target>7</target>
            </configuration>
        </plugin>
    </plugins>
</build>
<dependencies>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-streams</artifactId>
        <version>2.2.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.2.1</version>
        <scope>compile</scope>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-kafka</artifactId>
        <version>1.2.2</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>com.googlecode.json-simple</groupId>
        <artifactId>json-simple</artifactId>
        <version>1.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.9.2</artifactId>
        <version>0.8.2.2</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.5</version>
    </dependency>
    <!-- BOTOMETER DEPENDENCIES -->
    <dependency>
        <groupId>de.fjobilabs.botometer</groupId>
        <artifactId>botometer-java</artifactId>
        <version>0.1.1</version>
    </dependency>
</dependencies>
Topology
int kafkaSpoutCount = Integer.parseInt(configs.getProperty(Keys.KAFKA_SPOUT_COUNT));
builder.setSpout(configs.getProperty(Keys.KAFKA_SPOUT_ID), kafkaSpout, kafkaSpoutCount);

int stormBoltACount = Integer.parseInt(configs.getProperty(Keys.STORM_BOLTA_PARALLELISM));
builder.setBolt(configs.getProperty(Keys.STORM_BOLTA), new Classifier(hashtagList1, hashtagList2, hashtagList3), stormBoltACount)
        .shuffleGrouping(configs.getProperty(Keys.STORM_BOLTA_INPUT));

int storm2BoltACount = Integer.parseInt(configs.getProperty(Keys.STORM_2_BOLTA_PARALLELISM));
builder.setBolt(configs.getProperty(Keys.STORM_2_BOLTA), new Bot(configs.getProperty(Keys.STORM_2_BOLTA_OUTPUT)), storm2BoltACount)
        .shuffleGrouping(configs.getProperty(Keys.STORM_2_BOLTA_INPUT_SOURCE), configs.getProperty(Keys.STORM_2_BOLTA_INPUT_FIELD));
Classifier
public class Classifier extends BaseRichBolt {

    private OutputCollector collector;
    private Rate rater;
    private int count = 0;
    String[] hashtagY;
    String[] hashtagN;
    String[] hashtagC;
    File file;
    PrintWriter outputwriter;

    public Classifier(String[] one, String[] two, String[] three) {
        super();
        this.hashtagY = one;
        this.hashtagN = two;
        this.hashtagC = three;
    }

    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        rater = new Rate(hashtagY, hashtagN, hashtagC);
        file = new File("/home/marco/Scrivania/Tesi/postvalue.txt");
        try {
            outputwriter = new PrintWriter(file);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    public void execute(Tuple input) {
        String inputTweet = input.getString(0);
        Gson gson = new GsonBuilder().create();
        try {
            TweetJ tweetPojo = gson.fromJson(inputTweet, TweetJ.class);
            int i = rater.evaluate(tweetPojo);
            outputwriter.println(tweetPojo.getText() + "\t" + "VOTO:" + i);
            outputwriter.flush();
            if (true) {
                collector.emit("Pro", new Values(tweetPojo, i));
            } else {
                collector.emit("Con", new Values(tweetPojo, i));
            }
        } catch (Exception e) {
            System.out.println(inputTweet);
            collector.ack(input);
        }
        collector.ack(input);
    }

    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declareStream("Pro", new Fields("tweet", "value"));
        outputFieldsDeclarer.declareStream("Con", new Fields("tweet", "value"));
    }
}
BOT
public class Bot extends BaseRichBolt {

    private final String filename;
    private OutputCollector outputCollector;
    private ConcurrentHashMap<String, Integer> userPostNumber;

    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.outputCollector = outputCollector;
        userPostNumber = new ConcurrentHashMap<String, Integer>();
        try {
            WriterCycle writerCycle = new WriterCycle(filename, userPostNumber);
            Thread thread = new Thread(writerCycle);
            thread.start();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public Bot(String filename) {
        super();
        this.filename = filename;
    }

    @Override
    public void execute(Tuple input) {
        TweetJ tweet = (TweetJ) input.getValue(0);
        String user = tweet.getUser().getScreenName();
        if (userPostNumber.containsKey(user)) {
            int i = userPostNumber.get(user);
            i = i + 1;
            userPostNumber.put(user, i);
        } else {
            userPostNumber.put(user, 1);
        }
        outputCollector.ack(input);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    @Override
    public void cleanup() {
    }
}
To send messages to the Kafka topic I use this class.
Producer
public class Producer implements Runnable {

    String filePath;

    public Producer(String filePath) {
        this.filePath = filePath;
    }

    public void run() {
        File f = new File(filePath);
        BufferedReader bufferedReader = null;
        try {
            bufferedReader = new BufferedReader(new FileReader(f));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }

        String topicName = "tweet";

        // Configure the Producer
        Properties configProperties = new Properties();
        configProperties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        configProperties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.ByteArraySerializer");
        configProperties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");
        org.apache.kafka.clients.producer.Producer producer = new KafkaProducer(configProperties);

        String line = null;
        try {
            line = bufferedReader.readLine();
        } catch (IOException e) {
            e.printStackTrace();
        }
        while (line != null) {
            ProducerRecord<String, String> rec = new ProducerRecord<String, String>(topicName, line);
            producer.send(rec);
            try {
                line = bufferedReader.readLine();
            } catch (IOException e) {
                e.printStackTrace();
            }
            /*
            try {
                Thread.sleep(50);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            */
        }
        producer.close();
    }
}
UPDATE
Creating the KafkaSpout
builder.setSpout("kafka_spout", new KafkaSpout<>(getKafkaSpoutConfig("127.0.0.1:9092")), 1);
protected KafkaSpoutConfig<String, String> getKafkaSpoutConfig(String bootstrapServers) {
return KafkaSpoutConfig.builder(bootstrapServers, "tweet")
.setProp(ConsumerConfig.GROUP_ID_CONFIG, "sample_group")
.setRetry(getRetryService())
.setOffsetCommitPeriodMs(10_000)
.setFirstPollOffsetStrategy(EARLIEST)
.setMaxUncommittedOffsets(1000000)
.build();
}
protected KafkaSpoutRetryService getRetryService() {
return new KafkaSpoutRetryExponentialBackoff(KafkaSpoutRetryExponentialBackoff.TimeInterval.microSeconds(500),
KafkaSpoutRetryExponentialBackoff.TimeInterval.milliSeconds(2), Integer.MAX_VALUE, KafkaSpoutRetryExponentialBackoff.TimeInterval.seconds(10));
}
Answer 0 (score: 0)
The configuration looks fine to me. Maybe the problem is duplicate acking: make sure you only ack each tuple once in execute.
As mentioned in the comments, please consider upgrading to a newer Kafka version and switching to storm-kafka-client.
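For example, in the Classifier above the tuple is acked inside the catch block and then acked again after it. A minimal sketch of an execute method that acks exactly once could look like this (simplified: it reuses your own collector, rater and TweetJ members and drops the file writing, so it is only illustrative):

// Sketch only: same Classifier fields as in the question, but exactly one ack per tuple.
public void execute(Tuple input) {
    String inputTweet = input.getString(0);
    Gson gson = new GsonBuilder().create();
    try {
        TweetJ tweetPojo = gson.fromJson(inputTweet, TweetJ.class);
        int i = rater.evaluate(tweetPojo);
        collector.emit("Pro", new Values(tweetPojo, i));
    } catch (Exception e) {
        System.out.println(inputTweet);
        // no ack here; fall through to the single ack below
        // (call collector.fail(input) and return instead if the tuple should be replayed)
    }
    collector.ack(input); // exactly one ack per tuple
}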
Also, to make your life a bit easier: consider extending BaseBasicBolt instead of BaseRichBolt. With BaseBasicBolt the tuple is acked automatically for you if execute does not throw an error. If you want to fail a tuple, you can throw a FailedException. BaseRichBolt should only be used when you want to do more complex acking, e.g. aggregating tuples from many execute calls in memory before acking them.
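As an illustration only (not your exact rating logic, and reusing your TweetJ class), a Classifier skeleton built on BaseBasicBolt might look roughly like this; the framework acks the tuple automatically when execute returns normally, and throwing FailedException fails it:

import com.google.gson.Gson;
import org.apache.storm.topology.BasicOutputCollector;
import org.apache.storm.topology.FailedException;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseBasicBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

// Illustrative skeleton: same idea as the Classifier in the question, but based on
// BaseBasicBolt so Storm acks each tuple automatically after execute returns.
public class ClassifierBasic extends BaseBasicBolt {

    @Override
    public void execute(Tuple input, BasicOutputCollector collector) {
        String inputTweet = input.getString(0);
        try {
            // your own POJO parsing and rating logic would go here
            TweetJ tweetPojo = new Gson().fromJson(inputTweet, TweetJ.class);
            int rating = 0; // placeholder for rater.evaluate(tweetPojo)
            collector.emit("Pro", new Values(tweetPojo, rating));
        } catch (Exception e) {
            // throwing FailedException marks the tuple as failed so it can be replayed
            throw new FailedException("could not process tweet", e);
        }
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declareStream("Pro", new Fields("tweet", "value"));
        declarer.declareStream("Con", new Fields("tweet", "value"));
    }
}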