Kafka Spout reads messages twice in a Storm topology

Time: 2019-05-24 14:32:45

Tags: java apache-kafka stream apache-storm

I'm trying to simulate a stream of tweets flowing from Kafka into Storm. I use a KafkaSpout to read from a topic that a producer fills: the producer reads the tweets from a file and sends them to the topic. My problem is that once the topology has consumed every tweet sent to the topic, it keeps going and reads all the messages from the topic a second time. How can I stop the KafkaSpout from reading everything twice? (The replication factor is set to 1.)

pom

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>7</source>
                <target>7</target>
            </configuration>
        </plugin>
    </plugins>
</build>
<dependencies>

    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka-streams</artifactId>
        <version>2.2.0</version>
    </dependency>

    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-core</artifactId>
        <version>1.2.1</version>
        <scope>compile</scope>
        <exclusions>
            <exclusion>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
            </exclusion>
        </exclusions>
    </dependency>

    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.storm</groupId>
        <artifactId>storm-kafka</artifactId>
        <version>1.2.2</version>
        <scope>compile</scope>
    </dependency>

    <dependency>
        <groupId>com.googlecode.json-simple</groupId>
        <artifactId>json-simple</artifactId>
        <version>1.1</version>
    </dependency>


    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_2.9.2</artifactId>
        <version>0.8.2.2</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.5</version>
    </dependency>

<!-- BOTOMETER DEPENDENCIES -->

    <dependency>
        <groupId>de.fjobilabs.botometer</groupId>
        <artifactId>botometer-java</artifactId>
        <version>0.1.1</version>
    </dependency>


</dependencies>

Topology

int kafkaSpoutCount = Integer.parseInt(configs.getProperty(Keys.KAFKA_SPOUT_COUNT));
builder.setSpout(configs.getProperty(Keys.KAFKA_SPOUT_ID), kafkaSpout, kafkaSpoutCount);

int stormBoltACount = Integer.parseInt(configs.getProperty(Keys.STORM_BOLTA_PARALLELISM));
builder.setBolt(configs.getProperty(Keys.STORM_BOLTA), new Classifier(hashtagList1, hashtagList2, hashtagList3), stormBoltACount)
        .shuffleGrouping(configs.getProperty(Keys.STORM_BOLTA_INPUT));

int storm2BoltACount = Integer.parseInt(configs.getProperty(Keys.STORM_2_BOLTA_PARALLELISM));
builder.setBolt(configs.getProperty(Keys.STORM_2_BOLTA), new Bot(configs.getProperty(Keys.STORM_2_BOLTA_OUTPUT)), storm2BoltACount)
        .shuffleGrouping(configs.getProperty(Keys.STORM_2_BOLTA_INPUT_SOURCE), configs.getProperty(Keys.STORM_2_BOLTA_INPUT_FIELD));

Classifier

public class Classifier extends BaseRichBolt {

    private OutputCollector collector;
    private Rate rater;
    private int count = 0;
    String[] hashtagY;
    String[] hashtagN;
    String[] hashtagC;
    File file;
    PrintWriter outputwriter;

    public Classifier(String[] one, String[] two, String[] three) {
        super();
        this.hashtagY = one;
        this.hashtagN = two;
        this.hashtagC = three;
    }

    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.collector = outputCollector;
        rater = new Rate(hashtagY, hashtagN, hashtagC);
        file = new File("/home/marco/Scrivania/Tesi/postvalue.txt");
        try {
            outputwriter = new PrintWriter(file);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    public void execute(Tuple input) {
        String inputTweet = input.getString(0);
        Gson gson = new GsonBuilder().create();
        try {
            TweetJ tweetPojo = gson.fromJson(inputTweet, TweetJ.class);
            int i = rater.evaluate(tweetPojo);
            outputwriter.println(tweetPojo.getText() + "\t" + "VOTO:" + i);
            outputwriter.flush();
            if (true) {
                collector.emit("Pro", new Values(tweetPojo, i));
            } else {
                collector.emit("Con", new Values(tweetPojo, i));
            }
        } catch (Exception e) {
            System.out.println(inputTweet);
            collector.ack(input);
        }
        collector.ack(input);
    }

    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
        outputFieldsDeclarer.declareStream("Pro", new Fields("tweet", "value"));
        outputFieldsDeclarer.declareStream("Con", new Fields("tweet", "value"));
    }
}

BOT

public class Bot extends BaseRichBolt {

    private final String filename;
    private OutputCollector outputCollector;
    private ConcurrentHashMap<String, Integer> userPostNumber;

    @Override
    public void prepare(Map map, TopologyContext topologyContext, OutputCollector outputCollector) {
        this.outputCollector = outputCollector;
        userPostNumber = new ConcurrentHashMap<String, Integer>();
        try {
            WriterCycle writerCycle = new WriterCycle(filename, userPostNumber);
            Thread thread = new Thread(writerCycle);
            thread.start();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public Bot(String filename) {
        super();
        this.filename = filename;
    }

    @Override
    public void execute(Tuple input) {
        TweetJ tweet = (TweetJ) input.getValue(0);
        String user = tweet.getUser().getScreenName();
        if (userPostNumber.containsKey(user)) {
            int i = userPostNumber.get(user);
            i = i + 1;
            userPostNumber.put(user, i);
        } else {
            userPostNumber.put(user, 1);
        }
        outputCollector.ack(input);
    }

    @Override
    public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
    }

    @Override
    public void cleanup() {
    }
}

To send messages to the Kafka topic I use this class:

Producer

public class Producer implements Runnable {

    String filePath;

    public Producer(String filePath) {
        this.filePath = filePath;
    }

    public void run() {
        File f = new File(filePath);
        BufferedReader bufferedReader = null;
        try {
            bufferedReader = new BufferedReader(new FileReader(f));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        String topicName = "tweet";

        // Configure the Producer
        Properties configProperties = new Properties();
        configProperties.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");
        configProperties.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.ByteArraySerializer");
        configProperties.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer");

        org.apache.kafka.clients.producer.Producer producer = new KafkaProducer(configProperties);
        String line = null;
        try {
            line = bufferedReader.readLine();
        } catch (IOException e) {
            e.printStackTrace();
        }
        while (line != null) {
            ProducerRecord<String, String> rec = new ProducerRecord<String, String>(topicName, line);
            producer.send(rec);
            try {
                line = bufferedReader.readLine();
            } catch (IOException e) {
                e.printStackTrace();
            }
            /*
            try {
                Thread.sleep(50);
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
            */
        }
        producer.close();
    }
}

UPDATE

Creating the KafkaSpout

    builder.setSpout("kafka_spout", new KafkaSpout<>(getKafkaSpoutConfig("127.0.0.1:9092")), 1);
    protected KafkaSpoutConfig<String, String> getKafkaSpoutConfig(String bootstrapServers) {
    return KafkaSpoutConfig.builder(bootstrapServers, "tweet")
            .setProp(ConsumerConfig.GROUP_ID_CONFIG, "sample_group")
            .setRetry(getRetryService())
            .setOffsetCommitPeriodMs(10_000)
            .setFirstPollOffsetStrategy(EARLIEST)
            .setMaxUncommittedOffsets(1000000)
            .build();
}

protected KafkaSpoutRetryService getRetryService() {
    return new KafkaSpoutRetryExponentialBackoff(
            KafkaSpoutRetryExponentialBackoff.TimeInterval.microSeconds(500),
            KafkaSpoutRetryExponentialBackoff.TimeInterval.milliSeconds(2),
            Integer.MAX_VALUE,
            KafkaSpoutRetryExponentialBackoff.TimeInterval.seconds(10));
}

1 answer:

Answer 0 (score: 0)

The configuration looks fine to me.

The duplication may be coming from your own acking: make sure you only ack each tuple once in execute. In Classifier, a tuple that hits the catch block is acked twice, once inside the catch and once again right after the try/catch; double-acking undoes the acker's XOR bookkeeping, so the tuple eventually times out and gets replayed.
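As a minimal sketch (keeping your field names; the always-true if (true) is collapsed to the Pro branch it always takes), execute could ack exactly once on every path:

public void execute(Tuple input) {
    String inputTweet = input.getString(0);
    Gson gson = new GsonBuilder().create();
    try {
        TweetJ tweetPojo = gson.fromJson(inputTweet, TweetJ.class);
        int i = rater.evaluate(tweetPojo);
        outputwriter.println(tweetPojo.getText() + "\t" + "VOTO:" + i);
        outputwriter.flush();
        collector.emit("Pro", new Values(tweetPojo, i));
    } catch (Exception e) {
        // log the tweet that failed to parse, but do NOT ack here
        System.out.println(inputTweet);
    }
    collector.ack(input); // the single ack, reached on both paths
}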

As mentioned in the comments, consider upgrading to a newer Kafka version and switching to storm-kafka-client.
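If you go that route, the swap in the pom might look roughly like this (the versions here are assumptions: pick the storm-kafka-client that matches your storm-core, and a kafka-clients that matches your broker):

<!-- replaces storm-kafka and the old kafka_2.9.2 dependency -->
<dependency>
    <groupId>org.apache.storm</groupId>
    <artifactId>storm-kafka-client</artifactId>
    <version>1.2.2</version>
</dependency>
<dependency>
    <!-- storm-kafka-client expects kafka-clients to be supplied by your project -->
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.2.0</version>
</dependency>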

Also, to make your life a little easier: consider extending BaseBasicBolt instead of BaseRichBolt. With BaseBasicBolt, the tuple is automatically acked for you if execute doesn't throw an error. If you want to fail the tuple, you can throw FailedException. BaseRichBolt should only be used when you need more complex acking, e.g. aggregating tuples from many execute calls in memory before acking them.
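As an illustration only (a sketch reusing your Rate and TweetJ classes), Classifier's core could shrink to:

public class Classifier extends BaseBasicBolt {

    // fields, constructor and declareOutputFields as in your version;
    // note that BaseBasicBolt's prepare(Map, TopologyContext) takes no collector

    @Override
    public void execute(Tuple input, BasicOutputCollector collector) {
        TweetJ tweetPojo = new GsonBuilder().create().fromJson(input.getString(0), TweetJ.class);
        int i = rater.evaluate(tweetPojo);
        collector.emit("Pro", new Values(tweetPojo, i));
        // returning normally acks the tuple automatically;
        // throw new FailedException() if you want it replayed instead
    }
}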