I'm working on a project that monitors a microservices-based system. The mock microservices I created generate data and upload it to Amazon Kinesis, and I'm currently using the sample code from Amazon below to produce to and consume from Kinesis. What I don't understand is how to add more processors (workers) that will operate on the same list of records (possibly concurrently) — that is, I'm trying to figure out where and how to plug my own code into the Amazon code I've included below.
My program will have two processors: one that saves each record to a database, and one that updates a GUI.
I would really appreciate some guidance, since this is my first industry project and I'm also new to AWS (though I've read a lot about it). Thanks!
/*
 * Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * Licensed under the Amazon Software License (the "License").
 * You may not use this file except in compliance with the License.
 * A copy of the License is located at
 *
 * http://aws.amazon.com/asl/
 *
 * or in the "license" file accompanying this file. This file is distributed
 * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */
package com.amazonaws.services.kinesis.producer.sample;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.amazonaws.auth.DefaultAWSCredentialsProviderChain;
import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor;
import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer;
import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorFactory;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.KinesisClientLibConfiguration;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.Worker;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason;
import com.amazonaws.services.kinesis.model.Record;

/**
 * If you haven't looked at {@link SampleProducer}, do so first.
 *
 * <p>
 * As mentioned in SampleProducer, we will check that all records are received
 * correctly by the KCL by verifying that there are no gaps in the sequence
 * numbers.
 *
 * <p>
 * As the consumer runs, it will periodically log a message indicating the
 * number of gaps it found in the sequence numbers. A gap is when the difference
 * between two consecutive elements in the sorted list of seen sequence numbers
 * is greater than 1.
 *
 * <p>
 * Over time the number of gaps should converge to 0. You should also observe
 * that the range of sequence numbers seen is equal to the number of records put
 * by the SampleProducer.
 *
 * <p>
 * If the stream contains data from multiple runs of SampleProducer, you should
 * observe the SampleConsumer detecting this and resetting state to only count
 * the latest run.
 *
 * <p>
 * Note if you kill the SampleConsumer halfway and run it again, the number of
 * gaps may never converge to 0. This is because checkpoints may have been made
 * such that some records from the producer's latest run are not processed
 * again. If you observe this, simply run the producer to completion again
 * without terminating the consumer.
 *
 * <p>
 * The consumer continues running until manually terminated, even if there are
 * no more records to consume.
 *
 * @see SampleProducer
 * @author chaodeng
 *
 */
public class SampleConsumer implements IRecordProcessorFactory {
    private static final Logger log = LoggerFactory.getLogger(SampleConsumer.class);

    // All records from a run of the producer have the same timestamp in their
    // partition keys. Since this value increases for each run, we can use it
    // to determine which run is the latest and disregard data from earlier runs.
    private final AtomicLong largestTimestamp = new AtomicLong(0);

    // List of record sequence numbers we have seen so far.
    private final List<Long> sequenceNumbers = new ArrayList<>();

    // A mutex for largestTimestamp and sequenceNumbers. largestTimestamp is
    // nevertheless an AtomicLong because we cannot capture non-final variables
    // in the child class.
    private final Object lock = new Object();

    /**
     * One instance of RecordProcessor is created for every shard in the stream.
     * All instances of RecordProcessor share state by capturing variables from
     * the enclosing SampleConsumer instance. This is a simple way to combine
     * the data from multiple shards.
     */
    private class RecordProcessor implements IRecordProcessor {
        @Override
        public void initialize(String shardId) {}

        @Override
        public void processRecords(List<Record> records, IRecordProcessorCheckpointer checkpointer) {
            long timestamp = 0;
            List<Long> seqNos = new ArrayList<>();

            for (Record r : records) {
                // Get the timestamp of this run from the partition key.
                timestamp = Math.max(timestamp, Long.parseLong(r.getPartitionKey()));

                // Extract the sequence number. It's encoded as a decimal
                // string and placed at the beginning of the record data,
                // followed by a space. The rest of the record data is padding
                // that we will simply discard.
                try {
                    byte[] b = new byte[r.getData().remaining()];
                    r.getData().get(b);
                    seqNos.add(Long.parseLong(new String(b, "UTF-8").split(" ")[0]));
                } catch (Exception e) {
                    log.error("Error parsing record", e);
                    System.exit(1);
                }
            }

            synchronized (lock) {
                if (largestTimestamp.get() < timestamp) {
                    log.info(String.format(
                            "Found new larger timestamp: %d (was %d), clearing state",
                            timestamp, largestTimestamp.get()));
                    largestTimestamp.set(timestamp);
                    sequenceNumbers.clear();
                }

                // Only add to the shared list if our data is from the latest run.
                if (largestTimestamp.get() == timestamp) {
                    sequenceNumbers.addAll(seqNos);
                    Collections.sort(sequenceNumbers);
                }
            }

            try {
                checkpointer.checkpoint();
            } catch (Exception e) {
                log.error("Error while trying to checkpoint during ProcessRecords", e);
            }
        }

        @Override
        public void shutdown(IRecordProcessorCheckpointer checkpointer, ShutdownReason reason) {
            log.info("Shutting down, reason: " + reason);
            try {
                checkpointer.checkpoint();
            } catch (Exception e) {
                log.error("Error while trying to checkpoint during Shutdown", e);
            }
        }
    }

    /**
     * Log a message indicating the current state.
     */
    public void logResults() {
        synchronized (lock) {
            if (largestTimestamp.get() == 0) {
                return;
            }

            if (sequenceNumbers.size() == 0) {
                log.info("No sequence numbers found for current run.");
                return;
            }

            // The producer assigns sequence numbers starting from 1, so we
            // start counting from one before that, i.e. 0.
            long last = 0;
            long gaps = 0;
            for (long sn : sequenceNumbers) {
                if (sn - last > 1) {
                    gaps++;
                }
                last = sn;
            }

            log.info(String.format(
                    "Found %d gaps in the sequence numbers. Lowest seen so far is %d, highest is %d",
                    gaps, sequenceNumbers.get(0), sequenceNumbers.get(sequenceNumbers.size() - 1)));
        }
    }

    @Override
    public IRecordProcessor createProcessor() {
        return this.new RecordProcessor();
    }

    public static void main(String[] args) {
        KinesisClientLibConfiguration config =
                new KinesisClientLibConfiguration(
                        "KinesisProducerLibSampleConsumer",
                        SampleProducer.STREAM_NAME,
                        new DefaultAWSCredentialsProviderChain(),
                        "KinesisProducerLibSampleConsumer")
                .withRegionName(SampleProducer.REGION)
                .withInitialPositionInStream(InitialPositionInStream.TRIM_HORIZON);

        final SampleConsumer consumer = new SampleConsumer();

        Executors.newScheduledThreadPool(1).scheduleAtFixedRate(new Runnable() {
            @Override
            public void run() {
                consumer.logResults();
            }
        }, 10, 1, TimeUnit.SECONDS);

        new Worker.Builder()
                .recordProcessorFactory(consumer)
                .config(config)
                .build()
                .run();
    }
}
Answer:
Your question is very broad, but here are some suggestions on Kinesis consumers that I hope are relevant to your use case.
Every Kinesis stream is split into one or more shards. Each shard comes with limits: for example, you cannot write more than 1 MiB of data per second into a single shard, and you cannot issue more than 5 GetRecords requests per second against a single shard (GetRecords is what feeds your consumer's processRecords calls under the hood). (See the full list of limits here.) If you are working with data volumes close to or above those limits, you need to increase the number of shards in your stream.
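For example, if your producers write an aggregate of 3 MiB per second, you need at least 3 shards. As a minimal sketch (not part of the sample above), resharding can be done with the AWS SDK for Java v1, the same SDK family the sample uses; the stream name and target shard count below are placeholders:

import com.amazonaws.services.kinesis.AmazonKinesis;
import com.amazonaws.services.kinesis.AmazonKinesisClientBuilder;
import com.amazonaws.services.kinesis.model.ScalingType;
import com.amazonaws.services.kinesis.model.UpdateShardCountRequest;

public class ReshardSketch {
    public static void main(String[] args) {
        // Uses the default credentials/region provider chain, like the sample.
        AmazonKinesis kinesis = AmazonKinesisClientBuilder.defaultClient();

        // Scale "your-stream" (placeholder) to 4 shards, split uniformly.
        kinesis.updateShardCount(new UpdateShardCountRequest()
                .withStreamName("your-stream")
                .withTargetShardCount(4)
                .withScalingType(ScalingType.UNIFORM_SCALING));
    }
}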
When you have just one consumer application with one worker, that worker is responsible for processing all shards of the stream. When there are multiple workers, each is responsible for some subset of the shards, so that every shard is assigned to one and only one worker (if you look at the consumer logs, you will see this referred to as "leases" on shards).
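To make the lease split concrete, here is a hedged sketch reusing the classes from the sample above; the application name, stream name, and worker ids are placeholders. Because both workers register under the same application name, the KCL divides the shard leases between them, so each shard's records go to exactly one of the two:

KinesisClientLibConfiguration workerAConfig = new KinesisClientLibConfiguration(
        "SharedConsumerApp", "your-stream",
        new DefaultAWSCredentialsProviderChain(), "worker-a")
        .withRegionName(SampleProducer.REGION);
KinesisClientLibConfiguration workerBConfig = new KinesisClientLibConfiguration(
        "SharedConsumerApp", "your-stream",
        new DefaultAWSCredentialsProviderChain(), "worker-b")
        .withRegionName(SampleProducer.REGION);

// Worker implements Runnable. In production each worker would normally run
// on its own instance; two threads in one JVM are used here only to
// illustrate how the shard leases get split between workers.
new Thread(new Worker.Builder()
        .recordProcessorFactory(new SampleConsumer())
        .config(workerAConfig)
        .build()).start();
new Thread(new Worker.Builder()
        .recordProcessorFactory(new SampleConsumer())
        .config(workerBConfig)
        .build()).start();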
By contrast, if you want several processors that each independently receive and process the stream's records, you need to register two separate consumer applications. In the code you quoted above, the application name is the first argument of the KinesisClientLibConfiguration constructor. Note that even though they are separate consumer applications, the total limit of 5 GetRecords per second per shard still applies across both of them.
In other words, you need two separate processes: one will instantiate the consumer that talks to the DB, and the other will instantiate the consumer that updates the GUI:
KinesisClientLibConfiguration databaseSaverKclConfig =
        new KinesisClientLibConfiguration(
                "DatabaseSaverKclApp",
                "your-stream",
                new DefaultAWSCredentialsProviderChain(),
                // I believe worker ids don't need to be unique, but it's a good
                // practice to make them unique so you can easily identify the workers
                "unique-worker-id")
        .withRegionName(SampleProducer.REGION)
        // this only matters the very first time your consumer is launched;
        // subsequent launches will read the checkpoint from the previous runs
        .withInitialPositionInStream(InitialPositionInStream.TRIM_HORIZON);
final IRecordProcessorFactory databaseSaverConsumer = new DatabaseSaverConsumer();

KinesisClientLibConfiguration guiUpdaterKclConfig =
        new KinesisClientLibConfiguration(
                "GuiUpdaterKclApp",
                "your-stream",
                new DefaultAWSCredentialsProviderChain(),
                "unique-worker-id")
        .withRegionName(SampleProducer.REGION)
        .withInitialPositionInStream(InitialPositionInStream.TRIM_HORIZON);
final IRecordProcessorFactory guiUpdaterConsumer = new GuiUpdaterConsumer();
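Each process then builds and runs its own Worker, exactly as in the sample's main method; run() blocks and becomes that process's main loop:

// In the database-saver process:
new Worker.Builder()
        .recordProcessorFactory(databaseSaverConsumer)
        .config(databaseSaverKclConfig)
        .build()
        .run();

// In the GUI-updater process:
new Worker.Builder()
        .recordProcessorFactory(guiUpdaterConsumer)
        .config(guiUpdaterKclConfig)
        .build()
        .run();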
What about the implementations of DatabaseSaverConsumer and GuiUpdaterConsumer? Each of them needs to implement its own custom logic in the processRecords method, and you need to make sure each does the right amount of work inside that method and that the checkpointing logic is sound. Let's unpack these:
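As a rough illustration, here is a minimal sketch of what the database-saving side might look like; saveToDatabase is a hypothetical placeholder for your real persistence code, and the error handling is deliberately simplistic. GuiUpdaterConsumer would follow the same shape with its own processRecords body:

import java.util.List;
import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessor;
import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorCheckpointer;
import com.amazonaws.services.kinesis.clientlibrary.interfaces.IRecordProcessorFactory;
import com.amazonaws.services.kinesis.clientlibrary.lib.worker.ShutdownReason;
import com.amazonaws.services.kinesis.model.Record;

public class DatabaseSaverConsumer implements IRecordProcessorFactory {

    private class DatabaseSaverProcessor implements IRecordProcessor {
        @Override
        public void initialize(String shardId) {}

        @Override
        public void processRecords(List<Record> records,
                                   IRecordProcessorCheckpointer checkpointer) {
            for (Record r : records) {
                // Copy the record payload out of its ByteBuffer.
                byte[] data = new byte[r.getData().remaining()];
                r.getData().get(data);
                saveToDatabase(data); // hypothetical: your real DB write
            }
            try {
                // Checkpoint only after the whole batch is durably stored, so
                // a crash before this point makes the KCL replay the batch.
                checkpointer.checkpoint();
            } catch (Exception e) {
                // log and decide whether to retry or move on
            }
        }

        @Override
        public void shutdown(IRecordProcessorCheckpointer checkpointer,
                             ShutdownReason reason) {
            try {
                checkpointer.checkpoint();
            } catch (Exception e) {
                // log
            }
        }

        private void saveToDatabase(byte[] data) {
            // placeholder for the actual persistence logic
        }
    }

    @Override
    public IRecordProcessor createProcessor() {
        return new DatabaseSaverProcessor();
    }
}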
Speaking of the GUI: what are you using to display the data, and why does a Kinesis consumer need to push updates to it, rather than having the GUI itself query the underlying data store?
In any case, I hope this helps. Let me know if you have other, more specific questions.