我使用的是 Flink 1.12.0,我有以下完整代码来演示如何将流事件写入 parquet 文件。
当我运行代码时,我观察到两件事:
1. `StudentSource#run` 中的 `println(s"$i is emitted")` 总是只打印 `0 is emitted`,从来没有打印过 `1 is emitted` 或之后任何一个 `i` 对应的输出。
2. 当我查看输出目录时,我看到大约每秒都会新建一个空文件(文件名中包含 `inprogress`)。
我开启了检查点,不知道问题出在哪里,能帮忙看看吗?
package org.example.official.connector
import org.apache.flink.core.fs.Path
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
import org.scalatest.funsuite.AnyFunSuite
import scala.beans.BeanProperty
/**
 * Record type that flows through the demo pipeline.
 *
 * `@BeanProperty` additionally generates the JavaBean-style getters `getId` and `getName`
 * next to the regular Scala accessors.
 *
 * @param id   numeric identifier of the student
 * @param name display name of the student
 */
case class Student(
    @BeanProperty id: Int,
    @BeanProperty name: String
)
/**
 * Demo source that emits `Student(0, "name-0")` through `Student(100, "name-100")` and then
 * stays idle until the job cancels it.
 *
 * @param emitInterval pause in milliseconds after each emitted record; a value `<= 0`
 *                     disables the pause
 */
class StudentSource(val emitInterval: Int = 0) extends SourceFunction[Student] {

  // cancel() is invoked from a different thread than the one executing run(), so the flag
  // must be volatile for the write to become visible to the emitting loop.
  @volatile private var isRunning = true

  /**
   * Emits the 101 records, printing a progress line after each one, then blocks until
   * [[cancel]] has been called.
   *
   * @param ctx context used to hand records to the downstream operators
   */
  override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
    // The lazy iterator re-checks the flag before every record, so a cancel request stops
    // emission without waiting for the whole range to be produced.
    (0 to 100).iterator.takeWhile(_ => isRunning).foreach { i =>
      ctx.collect(Student(i, s"name-$i"))
      println(s"$i is emitted")
      if (emitInterval > 0) {
        Thread.sleep(emitInterval)
      }
    }
    // Keep the source running after the last record, but only until it is cancelled.
    // NOTE(review): presumably kept alive so checkpoints keep firing — confirm.
    while (isRunning) {
      Thread.sleep(100)
    }
  }

  /** Requests termination: makes the loops in [[run]] exit at their next flag check. */
  override def cancel(): Unit = {
    isRunning = false
  }
}
/**
 * Demo suite that streams [[Student]] records from [[StudentSource]] into Parquet files
 * through a bulk-format [[StreamingFileSink]], with checkpointing enabled.
 */
class StreamParquetAvroFileSink_001 extends AnyFunSuite {

  /**
   * Configures checkpointing on `env`: exactly-once mode with a 5 second interval,
   * externalized checkpoints retained on cancellation, automatic restarts disabled, and a
   * filesystem state backend rooted at a local directory.
   *
   * @param env the execution environment to configure
   */
  def enableCheckpoint(env: StreamExecutionEnvironment) = {
    import org.apache.flink.api.common.restartstrategy.RestartStrategies

    // enableCheckpointing already sets the interval, so no separate setCheckpointInterval
    // call is needed.
    env.enableCheckpointing(5 * 1000, CheckpointingMode.EXACTLY_ONCE)
    env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    // With checkpointing enabled and no explicit strategy, Flink falls back to fixed-delay
    // restarts with an effectively unlimited number of attempts. A failing task then only
    // shows up as the job silently starting over (the source emitting from 0 again, fresh
    // in-progress part files). Disabling restarts makes the first failure abort
    // env.execute() with the underlying exception instead.
    env.setRestartStrategy(RestartStrategies.noRestart())
    val path = "file:///D:/flink-checkpoints/"
    println(s"checkpoint path is ${path}")
    val backend = new FsStateBackend(path)
    env.setStateBackend(backend)
  }

  // The test suite
  test("StreamParquetAvroFileSink_001") {
    val path = new Path("file:///D:/data01/logs/aa")
    // Bulk-encoded Parquet output; the Avro schema is derived from Student by reflection.
    val sink: StreamingFileSink[Student] = StreamingFileSink
      .forBulkFormat(path, ParquetAvroWriters.forReflectRecord(classOf[Student]))
      .build()
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(3)
    enableCheckpoint(env)
    // One record per second.
    val ds = env.addSource(new StudentSource(emitInterval = 1000))
    ds.addSink(sink)
    env.execute()
  }
}