使用 StreamingFileSink 将流数据写入 Parquet 文件时的奇怪行为

时间:2021-06-26 07:32:52

标签: apache-flink

我使用的是 Flink 1.12.0,我有以下完整代码来演示如何将流事件写入 parquet 文件。

当我运行代码时,我观察到两件事:

  1. println(s"$i is emitted") 在 StudentSource#run 中总是打印 0 is emitted,从未打印 1 is emitted 或 i 的其他值。
  2. 当我查看输出目录时,我看到大约每秒都会创建一个新的空文件(文件名中包含 inprogress)。

我开启了检查点,不知道问题出在哪里,能帮忙看看吗?

package org.example.official.connector

import org.apache.flink.core.fs.Path
import org.apache.flink.formats.parquet.avro.ParquetAvroWriters
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.functions.sink.filesystem.StreamingFileSink
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
import org.scalatest.funsuite.AnyFunSuite

import scala.beans.BeanProperty

/**
 * Record type that flows through the demo pipeline.
 *
 * Both fields carry `@BeanProperty`, so in addition to the usual case-class
 * accessors (`id`, `name`) the compiler generates the Java-bean style getters
 * `getId` and `getName`.
 *
 * @param id   numeric identifier of the student
 * @param name display name of the student
 */
case class Student(@BeanProperty id: Int, @BeanProperty name: String)

/**
 * Demo source that emits `Student(i, s"name-$i")` for `i` from 0 to 100 and then idles.
 *
 * After the last record has been emitted the source deliberately does not return from
 * `run`; it keeps sleeping in 100 ms steps until `cancel()` is called, so the source
 * stays alive for the rest of the job.
 *
 * @param emitInterval pause in milliseconds after each emitted record;
 *                     a value of 0 or less disables the pause
 */
class StudentSource(val emitInterval: Int = 0) extends SourceFunction[Student] {
  // Cleared by cancel(). Marked @volatile because cancel() is invoked from a different
  // thread than the one executing run() (see the SourceFunction contract).
  @volatile private var running = true

  /**
   * Emits the 101 records, then blocks until the source is cancelled.
   *
   * @param ctx context used to hand records to the downstream operators
   */
  override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
    val ids = (0 to 100).iterator
    // Stop emitting as soon as cancel() has been requested.
    while (running && ids.hasNext) {
      val i = ids.next()
      ctx.collect(Student(i, s"name-$i"))
      // The question reports that only "0 is emitted" is ever printed here.
      println(s"$i is emitted")
      if (emitInterval > 0) {
        Thread.sleep(emitInterval)
      }
    }

    // Keep the source running until it is cancelled.
    while (running) {
      Thread.sleep(100)
    }
  }

  /** Asks `run` to leave its loops; previously this was a no-op and `run` could never exit. */
  override def cancel(): Unit = {
    running = false
  }
}

/**
 * Demo suite that streams `Student` records from `StudentSource` into Parquet files
 * through a bulk-format `StreamingFileSink`, with checkpointing switched on.
 */
class StreamParquetAvroFileSink_001 extends AnyFunSuite {

  /**
   * Turns on exactly-once checkpointing every 5 seconds, retains externalized
   * checkpoints on cancellation, and stores state in a file-system backend under
   * `file:///D:/flink-checkpoints/`.
   *
   * @param env the environment to configure
   */
  def enableCheckpoint(env: StreamExecutionEnvironment) = {
    val intervalMs: Long = 5 * 1000
    env.enableCheckpointing(intervalMs, CheckpointingMode.EXACTLY_ONCE)

    val checkpointConfig = env.getCheckpointConfig
    checkpointConfig.setCheckpointInterval(intervalMs)
    checkpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)

    val checkpointUri = "file:///D:/flink-checkpoints/"
    println(s"checkpoint path is ${checkpointUri}")
    env.setStateBackend(new FsStateBackend(checkpointUri))
  }

  // The test suite: builds the source -> Parquet sink pipeline and runs it.
  // env.execute() blocks for as long as the job is running.
  test("StreamParquetAvroFileSink_001") {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(3)
    enableCheckpoint(env)

    // Parquet writer derived by Avro reflection from the Student class.
    val outputDir = new Path("file:///D:/data01/logs/aa")
    val parquetSink: StreamingFileSink[Student] = StreamingFileSink
      .forBulkFormat(outputDir, ParquetAvroWriters.forReflectRecord(classOf[Student]))
      .build()

    // One record per second from the source, written straight to the sink.
    env.addSource(new StudentSource(emitInterval = 1000)).addSink(parquetSink)
    env.execute()
  }
}

0 个答案:

没有答案