I want to write ORC files with Flink's StreamingFileSink, but the files are not written correctly

Asked: 2020-07-10 01:35:38

Tags: apache-flink flink-streaming flink-batch

I am reading data from Kafka and trying to write it to HDFS in ORC format, following the reference linked below from the official Flink documentation. However, Flink writes exactly the same content for every record, produces a large number of files, and every file is 103 KB.

https://ci.apache.org/projects/flink/flink-docs-release-1.11/dev/connectors/streamfile_sink.html#orc-format

Please find my code below.

object BeaconBatchIngest extends StreamingBase {
  val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
  def getTopicConfig(configs: List[Config]): Map[String, String]  = (for (config: Config <- configs) yield (config.getString("sourceTopic"), config.getString("destinationTopic"))).toMap

  def setKafkaConfig():Unit ={
    val kafkaParams = new Properties()
    kafkaParams.setProperty("bootstrap.servers","")
    kafkaParams.setProperty("zookeeper.connect","")
    kafkaParams.setProperty("group.id", DEFAULT_KAFKA_GROUP_ID)
    kafkaParams.setProperty("auto.offset.reset", "latest")
    
    val kafka_consumer:FlinkKafkaConsumer[String] = new FlinkKafkaConsumer[String]("sourceTopics", new SimpleStringSchema(),kafkaParams)
    kafka_consumer.setStartFromLatest()
    val stream: DataStream[DataParse] = env.addSource(kafka_consumer).map(new temp)
    val schema: String = "struct<_col0:string,_col1:bigint,_col2:string,_col3:string,_col4:string>"
    val writerProperties = new Properties()

    writerProperties.setProperty("orc.compress", "ZLIB")
    val writerFactory = new OrcBulkWriterFactory(new PersonVectorizer(schema),writerProperties,new org.apache.hadoop.conf.Configuration);
    val sink: StreamingFileSink[DataParse] = StreamingFileSink
          .forBulkFormat(new Path("hdfs://warehousestore/hive/warehouse/metrics_test.db/upp_raw_prod/hour=1/"), writerFactory)
          .build()
    stream.addSink(sink)
  }


  def main(args: Array[String]): Unit = {
    setKafkaConfig()
    env.enableCheckpointing(5000)
    env.execute("Kafka_Flink_HIVE")
  }
}
class temp extends MapFunction[String,DataParse]{

  override def map(record: String): DataParse = {
    new DataParse(record)
  }
}

class DataParse(data : String){
  val parsedJason = parse(data)
  val timestamp = compact(render(parsedJason \ "timestamp")).replaceAll("\"", "").toLong
  val event = compact(render(parsedJason \ "event")).replaceAll("\"", "")
  val source_id = compact(render(parsedJason \ "source_id")).replaceAll("\"", "")
  val app = compact(render(parsedJason \ "app")).replaceAll("\"", "")
  val json = data
}
class PersonVectorizer(schema: String) extends Vectorizer[DataParse](schema) {

  override def vectorize(element: DataParse, batch: VectorizedRowBatch): Unit = {
    val eventColVector = batch.cols(0).asInstanceOf[BytesColumnVector]
    val timeColVector = batch.cols(1).asInstanceOf[LongColumnVector]
    val sourceIdColVector = batch.cols(2).asInstanceOf[BytesColumnVector]
    val appColVector = batch.cols(3).asInstanceOf[BytesColumnVector]
    val jsonColVector = batch.cols(4).asInstanceOf[BytesColumnVector]
    timeColVector.vector(batch.size + 1) = element.timestamp
    eventColVector.setVal(batch.size + 1, element.event.getBytes(StandardCharsets.UTF_8))
    sourceIdColVector.setVal(batch.size + 1, element.source_id.getBytes(StandardCharsets.UTF_8))
    appColVector.setVal(batch.size + 1, element.app.getBytes(StandardCharsets.UTF_8))
    jsonColVector.setVal(batch.size + 1, element.json.getBytes(StandardCharsets.UTF_8))
  }

}

1 Answer:

Answer (score: 1):

When a bulk format such as ORC is used, the StreamingFileSink rolls over to a new part file on every checkpoint. If you lengthen the checkpoint interval (currently 5 seconds, i.e. checkpoint less frequently), far fewer files will be written.
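For illustration, a minimal sketch of spacing the checkpoints further apart; the 60-second interval and the exactly-once mode are assumed values for the example, not taken from the question:

import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

val env = StreamExecutionEnvironment.getExecutionEnvironment

// Bulk formats can only roll on checkpoints, so the checkpoint interval
// effectively controls how often a new ORC part file is started.
// 60000 ms is an illustrative value, not a recommendation from the answer.
env.enableCheckpointing(60000, CheckpointingMode.EXACTLY_ONCE)

Note that StreamingFileSink.forBulkFormat always uses the OnCheckpointRollingPolicy, so size- or time-based rolling policies are not available for bulk formats; the checkpoint interval is the only knob that controls part-file rolling here.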