我正在从Kafka读取数据,并尝试将其以ORC格式写入HDFS文件系统。我参考了Flink官方网站上的以下链接。但是我发现Flink对所有数据写入的内容完全相同,并且生成了许多文件,所有文件的大小都是103KB。
请在下面找到我的代码。
/**
 * Streaming job that consumes string records from Kafka, maps each one to a
 * [[DataParse]] and writes the result as ORC files through a bulk-format
 * `StreamingFileSink`.
 */
object BeaconBatchIngest extends StreamingBase {

  val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment

  /** Builds a lookup of `sourceTopic` -> `destinationTopic` from the given configs. */
  def getTopicConfig(configs: List[Config]): Map[String, String] =
    configs.collect { case config: Config =>
      config.getString("sourceTopic") -> config.getString("destinationTopic")
    }.toMap

  /**
   * Assembles the whole pipeline on `env`: Kafka source -> `temp` mapper ->
   * ORC bulk sink. Nothing runs until `env.execute` is called.
   */
  def setKafkaConfig(): Unit = {
    // Kafka consumer settings.
    val consumerProps = new Properties()
    consumerProps.setProperty("bootstrap.servers", "")
    consumerProps.setProperty("zookeeper.connect", "")
    consumerProps.setProperty("group.id", DEFAULT_KAFKA_GROUP_ID)
    consumerProps.setProperty("auto.offset.reset", "latest")

    val source: FlinkKafkaConsumer[String] =
      new FlinkKafkaConsumer[String]("sourceTopics", new SimpleStringSchema(), consumerProps)
    source.setStartFromLatest()

    val parsed: DataStream[DataParse] = env.addSource(source).map(new temp)

    // ORC writer: column layout plus compression codec.
    val orcSchema: String =
      "struct<_col0:string,_col1:bigint,_col2:string,_col3:string,_col4:string>"
    val orcProps = new Properties()
    orcProps.setProperty("orc.compress", "ZLIB")
    val orcFactory = new OrcBulkWriterFactory(
      new PersonVectorizer(orcSchema),
      orcProps,
      new org.apache.hadoop.conf.Configuration
    )

    // The output directory is hard-coded to the hour=1 partition.
    val orcSink: StreamingFileSink[DataParse] = StreamingFileSink
      .forBulkFormat(
        new Path("hdfs://warehousestore/hive/warehouse/metrics_test.db/upp_raw_prod/hour=1/"),
        orcFactory
      )
      .build()

    parsed.addSink(orcSink)
  }

  /** Entry point: builds the pipeline, enables 5000 ms checkpointing and runs the job. */
  def main(args: Array[String]): Unit = {
    setKafkaConfig()
    env.enableCheckpointing(5000)
    env.execute("Kafka_Flink_HIVE")
  }
}
/** Flink map function that wraps each incoming string record in a [[DataParse]]. */
class temp extends MapFunction[String, DataParse] {
  override def map(record: String): DataParse = new DataParse(record)
}
/**
 * Parses one JSON record and exposes selected top-level fields.
 *
 * Each extracted field is rendered back to compact JSON text and has every
 * double-quote character removed; `timestamp` is additionally converted with
 * `toLong`. The untouched input string is kept in `json`.
 *
 * @param data the raw JSON text of a single record
 */
class DataParse(data: String) {
  val parsedJason = parse(data)

  /** Compact rendering of the top-level field `name`, with all `"` characters stripped. */
  private def unquoted(name: String): String =
    compact(render(parsedJason \ name)).replaceAll("\"", "")

  val timestamp = unquoted("timestamp").toLong
  val event = unquoted("event")
  val source_id = unquoted("source_id")
  val app = unquoted("app")
  val json = data
}
/**
 * Converts [[DataParse]] elements into rows of an ORC `VectorizedRowBatch`.
 *
 * Column order must match the schema string passed in:
 * 0 = event (string), 1 = timestamp (bigint), 2 = source_id (string),
 * 3 = app (string), 4 = raw json (string).
 *
 * @param schema ORC struct schema describing the five columns above
 */
class PersonVectorizer(schema: String) extends Vectorizer[DataParse](schema) {

  /**
   * Appends `element` as the next row of `batch`.
   *
   * @param element the parsed record to write
   * @param batch   the row batch owned by the ORC writer; its `size` is advanced by one
   */
  override def vectorize(element: DataParse, batch: VectorizedRowBatch): Unit = {
    val eventColVector = batch.cols(0).asInstanceOf[BytesColumnVector]
    val timeColVector = batch.cols(1).asInstanceOf[LongColumnVector]
    val sourceIdColVector = batch.cols(2).asInstanceOf[BytesColumnVector]
    val appColVector = batch.cols(3).asInstanceOf[BytesColumnVector]
    val jsonColVector = batch.cols(4).asInstanceOf[BytesColumnVector]

    // Claim the next free row and advance the batch size. The writer relies on
    // `batch.size` to know how many rows are populated; writing to `size + 1`
    // without incrementing would overwrite the same slot on every call and
    // leave the batch reporting zero rows.
    val row = batch.size
    batch.size += 1

    timeColVector.vector(row) = element.timestamp
    eventColVector.setVal(row, element.event.getBytes(StandardCharsets.UTF_8))
    sourceIdColVector.setVal(row, element.source_id.getBytes(StandardCharsets.UTF_8))
    appColVector.setVal(row, element.app.getBytes(StandardCharsets.UTF_8))
    jsonColVector.setVal(row, element.json.getBytes(StandardCharsets.UTF_8))
  }
}
答案 0 :(得分:1)
在使用批量格式(例如ORC)的情况下,
import SwiftUI
/// Form view for creating a new `Bean` record: collects roaster, name, roast
/// style, roast date and an optional photo, then saves a `Bean` into the
/// managed object context and dismisses itself.
struct AddNewBean: View {
// Used below to dismiss this view after saving or cancelling.
@Environment(\.presentationMode) var presentationMode
// Managed object context the new Bean is created in and saved through.
@Environment(\.managedObjectContext) var moc
// NOTE(review): `beans` is not referenced anywhere else in this struct.
@FetchRequest(entity: Bean.entity(), sortDescriptors: []) var beans: FetchedResults<Bean>
// Form field state.
@State var BeanRoaster: String = ""
@State var BeanName: String = ""
@State var BeanStyle: String = "Dark"
@State private var RoastDate = Date()
// Long-style date formatter.
// NOTE(review): not referenced anywhere else in this struct.
var dateFormatter: DateFormatter {
let formatter = DateFormatter()
formatter.dateStyle = .long
return formatter }
// Controls presentation of the photo capture sheet.
@State private var showImagePicker : Bool = false
// Image shown at the top of the form; also bound into PhotoCaptureView.
@State private var image : Image?
// Source UIImage for the JPEG data stored on the Bean.
// NOTE(review): nothing in this struct ever assigns `inputImage`, so unless it
// is set elsewhere it stays nil (see the debug prints in the save action).
@State private var inputImage: UIImage?
// NOTE(review): `imageAlt` is not referenced anywhere else in this struct.
@State var imageAlt: Data = .init(count: 0)
// Options offered by the roast style picker.
let RStyles = ["Dark", "Medium", "Light"]
/// Copies `inputImage` into `image` for display; does nothing when `inputImage` is nil.
/// Passed as the sheet's `onDismiss` handler.
func loadImage() {
guard let inputImage = inputImage else { return }
image = Image(uiImage: inputImage)
}
var body: some View {
NavigationView {
VStack {
Form {
VStack {
// Photo preview; produces no view while `image` is nil.
image?.resizable().scaledToFit().aspectRatio(contentMode: .fit)
HStack {
Spacer()
Button("Open Camera"){
self.showImagePicker = true
}.padding(5)
.foregroundColor(Color.white)
.background(Color.accentColor)
.cornerRadius(10)
Spacer()
}.sheet(isPresented: self.$showImagePicker, onDismiss: loadImage){
// The capture view receives a binding to `image`, not to `inputImage`.
PhotoCaptureView(showImagePicker: self.$showImagePicker, image: self.$image)
}
}
TextField("Röster", text: $BeanRoaster)
TextField("Name der Bohne", text: $BeanName)
Picker("Roestung", selection: $BeanStyle) {
ForEach(RStyles, id: \.self) {
Text($0)
}
}
// `...Date()` limits the selectable range to dates up to the current moment.
DatePicker(selection: $RoastDate, in: ...Date(), displayedComponents: .date) {Text("Röstdatum")}
HStack {
Spacer()
// The save controls appear only once both roaster and name are non-empty.
if BeanRoaster != "" && BeanName != "" {
Button(action: {
//....
// JPEG encoding of `inputImage`; nil whenever `inputImage` is nil.
let pickedImage = self.inputImage?.jpegData(compressionQuality: 1.0)
print("image, inputimage, pickedImage")
print(self.image as Any) // prints: Optional(SwiftUI.Image(provider: SwiftUI.ImageProviderBox<__C.UIImage>))
print(self.inputImage as Any) // prints: nil
print(pickedImage as Any) // prints: nil
//.....
// Populate a new Bean from the form state.
let bean = Bean(context: self.moc)
bean.id = UUID()
bean.roaster = "\(self.BeanRoaster)"
bean.name = "\(self.BeanName)"
bean.roastStyle = "\(self.BeanStyle)"
bean.roastDate = self.RoastDate
bean.active = true
bean.img = pickedImage
// `try?` discards any error thrown by save(); the view is dismissed either way.
try? self.moc.save()
self.presentationMode.wrappedValue.dismiss()
}) {
Image(systemName: "tray")
.foregroundColor(.blue)
.font(.largeTitle)
.padding(.vertical)
}
// This label sits next to the button, outside its label closure.
Text("Save").foregroundColor(.blue)
Spacer()
} else {
HStack {
Spacer()
Text("Trage Bohnendaten ein!")
Spacer()
}
}
}
Section {
HStack {
Spacer()
// Cancel: dismisses the view without saving.
Button(action: {self.presentationMode.wrappedValue.dismiss()}) {
// NOTE(review): the system symbol name is an empty string — confirm which icon was intended.
Image(systemName: "")
.foregroundColor(.red)
.font(.largeTitle)
.padding(.vertical)
}
Text("Dismiss").foregroundColor(.red)
Spacer()
}
}
}.navigationBarTitle("New Bean")
}
}
}
}
StreamingFileSink 会在每个检查点都滚动到新文件中。如果延长检查点间隔(当前为5秒),就不会写入这么多文件。