Apache Beam获取kafka数据执行SQL错误:没有架构时无法调用getSchema

时间:2019-11-22 10:17:55

标签: apache-beam apache-beam-io beam-sql

我将多个表的数据输入到kafka,beam将在获取数据后执行SQL,但是现在出现以下错误:

线程“ main”中的异常

  

java.lang.IllegalStateException: Cannot call getSchema when there is no schema
    at org.apache.beam.sdk.values.PCollection.getSchema(PCollection.java:328)
    at org.apache.beam.sdk.extensions.sql.impl.schema.BeamPCollectionTable.&lt;init&gt;(BeamPCollectionTable.java:34)
    at org.apache.beam.sdk.extensions.sql.SqlTransform.toTableMap(SqlTransform.java:141)
    at org.apache.beam.sdk.extensions.sql.SqlTransform.expand(SqlTransform.java:102)
    at org.apache.beam.sdk.extensions.sql.SqlTransform.expand(SqlTransform.java:82)
    at org.apache.beam.sdk.Pipeline.applyInternal(Pipeline.java:539)
    at org.apache.beam.sdk.Pipeline.applyTransform(Pipeline.java:473)
    at org.apache.beam.sdk.values.PCollectionTuple.apply(PCollectionTuple.java:248)
    at BeamSqlTest.main(BeamSqlTest.java:65)

有可行的解决方案吗?请帮帮我!

2 个答案:

答案 0 :(得分:1)

我认为您需要在将输入集合（PCollection&lt;Row&gt; 类型的 apply）传给 SqlTransform 之前，调用 setRowSchema() 为它设置架构。问题在于您的架构是动态的，并且是在运行时逐条数据定义的（Beam SQL 不支持这种做法——架构必须在构建流水线时静态可知）。在开始处理输入数据之前，您能否先定义一个静态架构？

此外,由于您的输入源是不受限制的,因此您需要定义要在其后应用SqlTransform的窗口。

答案 1 :(得分:0)

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.beam.repackaged.sql.com.google.common.collect.ImmutableMap;
import org.apache.beam.runners.direct.DirectRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.sql.SqlTransform;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.beam.sdk.io.kafka.KafkaRecord;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.schemas.Schema;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SimpleFunction;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.*;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.joda.time.Duration;

import java.util.ArrayList;
import java.util.List;

class BeamSqlTest {
    public static void main(String[] args) {
        PipelineOptions options = PipelineOptionsFactory.fromArgs(args).as(PipelineOptions.class);
        options.setRunner(DirectRunner.class);
        Pipeline p = Pipeline.create(options);

        PCollection<KafkaRecord<String, String>> lines = p.apply(KafkaIO.<String, String>read()
                .withBootstrapServers("192.168.8.16")
                .withTopic("tmp_table.reuslt")
                .withKeyDeserializer(StringDeserializer.class)
                .withValueDeserializer(StringDeserializer.class)
                .withConsumerConfigUpdates(ImmutableMap.of("group.id", "beam_app"))
                .withReadCommitted()
                .commitOffsetsInFinalize());

        PCollection<Row> apply = lines.apply(ParDo.of(new DoFn<KafkaRecord<String, String>,Row>(){
            @ProcessElement
            public void processElement(ProcessContext c) {
                String jsonData = c.element().getKV().getValue(); //data: {id:0001@int,name:test01@string,age:29@int,score:99@int}
                if(!"data_increment_heartbeat".equals(jsonData)){ //Filter out heartbeat information
                    JSONObject jsonObject = JSON.parseObject(jsonData);
                    Schema.Builder builder = Schema.builder();
                    //A data pipeline may have data from multiple tables so the Schema is obtained dynamically
                    //This assumes data from a single table
                    List<Object> list = new ArrayList<Object>();
                    for(String s : jsonObject.keySet()) {
                        String[] dataType = jsonObject.get(s).toString().split("@");   //data@field type
                        if(dataType[1].equals("int")){
                            builder.addInt32Field(s);
                        }else if(dataType[1].equals("string")){
                            builder.addStringField(s);
                        }
                        list.add(dataType[0]);
                    }
                    Schema schema = builder.build();
                    Row row = Row.withSchema(schema).addValues(list).build();
                    System.out.println(row);
                    c.output(row);
                }
            }
        }));

        PCollection<Row> result = PCollectionTuple.of(new TupleTag<>("USER_TABLE"), apply)
                .apply(SqlTransform.query("SELECT COUNT(id) total_count, SUM(score) total_score FROM USER_TABLE GROUP BY id"));

        result.apply( "log_result", MapElements.via( new SimpleFunction<Row, Row>() {
            @Override
            public Row apply(Row input) {
                System.out.println("USER_TABLE result: " + input.getValues());
                return input;
            }
        }));`enter code here`

    }
}