解析apache beam中的问题(beamSql)

时间:2018-03-21 08:47:44

标签: apache-beam

我有下面这段代码:先以字符串格式读取文件,然后将每一行转换为类对象,再将其转换为 BeamRecord,最后将其转换回字符串格式并将输出写入 Google Cloud Storage。

// Configure the Dataflow runner: target project, staging bucket and runner class.
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setProject("beta-194409");
options.setStagingLocation("gs://clrtegbucket/staging");
options.setRunner(DataflowRunner.class);
DataflowRunner.fromOptions(options);
Pipeline p = Pipeline.create(options);

// Step 1: read the CSV file, one String element per line.
PCollection<String> weekly = p.apply(TextIO.read().from("gs://gcp/input/WeeklyDueto.csv"));

// Step 2: convert every CSV line into a ClassWeeklyDueto object.
PCollection<ClassWeeklyDueto> pojos = weekly.apply(ParDo.of(new DoFn<String, ClassWeeklyDueto>() {
    private static final long serialVersionUID = 1L;

    @ProcessElement
    public void processElement(ProcessContext c) {
        // A limit of -1 keeps trailing empty columns; plain split(",") drops them,
        // so a row whose last field is empty would fail at strArr[9].
        String[] strArr = c.element().split(",", -1);
        ClassWeeklyDueto clr = new ClassWeeklyDueto();
        clr.setOutlet(strArr[0]);
        clr.setCatLib(strArr[1]);
        clr.setProdKey(strArr[2]);
        clr.setWeek(strArr[3]);
        clr.setSalesComponent(strArr[4]);
        // Splitting a CSV line yields Strings only, so the numeric column has to be
        // parsed explicitly before it is stored in the float field.
        clr.setDuetoValue(Float.parseFloat(strArr[5].trim()));
        clr.setPrimaryCausalKey(strArr[6]);
        clr.setCausalValue(strArr[7]);
        clr.setModelIteration(strArr[8]);
        clr.setPublished(strArr[9]);
        // NOTE(review): global_Weekly is declared outside this snippet; confirm that
        // mutating it from inside a DoFn is intended.
        global_Weekly.add(clr);
        c.output(clr);
    }
}));

// Step 3: describe the record layout; column order must match the value order below.
BeamRecordSqlType appType = BeamRecordSqlType.create(
        Arrays.asList("Outlet", "CatLib", "ProdKey", "Week", "SalesComponent", "DuetoValue",
                "PrimaryCausalKey", "CausalValue", "ModelIteration", "Published"),
        Arrays.asList(Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR,
                Types.FLOAT, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR));

// Step 4: convert every ClassWeeklyDueto object into a BeamRecord of type appType.
PCollection<BeamRecord> apps = pojos.apply(ParDo.of(new DoFn<ClassWeeklyDueto, BeamRecord>() {
    private static final long serialVersionUID = 1L;

    @ProcessElement
    public void processElement(ProcessContext c) {
        ClassWeeklyDueto row = c.element();
        BeamRecord br = new BeamRecord(appType, row.Outlet, row.CatLib, row.ProdKey,
                row.Week, row.SalesComponent, row.DuetoValue,
                row.PrimaryCausalKey, row.CausalValue, row.ModelIteration,
                row.Published);
        c.output(br);
    }
})).setCoder(appType.getRecordCoder());

// Step 5: turn every BeamRecord back into its String representation.
PCollection<String> gs_output_final = apps.apply(ParDo.of(new DoFn<BeamRecord, String>() {
    private static final long serialVersionUID = 1L;

    @ProcessElement
    public void processElement(ProcessContext c) {
        c.output(c.element().toString());
        System.out.println(c.element().toString());
    }
}));

// Step 6: write the String records to Google Cloud Storage.
gs_output_final.apply(TextIO.write().to("gs://gcp/output/Q"));
// NOTE(review): no p.run() call is visible in this snippet; confirm that the
// surrounding code runs the pipeline.

我在下面创建了ClassWeeklyDueto类:

package com.pojo;  
import java.io.Serializable;  
/**
 * Serializable value holder for one row of the weekly "due-to" CSV file.
 *
 * <p>The fields are public because code elsewhere in this file reads them
 * directly (for example {@code c.element().Outlet}); matching getters and
 * setters are provided as well.
 */
public class ClassWeeklyDueto implements Serializable {

    private static final long serialVersionUID = 1L;

    public String Outlet;
    public String CatLib;
    public String ProdKey;
    public String Week;
    public String SalesComponent;
    public float DuetoValue;
    public String PrimaryCausalKey;
    public String CausalValue;
    public String ModelIteration;
    public String Published;

    /** Returns the outlet value. */
    public String getOutlet() {
        return Outlet;
    }

    /** Sets the outlet value. */
    public void setOutlet(String outlet) {
        Outlet = outlet;
    }

    /** Returns the CatLib value. */
    public String getCatLib() {
        return CatLib;
    }

    /** Sets the CatLib value. */
    public void setCatLib(String catLib) {
        CatLib = catLib;
    }

    /** Returns the product key. */
    public String getProdKey() {
        return ProdKey;
    }

    /** Sets the product key. */
    public void setProdKey(String prodKey) {
        ProdKey = prodKey;
    }

    /** Returns the week value. */
    public String getWeek() {
        return Week;
    }

    /** Sets the week value. */
    public void setWeek(String week) {
        Week = week;
    }

    /** Returns the sales component. */
    public String getSalesComponent() {
        return SalesComponent;
    }

    /** Sets the sales component. */
    public void setSalesComponent(String salesComponent) {
        SalesComponent = salesComponent;
    }

    /** Returns the due-to value. */
    public float getDuetoValue() {
        return DuetoValue;
    }

    /** Sets the due-to value from an already parsed float. */
    public void setDuetoValue(float duetoValue) {
        DuetoValue = duetoValue;
    }

    /**
     * Parses the given text as a float, stores it as the due-to value and returns it.
     *
     * <p>Leading and trailing whitespace is ignored. The parsed value is assigned
     * to {@link #DuetoValue}; without that assignment the field would keep its
     * default of {@code 0.0f} whenever a caller passed the raw CSV column.
     *
     * @param string textual float value, e.g. {@code "12.5"}
     * @return the parsed value that was stored
     * @throws NumberFormatException if the text is not a parsable float
     * @throws NullPointerException if {@code string} is {@code null}
     */
    public float setDuetoValue(String string) {
        float parsed = Float.parseFloat(string.trim());
        DuetoValue = parsed;
        return parsed;
    }

    /** Returns the primary causal key. */
    public String getPrimaryCausalKey() {
        return PrimaryCausalKey;
    }

    /** Sets the primary causal key. */
    public void setPrimaryCausalKey(String primaryCausalKey) {
        PrimaryCausalKey = primaryCausalKey;
    }

    /** Returns the causal value. */
    public String getCausalValue() {
        return CausalValue;
    }

    /** Sets the causal value. */
    public void setCausalValue(String causalValue) {
        CausalValue = causalValue;
    }

    /** Returns the model iteration. */
    public String getModelIteration() {
        return ModelIteration;
    }

    /** Sets the model iteration. */
    public void setModelIteration(String modelIteration) {
        ModelIteration = modelIteration;
    }

    /** Returns the published value. */
    public String getPublished() {
        return Published;
    }

    /** Sets the published value. */
    public void setPublished(String published) {
        Published = published;
    }

}

DuetoValue 字段声明为 float 类型,但只有声明为 VARCHAR 的字段才会被解析,其余数据类型的字段都不会被解析。

那么如何解析声明为Int或float或甚至Date的字段?

1 个答案:

答案 0 :(得分:1)

当您从CSV手动分割字符串行时,您将获得一个字符串数组。然后,您必须手动解析字符串中的值。 Java不会自动处理它。

对于浮点数,您需要将 clr.setDuetoValue(strArr[5]) 更改为 clr.setDuetoValue(Float.parseFloat(strArr[5]))(参见 Float.parseFloat 的文档)。

同样,您可以使用Integer.parseInt()来解析整数。

对于解析日期,您可能需要使用 SimpleDateFormat。