How do I write to S3 with Flink?

Time: 2018-07-19 03:02:06

Tags: apache-flink

I have found old, incomplete code that does not compile (http://antburton.com/writing-to-s3-with-flink/) and some ambiguous information (https://lists.apache.org/thread.html/%3C1519058352816-0.post@n4.nabble.com%3E).

Does Flink support writing to S3 as of 1.5.1, or in any other version?

Below is the best I have been able to come up with:

package com.example;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.datastream.DataStream;

import org.apache.flink.streaming.connectors.fs.Clock;
import org.apache.flink.streaming.connectors.fs.StringWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import org.apache.hadoop.fs.Path;

import java.util.Properties;

/**
* Skeleton for a Flink Streaming Job.
*
* <p>For a tutorial on how to write a Flink streaming application, check the
* tutorials and examples on the <a href="http://flink.apache.org/docs/stable/">Flink Website</a>.
*
* <p>To package your application into a JAR file for execution, run
* 'mvn clean package' on the command line.
*
* <p>If you change the name of the main class (with the public static void main(String[] args))
* method, change the respective entry in the POM.xml file (simply search for 'mainClass').
*/
public class StreamingJob {

    public static void main(String[] args) throws Exception {

        System.out.println("main ... ");
        // set up the streaming execution environment
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

        // generate a Watermark every second
        env.getConfig().setAutoWatermarkInterval(1000);

        // configure Kafka consumer
        Properties props = new Properties();
        props.setProperty("zookeeper.connect", "localhost:2181"); // Zookeeper default host:port
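        // ("zookeeper.connect" is only read by the legacy Kafka 0.8 consumer;
        //  FlinkKafkaConsumer011 needs only "bootstrap.servers")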
        props.setProperty("bootstrap.servers", "localhost:9092"); // Broker default host:port
        props.setProperty("group.id", "myGroup");                 // Consumer group ID
        props.setProperty("auto.offset.reset", "earliest");       // Always read topic from start


        // create a Kafka consumer
        FlinkKafkaConsumer011<String> consumer =
                new FlinkKafkaConsumer011<String>(
                        "connect-test",
                        new SimpleStringSchema(),
                        props);

        DataStream<String> stream = env.addSource(consumer);


        env.enableCheckpointing(2000, CheckpointingMode.AT_LEAST_ONCE);
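        // (BucketingSink only moves "pending" part files to their final state
        //  when a checkpoint completes, which is why checkpointing is enabled here)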


        BucketingSink<String> sink = new BucketingSink<String>("s3://gubo-test/");
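        // note: which Hadoop FileSystem implementation backs the "s3://" scheme
        // is decided by the Hadoop configuration on the classpath (fs.s3.impl)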
        sink.setUseTruncate(false);
        sink.setBucketer(new DateTimeBucketerGubo<String>("yyyy-MM-dd--HHmm"));
        sink.setWriter(new StringWriter<String>());
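        // note: the batch size is in bytes, so 3 means a new part file for
        // nearly every record (tiny on purpose, for testing only)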
        sink.setBatchSize(3);
        sink.setPendingPrefix("file-");
        sink.setPendingSuffix(".txt");

        // stream.writeAsText("s3://gubo-test/flink-test.txt");
        stream.addSink(sink);
        stream.print();

        env.execute("Flink Streaming Java API Skeleton");
    }

    public static class DateTimeBucketerGubo<T> extends DateTimeBucketer<T> {

        public DateTimeBucketerGubo(String formatString) {
            super(formatString);
        }

        @Override
        public Path getBucketPath(Clock clock, Path basePath, T element) {

            System.out.println("DateTimeBucketerGubo.getBucketPath: basePath: " + basePath.toString());

            Path ret = super.getBucketPath(clock, basePath, element);

            System.out.println("returning: " + ret.toString());

            return ret;
        }

    }
}

It throws:

    java.io.IOException: /2018-07-19--1050 doesn't exist
        at org.apache.hadoop.fs.s3.Jets3tFileSystemStore.get(Jets3tFileSystemStore.java:170)
        at org.apache.hadoop.fs.s3.Jets3tFileSystemStore.retrieveINode(Jets3tFileSystemStore.java:221)
        at sun.reflect.GeneratedMethodAccessor29.invoke(Unknown Source)
        at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:409)
        at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeMethod(RetryInvocationHandler.java:163)
        at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invoke(RetryInvocationHandler.java:155)
        at org.apache.hadoop.io.retry.RetryInvocationHandler$Call.invokeOnce(RetryInvocationHandler.java:95)
        at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:346)
        at com.sun.proxy.$Proxy21.retrieveINode(Unknown Source)
        at org.apache.hadoop.fs.s3.S3FileSystem.getFileStatus(S3FileSystem.java:340)
        at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1437)
        at org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink.openNewPartFile(BucketingSink.java:516)
        at org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink.invoke(BucketingSink.java:446)
        at org.apache.flink.streaming.api.functions.sink.SinkFunction.invoke(SinkFunction.java:52)
        at org.apache.flink.streaming.api.operators.StreamSink.processElement(StreamSink.java:56)
        at org.apache.flink.streaming.runtime.tasks.OperatorChain$CopyingChainingOutput.pushToOperator(OperatorChain.java:558)
        at org.apache.flink.streaming.runtime.tasks.OperatorChain$CopyingChainingOutput.collect(OperatorChain.java:533)
        at org.apache.flink.streaming.runtime.tasks.OperatorChain$CopyingChainingOutput.collect(OperatorChain.java:513)
        at org.apache.flink.streaming.runtime.tasks.OperatorChain$BroadcastingOutputCollector.collect(OperatorChain.java:628)
        at org.apache.flink.streaming.runtime.tasks.OperatorChain$BroadcastingOutputCollector.collect(OperatorChain.java:581)
        at org.apache.flink.streaming.api.operators.AbstractStreamOperator$CountingOutput.collect(AbstractStreamOperator.java:679)
        at org.apache.flink.streaming.api.operators.AbstractStreamOperator$CountingOutput.collect(AbstractStreamOperator.java:657)
        at org.apache.flink.streaming.api.operators.StreamSourceContexts$NonTimestampContext.collect(StreamSourceContexts.java:104)
        at org.apache.flink.streaming.api.operators.StreamSourceContexts$NonTimestampContext.collectWithTimestamp(StreamSourceContexts.java:111)
        at org.apache.flink.streaming.connectors.kafka.internals.AbstractFetcher.emitRecordWithTimestamp(AbstractFetcher.java:398)
        at org.apache.flink.streaming.connectors.kafka.internal.Kafka010Fetcher.emitRecord(Kafka010Fetcher.java:89)
        at org.apache.flink.streaming.connectors.kafka.internal.Kafka09Fetcher.runFetchLoop(Kafka09Fetcher.java:154)
        at org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase.run(FlinkKafkaConsumerBase.java:738)
        at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:87)
        at org.apache.flink.streaming.api.operators.StreamSource.run(StreamSource.java:56)
        at org.apache.flink.streaming.runtime.tasks.SourceStreamTask.run(SourceStreamTask.java:99)
        at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:306)
        at org.apache.flink.runtime.taskmanager.Task.run(Task.java:703)
        at java.lang.Thread.run(Thread.java:748)
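Note that the stack trace goes through org.apache.hadoop.fs.s3.Jets3tFileSystemStore, i.e. Hadoop's old s3:// block filesystem rather than s3a. In case it helps with reproducing this, here is a stripped-down sketch of just the S3 write path with Kafka removed; the class name S3WriteTest and the inline test data are mine, everything else matches the job above:

package com.example;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.fs.StringWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;

public class S3WriteTest {

    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // BucketingSink needs checkpointing to finalize pending part files
        env.enableCheckpointing(2000);

        // same bucket and sink settings as the full job above
        BucketingSink<String> sink = new BucketingSink<String>("s3://gubo-test/");
        sink.setWriter(new StringWriter<String>());
        sink.setBatchSize(3);

        // a tiny bounded source in place of Kafka
        env.fromElements("one", "two", "three")
                .addSink(sink);

        env.execute("S3 write test");
    }
}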

0 Answers:

No answers yet