Spark - loading multiple files individually

Date: 2018-04-23 12:48:23

Tags: apache-spark apache-spark-sql spark-dataframe

I am new to Spark and this is my first piece of code. Please correct me if I'm wrong.

I want to load multiple files one by one - I don't want to point at a folder and process all of its files together. I only want to process selected files.

Below is my code (based on the Spark documentation). It works for a single file, but when I process multiple files it creates a separate DataFrame for each file. I would like the data to be appended to the previous DataFrame, so that all the records I add (from multiple files) end up in a single table (mylogs). Is there a way to do this? I want to call the readFileToSchema(...) method separately for each file.

import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TestApp {

    public static void main(String[] args) {
        TestApp app = new TestApp();
        SparkSession spark = SparkSession.builder().appName("Simple Application")
                .config("spark.master", "local").getOrCreate();

        String[] afiles = {"/Users/logs/test1.txt","/Users/logs/test2.txt"};
        final List<String> files = Arrays.asList(afiles);

        for (String file : files) {
            app.readFileToSchema(spark, file);
        }
    }

    public void readFileToSchema(SparkSession spark, String filePath) {

        SparkContext sc = spark.sparkContext();
        JavaRDD<String> logRDD = sc.textFile(filePath, 1).toJavaRDD();

        String schemaString = "a1 b1 c1 d1";

        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(" ")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);

        RegexMatch reg = new RegexMatch();
        JavaRDD<Row> rowRDD = logRDD
                .filter(line -> reg.isMatched(line))
                .map((Function<String, Row>) line -> {

                    String[] sp = line.split(" ");
                    String msg = line.substring(line.indexOf(sp[5]));
                    return RowFactory.create(sp[0] + " " + sp[1], sp[4], sp[5], msg);
                });

        Dataset<Row> logDataFrame = spark.createDataFrame(rowRDD, schema);

        logDataFrame.createOrReplaceTempView("mylogs");

        Dataset<Row> results = spark.sql("SELECT distinct(b1) FROM mylogs");

        List<Row> allrows = results.collectAsList();

        System.out.println("size : " + allrows.size());

        //spark.stop();
    }
}

1 Answer:

Answer 0 (score: 0)

Try this:

    public static void main(String[] args) {
        TestApp app = new TestApp();
        SparkSession spark = SparkSession.builder().appName("Simple Application")
            .config("spark.master", "local").getOrCreate();

        String afiles = "/Users/logs/test1.txt,/Users/logs/test2.txt";
        app.readFileToSchema(spark, afiles);
    }

    public void readFileToSchema(SparkSession spark, String files) {
        SparkContext sc = spark.sparkContext();
        JavaRDD<String> logRDD = sc.textFile(files, 1).toJavaRDD();

        // Rest of your code.
    }
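
If you would rather keep loading the files one at a time and decide per file whether to include it, another option is to build one DataFrame per file and union them before registering the temp view. This is only a rough sketch, assuming Spark 2.x (where Dataset.union is available) and reusing the question's imports and parsing logic; buildSchema() is a hypothetical helper standing in for the schema-building loop from the question:

    public void readFilesToSchema(SparkSession spark, List<String> files) {
        SparkContext sc = spark.sparkContext();
        StructType schema = buildSchema();   // hypothetical helper: same schema-building loop as in the question
        RegexMatch reg = new RegexMatch();
        Dataset<Row> combined = null;

        for (String file : files) {
            // Parse each selected file exactly as in the question.
            JavaRDD<Row> rowRDD = sc.textFile(file, 1).toJavaRDD()
                    .filter(line -> reg.isMatched(line))
                    .map((Function<String, Row>) line -> {
                        String[] sp = line.split(" ");
                        String msg = line.substring(line.indexOf(sp[5]));
                        return RowFactory.create(sp[0] + " " + sp[1], sp[4], sp[5], msg);
                    });

            Dataset<Row> df = spark.createDataFrame(rowRDD, schema);
            // Append this file's rows to what has been loaded so far.
            combined = (combined == null) ? df : combined.union(df);
        }

        combined.createOrReplaceTempView("mylogs");
        spark.sql("SELECT distinct(b1) FROM mylogs").show();
    }

With this approach all records from the selected files end up in the single mylogs table, at the cost of one union per file; the comma-separated path in the answer above is simpler when you already know the full list of files up front.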