I'm new to Spark and this is my first piece of code. Please correct me if I'm wrong.
I want to load multiple files one by one. I don't want to point at a folder and process all of its files together; I only want to process the selected files.
Below is my code (based on the Spark documentation). It works for a single file, but when I process multiple files it creates a separate DataFrame for each one. I would like the data to be appended to the previous DataFrame, so that all the records I add (from multiple files) end up in a single table (mylogs). Is there a way to do that? I want to call the readFileToSchema(...) method separately for each file.
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class TestApp {

    public static void main(String[] args) {
        TestApp app = new TestApp();
        SparkSession spark = SparkSession.builder().appName("Simple Application")
                .config("spark.master", "local").getOrCreate();

        String[] afiles = {"/Users/logs/test1.txt", "/Users/logs/test2.txt"};
        final List<String> files = Arrays.asList(afiles);

        // Each call builds a new DataFrame and re-registers the "mylogs" view,
        // so only the last file's data ends up being visible.
        for (String file : files) {
            app.readFileToSchema(spark, file);
        }
    }

    public void readFileToSchema(SparkSession spark, String filePath) {
        SparkContext sc = spark.sparkContext();
        JavaRDD<String> logRDD = sc.textFile(filePath, 1).toJavaRDD();

        // Build a simple schema of four nullable string columns.
        String schemaString = "a1 b1 c1 d1";
        List<StructField> fields = new ArrayList<>();
        for (String fieldName : schemaString.split(" ")) {
            StructField field = DataTypes.createStructField(fieldName, DataTypes.StringType, true);
            fields.add(field);
        }
        StructType schema = DataTypes.createStructType(fields);

        RegexMatch reg = new RegexMatch();

        // Keep only matching lines and split each one into the four columns.
        JavaRDD<Row> rowRDD = logRDD
                .filter(line -> reg.isMatched(line))
                .map((Function<String, Row>) line -> {
                    String[] sp = line.split(" ");
                    String msg = line.substring(line.indexOf(sp[5]));
                    return RowFactory.create(sp[0] + " " + sp[1], sp[4], sp[5], msg);
                });

        Dataset<Row> logDataFrame = spark.createDataFrame(rowRDD, schema);
        logDataFrame.createOrReplaceTempView("mylogs");

        Dataset<Row> results = spark.sql("SELECT distinct(b1) FROM mylogs");
        List<Row> allrows = results.collectAsList();
        System.out.println("size : " + allrows.size());
        //spark.stop();
    }
}
Answer 0 (score: 0)
Try this:
public static void main(String[] args) {
    TestApp app = new TestApp();
    SparkSession spark = SparkSession.builder().appName("Simple Application")
            .config("spark.master", "local").getOrCreate();

    // Pass all selected files as a single comma-separated path string.
    String afiles = "/Users/logs/test1.txt,/Users/logs/test2.txt";
    app.readFileToSchema(spark, afiles);
}

public void readFileToSchema(SparkSession spark, String files) {
    SparkContext sc = spark.sparkContext();
    // textFile reads every path in the comma-separated list into one RDD.
    // (minPartitions is passed explicitly because the Scala default argument
    // is not available when calling SparkContext from Java.)
    JavaRDD<String> logRDD = sc.textFile(files, 1).toJavaRDD();
    //Rest of your code.
}
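
This works because SparkContext.textFile accepts a comma-separated list of paths, so all the selected files are read into a single RDD and therefore into one mylogs view. If you would rather keep calling a method once per file, as the question asks, a minimal sketch is to have that method return the per-file DataFrame and accumulate the results with Dataset.union before registering the view. The sketch below reuses the spark, app, and files variables from the question's main method, and the helper readFileToDataFrame is a hypothetical variant of readFileToSchema that returns the DataFrame instead of registering it; it is not part of the original code.

// Sketch only: accumulate per-file DataFrames with union(), assuming a
// hypothetical readFileToDataFrame(spark, file) that builds the DataFrame
// exactly as readFileToSchema does but returns it instead of registering it.
Dataset<Row> combined = null;
for (String file : files) {
    Dataset<Row> df = app.readFileToDataFrame(spark, file);
    // union() requires both sides to have the same schema, which holds here.
    combined = (combined == null) ? df : combined.union(df);
}
if (combined != null) {
    combined.createOrReplaceTempView("mylogs");  // one view over all files
    spark.sql("SELECT distinct(b1) FROM mylogs").show();
}

Either way, the query runs against a single table containing the records from every selected file.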