I am trying to run a test that loads 5 records, looks each one up against a database lookup table to find a value, merges that value into the record, and finally saves the result to a CSV file. It is very slow, both for the 5 records I want to modify and for 1 million records.
I have written some code, which is purely a test. If I had to write it in plain Java it would be faster, but I know that would probably not scale.
package com.pru.eaf.ark;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.broadcast.Broadcast;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

import lombok.Builder;
import lombok.Getter;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
@Slf4j
public class Executor {

    public static void main(String[] args) {
        // Expected arguments: isRunLocal, inputPath, numPartitions, outputPath
        if (args.length < 4) {
            for (String string : args) {
                log.info(string);
            }
            System.exit(1);
        }
        boolean isRunLocal = Boolean.parseBoolean(args[0]);

        SparkConf sparkConf = new SparkConf()
                .setAppName("Read Text to RDD")
                .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

        // Take care of running both locally and on the cluster
        if (isRunLocal) {
            System.setProperty("hadoop.home.dir", "C:\\winutils");
            sparkConf.setMaster("local[*]")
                     .set("spark.executor.memory", "2g");
        }

        log.info("Starting Spark Application");
        SparkSession s = SparkSession.builder().config(sparkConf).getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(s.sparkContext());

        // Parse the input text file into Data beans
        String path = args[1];
        JavaRDD<Data> data = sc.textFile(path).map(line -> DataParser.parseData(line));
        // Read the lookup table from the database, partitioned on the "id" column
        // (the 0..10 bounds only control how the partitions are split, not which rows are read)
        Dataset<Row> lookup = s.read()
                .jdbc("jdbc:oracle:thin:@***",
                        "beam_poc_pricing", "id", 0, 10, Integer.parseInt(args[2]),
                        getProps());

        // Broadcast the lookup table for the Spark nodes to process.
        // Note: broadcasting a Dataset only ships its logical plan, not its rows,
        // and the loop below runs entirely on the driver anyway.
        Broadcast<Dataset<Row>> broadcasted = sc.broadcast(lookup);
        // Collect the input to the driver and filter the broadcast table per record.
        // Each filter(...).collectAsList() call launches a separate Spark job that
        // re-reads the lookup table from the database.
        List<Sales> sl = new ArrayList<>();
        for (Data d : data.collect()) {
            Dataset<Row> row = broadcasted.value();
            List<Row> rows = row.filter((FilterFunction<Row>) r ->
                            r.getString(1).equals(d.getBrand()) &&
                            r.getString(2).equals(d.getModel()))
                    .collectAsList();
            if (!rows.isEmpty()) {
                Row r = rows.get(0);
                Sales sa = new Sales();
                sa.setBrand(d.getBrand());
                sa.setId(UUID.randomUUID().toString());
                sa.setModel(d.getModel());
                sa.setPricing(r.getAs("PRICING"));
                sa.setSaleDate(d.getSaleDate());
                sa.setSaleQty(d.getSaleQty());
                sl.add(sa);
            }
        }

        // Write the enriched records out once, after the loop, as a single CSV file
        JavaRDD<Sales> sales = sc.parallelize(sl);
        Dataset<Row> ds = s.createDataFrame(sales, Sales.class);
        Dataset<Sales> sa = ds.as(Encoders.bean(Sales.class));
        sa.coalesce(1)
          .write()
          .mode(SaveMode.Overwrite)
          .csv(args[3]);
        broadcasted.destroy();
        sc.close();
    }
    static Map<String, String> getDatabaseInfo() {
        Map<String, String> options = new HashMap<>();
        options.put("driver", "oracle.jdbc.OracleDriver");
        options.put("user", "user");
        options.put("password", "password");
        return options;
    }

    static Properties getProps() {
        Properties properties = new Properties();
        properties.putAll(getDatabaseInfo());
        return properties;
    }
    @Builder
    @Getter
    @ToString
    private static class Data implements Serializable {
        private static final long serialVersionUID = 1L;
        private String brand;
        private String model;
        private String saleQty;
        private String saleDate;
    }
    private static class DataParser {
        static Data parseData(final String line) {
            String[] fields = line.split(",");
            return Data.builder()
                    .brand(fields[0])
                    .model(fields[1])
                    .saleQty(fields[2])
                    .saleDate(fields[3])
                    .build();
        }
    }
}
The system runs very slowly.
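I suspect the real cost is the per-record filter(...).collectAsList() loop: every iteration launches a full Spark job and re-scans the lookup table. Should I be replacing the loop with a single broadcast join instead? A minimal sketch of what I have in mind, reusing s, data, lookup and args from the code above; the BRAND and MODEL column names are my assumption about the lookup table's schema:

import static org.apache.spark.sql.functions.broadcast;

// Turn the parsed input into a DataFrame so it can be joined
Dataset<Row> input = s.createDataFrame(data, Data.class);

// A single broadcast hash join replaces the per-record filter loop;
// the lookup table is shipped to the executors exactly once.
Dataset<Row> joined = input.join(
        broadcast(lookup),
        input.col("brand").equalTo(lookup.col("BRAND"))
             .and(input.col("model").equalTo(lookup.col("MODEL"))));

// Keep the input columns plus the looked-up price, and write one CSV file
joined.select(input.col("brand"),
              input.col("model"),
              input.col("saleQty"),
              input.col("saleDate"),
              lookup.col("PRICING"))
      .coalesce(1)
      .write()
      .mode(SaveMode.Overwrite)
      .csv(args[3]);

If the generated id column is still needed, I assume it could be added before the write with something like functions.expr("uuid()") (available from Spark 2.3). Is this the right direction, or is there a better way to do a lookup against a 1-million-row table?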