Java service - Spark communication

Date: 2015-06-19 06:05:33

Tags: java rest apache-spark

I'm excited about this, and I'm looking for some guidance :)

I have a Java service that acts as the backend for my application. I receive some ad-hoc queries from the UI, and I've found that some of them take a very long time. So I decided to move these tasks to Spark. However, I'm still stuck on how to set up communication between my Java service and Apache Spark.

I've seen other SO questions, and it seems that Ooyala's Spark Job Server would solve my problem. I'd like to know whether there are other ways to solve it.

1 Answer:

Answer 0 (score: 0)

Here is some sample Java code for working with Spark. You can plug in your own logic after the input is loaded into Spark; in this example I used the pre-built KMeans clustering library, but you can substitute your own code.

import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.mllib.clustering.KMeans;
import org.apache.spark.mllib.clustering.KMeansModel;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;

public final class Spark_KMeans {

    // Parses a line of space-separated numbers into an MLlib dense vector.
    private static class ParsePoint implements Function<String, Vector> {
        private static final Pattern SPACE = Pattern.compile(" ");

        @Override
        public Vector call(String line) {
            String[] tok = SPACE.split(line);
            double[] point = new double[tok.length];
            for (int i = 0; i < tok.length; ++i) {
                point[i] = Double.parseDouble(tok[i]);
            }
            return Vectors.dense(point);
        }
    }

    public static void main(String[] args1) {

        // Hard-coded arguments: <input_file> <k> <max_iterations> [<runs>]
        String[] args = { "/usr/spark_pack/spark_input", "3", "5" };

        if (args.length < 3) {
            System.err.println("Usage: JavaKMeans <input_file> <k> <max_iterations> [<runs>]");
            System.exit(1);
        }
        String inputFile = args[0];
        int k = Integer.parseInt(args[1]);
        int iterations = Integer.parseInt(args[2]);
        int runs = 1;
        if (args.length >= 4) {
            runs = Integer.parseInt(args[3]);
        }

        String sparkHome = "/usr/spark_pack/spark-1.3.0-bin-hadoop2.4/";
        String sparkMasterUrl = "spark://master:7077";

        // Jars shipped to the executors: the Spark assembly and the application jar.
        String jarFile1 = "/usr/spark_pack/spark-1.3.0-bin-hadoop2.4/lib/spark-assembly-1.3.0-hadoop2.4.0.jar";
        String jarFile2 = "/usr/spark_pack/spark_jar/spark_mlib.jar";

        /*
         * A streaming alternative would use JavaStreamingContext instead:
         * JavaStreamingContext ssc = new JavaStreamingContext(sparkMasterUrl,
         *     "Kshitij Stream Engine", new Duration(1000), sparkHome);
         */
        SparkConf conf = new SparkConf()
                .setAppName("JavaKMeans")
                .setMaster(sparkMasterUrl)
                .setSparkHome(sparkHome)
                .setJars(new String[] { jarFile1, jarFile2 });

        JavaSparkContext sc = new JavaSparkContext(conf);

        /*
         * Alternative when submitting via spark-submit (master and jars supplied there):
         * SparkConf sparkConf = new SparkConf().setAppName("JavaKMeans");
         * JavaSparkContext sc = new JavaSparkContext(sparkConf);
         */

        JavaRDD<String> lines = sc.textFile(inputFile);

        // ---- Algorithm part: replace this with your own logic if needed ----
        JavaRDD<Vector> points = lines.map(new ParsePoint());
        KMeansModel model = KMeans.train(points.rdd(), k, iterations, runs,
                KMeans.K_MEANS_PARALLEL());
        System.out.println("Cluster centers:");
        for (Vector center : model.clusterCenters()) {
            System.out.println(" " + center);
        }
        double cost = model.computeCost(points.rdd());
        System.out.println("Cost: " + cost);
        sc.stop();
    }
}
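
As a follow-up: if the goal is to trigger this job from a long-running Java backend rather than running main() by hand, one option (besides the Spark Job Server mentioned in the question) is Spark's launcher API. The sketch below is illustrative only and assumes Spark 1.4+, which ships org.apache.spark.launcher.SparkLauncher; the jar path, class name, and master URL are copied from the example above and would need to match your own deployment.

import org.apache.spark.launcher.SparkLauncher;

public class SubmitKMeansJob {
    public static void main(String[] args) throws Exception {
        // Launches the Spark_KMeans application as a child spark-submit process
        // (paths and master URL are the assumed values from the example above).
        Process spark = new SparkLauncher()
                .setSparkHome("/usr/spark_pack/spark-1.3.0-bin-hadoop2.4")
                .setAppResource("/usr/spark_pack/spark_jar/spark_mlib.jar")
                .setMainClass("Spark_KMeans")
                .setMaster("spark://master:7077")
                .launch();

        // Block until the job finishes; a real service would handle this asynchronously.
        int exitCode = spark.waitFor();
        System.out.println("Spark job finished with exit code " + exitCode);
    }
}

Compared with creating a JavaSparkContext inside the service itself (as in the code above), launching a separate driver process keeps the backend JVM independent of the Spark driver's lifecycle.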