我的学习参考了“机器学习的火花”这本书
groupId:org.apache.spark artifactId:spark-core_2.11 版本:2.0.1
JavaApp.java
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
/**
* A simple Spark app in Java
*/
public class JavaApp {
public static void main(String[] args) {
JavaSparkContext sc = new JavaSparkContext("local[2]", "First Spark App");
// we take the raw data in CSV format and convert it into a set of records of the form (user, product, price)
JavaRDD<String[]> data = sc.textFile("data/UserPurchaseHistory.csv")
.map(new Function<String, String[]>() {
@Override
public String[] call(String s) throws Exception {
return s.split(",");
}
});
// let's count the number of purchases
long numPurchases = data.count();
// let's count how many unique users made purchases
long uniqueUsers = data.map(new Function<String[], String>() {
@Override
public String call(String[] strings) throws Exception {
return strings[0];
}
}).distinct().count();
// let's sum up our total revenue
double totalRevenue = data.map(new DoubleFunction<String[]>() {
@Override
public double call(String[] strings) throws Exception {
//double ret=Double.parseDouble(strings[2]);
//return ret;
//return Double.parseDouble(strings[2]);
}
}).sum();
// let's find our most popular product
// first we map the data to records of (product, 1) using a PairFunction
// and the Tuple2 class.
// then we call a reduceByKey operation with a Function2, which is essentially the sum function
List<Tuple2<String, Integer>> pairs = data.map(new PairFunction<String[], String, Integer>() {
@Override
public Tuple2<String, Integer> call(String[] strings) throws Exception {
return new Tuple2(strings[1], 1);
}
}).reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
}).collect();
// finally we sort the result. Note we need to create a Comparator function,
// that reverses the sort order.
Collections.sort(pairs, new Comparator<Tuple2<String, Integer>>() {
@Override
public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
return -(o1._2() - o2._2());
}
});
String mostPopular = pairs.get(0)._1();
int purchases = pairs.get(0)._2();
// print everything out
System.out.println("Total purchases: " + numPurchases);
System.out.println("Unique users: " + uniqueUsers);
System.out.println("Total revenue: " + totalRevenue);
System.out.println(String.format("Most popular product: %s with %d purchases",
mostPopular, purchases));
sc.stop();
}
}
我的pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>java-spark-app</groupId>
<artifactId>java-spark-app</artifactId>
<version>1.0</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.0.1</version>
</dependency>
</dependencies>
</project>
所以,我使用maven编译代码,但无法解决以下错误消息
[INFO] 2 errors
[INFO] -------------------------------------------------------------
[INFO] ------------------------------------------------------------------------
[INFO] BUILD FAILURE
[INFO] ------------------------------------------------------------------------
[INFO] Total time: 6.785 s
[INFO] Finished at: 2016-10-10T05:17:42+00:00
[INFO] Final Memory: 34M/777M
[INFO] ------------------------------------------------------------------------
[ERROR] Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.1:compile (default-compile) on project java-spark-app: Compilation failure: Compilation failure:
[ERROR] /home/ubuntu/workspace/practice/8519OS_Code/Chapter_01/java-spark-app/src/main/java/JavaApp.java:[40,35] method map in class org.apache.spark.api.java.AbstractJavaRDDLike<T,This> cannot be applied to given types;
[ERROR] required: org.apache.spark.api.java.function.Function<java.lang.String[],R>
[ERROR] found: <anonymous org.apache.spark.api.java.function.DoubleFunction<java.lang.String[]>>
[ERROR] reason: cannot infer type-variable(s) R
[ERROR] (argument mismatch; <anonymous org.apache.spark.api.java.function.DoubleFunction<java.lang.String[]>> cannot be converted to org.apache.spark.api.java.function.Function<java.lang.String[],R>)
[ERROR] /home/ubuntu/workspace/practice/8519OS_Code/Chapter_01/java-spark-app/src/main/java/JavaApp.java:[53,51] method map in class org.apache.spark.api.java.AbstractJavaRDDLike<T,This> cannot be applied to given types;
[ERROR] required: org.apache.spark.api.java.function.Function<java.lang.String[],R>
[ERROR] found: <anonymous org.apache.spark.api.java.function.PairFunction<java.lang.String[],java.lang.String,java.lang.Integer>>
[ERROR] reason: cannot infer type-variable(s) R
[ERROR] (argument mismatch; <anonymous org.apache.spark.api.java.function.PairFunction<java.lang.String[],java.lang.String,java.lang.Integer>> cannot be converted to org.apache.spark.api.java.function.Function<java.lang.String[],R>)
[ERROR] -> [Help 1]
[ERROR]
[ERROR] To see the full stack trace of the errors, re-run Maven with the -e switch.
[ERROR] Re-run Maven using the -X switch to enable full debug logging.
[ERROR]
[ERROR] For more information about the errors and possible solutions, please read the following articles:
[ERROR] [Help 1] http://cwiki.apache.org/confluence/display/MAVEN/MojoFailureException
这个问题在4天内让我担心。 请帮帮我:(
答案 0 :(得分:0)
您引用的代码示例可能使用旧版本的Spark。我修改了你发布的代码,它与最新版本的Spark一起工作正常。尝试构建并运行代码。
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
public class JavaApp {
public static void main(String[] args) {
JavaSparkContext sc = new JavaSparkContext("local[2]", "First Spark App");
// we take the raw data in CSV format and convert it into a set of records of the form (user, product, price)
JavaRDD<String[]> data = sc.textFile("data/UserPurchaseHistory.csv")
.map(new Function<String, String[]>() {
@Override
public String[] call(String s) throws Exception {
return s.split(",");
}
});
// let's count the number of purchases
long numPurchases = data.count();
// let's count how many unique users made purchases
long uniqueUsers = data.map(new Function<String[], String>() {
@Override
public String call(String[] strings) throws Exception {
return strings[0];
}
}).distinct().count();
// let's sum up our total revenue
double totalRevenue = data.mapToDouble(new DoubleFunction<String[]>() {
@Override
public double call(String[] strings) throws Exception {
//double ret=Double.parseDouble(strings[2]);
//return ret;
return Double.parseDouble(strings[2]);
}
}).sum();
// let's find our most popular product
// first we map the data to records of (product, 1) using a PairFunction
// and the Tuple2 class.
// then we call a reduceByKey operation with a Function2, which is essentially the sum function
List<Tuple2<String, Integer>> pairs = data.mapToPair(new PairFunction<String[], String, Integer>() {
@Override
public Tuple2<String, Integer> call(String[] strings) throws Exception {
return new Tuple2(strings[1], 1);
}
}).reduceByKey(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) throws Exception {
return integer + integer2;
}
}).collect();
// finally we sort the result. Note we need to create a Comparator function,
// that reverses the sort order.
Collections.sort(new ArrayList(pairs), new Comparator<Tuple2<String, Integer>>() {
@Override
public int compare(Tuple2<String, Integer> o1, Tuple2<String, Integer> o2) {
return -(o1._2() - o2._2());
}
});
String mostPopular = pairs.get(0)._1();
int purchases = pairs.get(0)._2();
// print everything out
System.out.println("Total purchases: " + numPurchases);
System.out.println("Unique users: " + uniqueUsers);
System.out.println("Total revenue: " + totalRevenue);
System.out.println(String.format("Most popular product: %s with %d purchases",
mostPopular, purchases));
sc.stop();
}
}