My Spark is compiled with:
mvn -Pyarn -Phadoop-2.2 -Dhadoop.version=2.2.0 -Phive -Phive-thriftserver -DskipTests clean package
Then I tried to write a demo to read records from my Hive table. The following code works fine:
import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkConf, SparkContext}

object HiveTest {
  def main(args: Array[String]): Unit = {
    // -- configuration --
    val hconf = new Configuration()
    val conf = new SparkConf().setAppName("HiveTest")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    val hqlContext = new org.apache.spark.sql.hive.HiveContext(sc)

    hqlContext.sql("show databases").collect().foreach(println)
    hqlContext.sql(
      """
        FROM logbase_db.logbase_2
        SELECT doc
        limit 10
      """.stripMargin).collect().foreach(println)
  }
}
But when I apply a UDF in the HQL, it throws an error:
scala.MatchError: interface java.util.Map (of class java.lang.Class)
at org.apache.spark.sql.hive.HiveInspectors$class.javaClassToDataType(HiveInspectors.scala:38)
at org.apache.spark.sql.hive.HiveSimpleUdf.javaClassToDataType(hiveUdfs.scala:73)
at org.apache.spark.sql.hive.HiveSimpleUdf.dataType$lzycompute(hiveUdfs.scala:103)
at org.apache.spark.sql.hive.HiveSimpleUdf.dataType(hiveUdfs.scala:103)
at org.apache.spark.sql.catalyst.expressions.Alias.toAttribute(namedExpressions.scala:105)
at org.apache.spark.sql.catalyst.plans.logical.Project$$anonfun$output$1.apply(basicOperators.scala:25)
at org.apache.spark.sql.catalyst.plans.logical.Project$$anonfun$output$1.apply(basicOperators.scala:25)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.sql.catalyst.plans.logical.Project.output(basicOperators.scala:25)
at org.apache.spark.sql.catalyst.plans.logical.Limit.output(basicOperators.scala:147)
at org.apache.spark.sql.catalyst.planning.PhysicalOperation$$anonfun$unapply$1.apply(patterns.scala:61)
at org.apache.spark.sql.catalyst.planning.PhysicalOperation$$anonfun$unapply$1.apply(patterns.scala:61)
at scala.Option.getOrElse(Option.scala:120)
at org.apache.spark.sql.catalyst.planning.PhysicalOperation$.unapply(patterns.scala:61)
at org.apache.spark.sql.sources.DataSourceStrategy$.apply(DataSourceStrategy.scala:34)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
at org.apache.spark.sql.catalyst.planning.QueryPlanner$$anonfun$1.apply(QueryPlanner.scala:58)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.sql.catalyst.planning.QueryPlanner.apply(QueryPlanner.scala:59)
at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan$lzycompute(SQLContext.scala:418)
at org.apache.spark.sql.SQLContext$QueryExecution.sparkPlan(SQLContext.scala:416)
at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan$lzycompute(SQLContext.scala:422)
at org.apache.spark.sql.SQLContext$QueryExecution.executedPlan(SQLContext.scala:422)
at org.apache.spark.sql.SchemaRDD.collect(SchemaRDD.scala:444)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:29)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:34)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:36)
at $iwC$$iwC$$iwC.<init>(<console>:38)
at $iwC$$iwC.<init>(<console>:40)
at $iwC.<init>(<console>:42)
at <init>(<console>:44)
at .<init>(<console>:48)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:601)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1125)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:674)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:705)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:669)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:828)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:846)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:846)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:846)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:846)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:846)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:846)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:873)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:785)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:628)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:636)
at org.apache.spark.repl.SparkILoop.loop(SparkILoop.scala:641)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply$mcZ$sp(SparkILoop.scala:968)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:916)
at org.apache.spark.repl.SparkILoop$$anonfun$process$1.apply(SparkILoop.scala:916)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:916)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1011)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:601)
at org.apache.spark.deploy.SparkSubmit$.launch(SparkSubmit.scala:358)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:75)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
My UDF:
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.hive.ql.exec.UDF;

import java.io.Serializable;
import java.util.*;

public class RowToMap extends UDF implements Serializable {
    private static final char SEP = '^';
    private static final char KVSEP = '=';

    private static final String AVAILABLE = "available";

    // must exist
    private static final String PLT = "plt";
    private static final String CAID = "k";
    private static final String UUID = "uuid";
    private static final String SPID = "p";
    private static final String IP = "ip";
    private static final String TYPE = "tp";
    private static final String TIME = "ti";
    private static final List<String> MANDANTORY_FIELDS =
            Arrays.asList(new String[]{PLT, CAID, UUID, SPID, IP, TYPE, TIME});

    private static final String STABLE = "av";
    private static final String SDK = "kt";
    private static final String OS = "os";
    private static final String MD = "md";
    private static final String URL = "pr";
    private static final String IE = "e";
    private static final String APP = "pn";
    private static final List<String> OPTION_FIELDS =
            Arrays.asList(new String[]{STABLE, SDK, OS, MD, URL, IE, APP});

    public Map<String, String> evaluate(String s) {
        Map<String, String> map = new HashMap<String, String>();
        String[] arr = StringUtils.split(s, SEP);
        for (String str : arr) {
            int index = str.indexOf(KVSEP);
            if (index > 0) {
                String k = str.substring(0, index);
                String v = str.substring(index + 1);
                map.put(k, v);
            }
        }

        boolean badLog = true;
        for (String mandantoryField : MANDANTORY_FIELDS) {
            if (map.containsKey(mandantoryField)) {
                if (map.get(mandantoryField).isEmpty()) {
                    badLog = false;
                    break;
                }
            } else {
                badLog = false;
                break;
            }
        }
        map.put(AVAILABLE, badLog ? "available" : "");

        // OPTION_FIELDS
        for (String optionField : OPTION_FIELDS) {
            if (!map.containsKey(optionField)) {
                map.put(optionField, "");
            }
        }
        return map;
    }
}
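For context, the doc column holds caret-separated key=value pairs (hence the '^' and '=' separators above), which the UDF returns as a Map<String, String> after checking the mandatory keys. A minimal Hive CLI invocation would look roughly like this (the literal sample value is made up):

SELECT row_to_map('plt=web^k=100^uuid=u-001^p=2^ip=1.2.3.4^tp=click^ti=20140318')
FROM logbase_db.logbase_2 LIMIT 1;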
My Spark code:
import org.apache.hadoop.conf.Configuration
import org.apache.spark.{SparkConf, SparkContext}

object HiveTest {
  def main(args: Array[String]): Unit = {
    // -- configuration --
    val hconf = new Configuration()
    val conf = new SparkConf().setAppName("HiveTest")
    val sc = new SparkContext(conf)
    val sqlContext = new org.apache.spark.sql.SQLContext(sc)
    val hqlContext = new org.apache.spark.sql.hive.HiveContext(sc)

    hqlContext.sql("CREATE TEMPORARY FUNCTION row_to_map AS 'com.hide.diablo.mzhiveudf.RowToMap'")
    hqlContext.sql("set ca-meta=ALL")
    hqlContext.sql(
      """
        FROM logbase_db.logbase_2
        SELECT row_to_map(doc) AS row, dateid
        WHERE dateid >= 20140318 AND dateid <= 20140318
        limit 10
      """.stripMargin).collect().foreach(println)
  }
}
My launch command:
spark-submit --class "com.hide.etl.region.distribution.HiveTest" --jars /home/workspace/spark-1.2.0-bin-hadoop2/lib_managed/jars/datanucleus-api-jdo-3.2.6.jar,/home/workspace/spark-1.2.0-bin-hadoop2/lib_managed/jars/datanucleus-rdbms-3.2.9.jar,/home/workspace/spark-1.2.0-bin-hadoop2/lib_managed/jars/datanucleus-core-3.2.10.jar,/home/workspace/hadoop/spark_external_jars/udf-collection-1.0-SNAPSHOT.jar,/home/workspace/hadoop/sqoop/mysql-connector-java-5.1.31.jar --master yarn-cluster --num-executors 16 --driver-memory 8g --executor-memory 4g --executor-cores 4 scala.test.jar
I'm sure the UDF jar file is loaded correctly, and the CREATE TEMPORARY FUNCTION HQL runs without any error when I run the same code in spark-shell; the error above is thrown only when row_to_map is actually used. The same HQL also works fine in the Hive CLI (a sketch of that session is included below for reference). So, could anyone shed some light on this error? Thanks a lot!
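For reference, this is roughly the Hive CLI session in which the same query succeeds (the ADD JAR path is just illustrative):

ADD JAR /path/to/udf-collection-1.0-SNAPSHOT.jar;
CREATE TEMPORARY FUNCTION row_to_map AS 'com.hide.diablo.mzhiveudf.RowToMap';
FROM logbase_db.logbase_2
SELECT row_to_map(doc) AS row, dateid
WHERE dateid >= 20140318 AND dateid <= 20140318
limit 10;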