I am using Locality Sensitive Hashing from https://github.com/karlhigley/spark-neighbors for approximate nearest neighbors (ANN). I have tried different distance measures: Hamming, Euclidean, and cosine. However, when I actually want to see the results by calling collect(),
this only works for the Hamming distance; the other two throw an error on collect().
This is despite the fact that the output type is always the same: org.apache.spark.rdd.RDD[(Long, Array[(Long, Double)])].
How can I get the output in a readable format, e.g. an array, a Spark DataFrame, etc.?
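To illustrate what I mean by readable, here is a rough sketch of the kind of conversion I have in mind (the column names are made up, and it assumes a SQLContext named sqlContext is available, as in the Spark shell / Zeppelin):

import sqlContext.implicits._
// Flatten RDD[(Long, Array[(Long, Double)])] into one row per (point, neighbor) pair
val readable = neighbors_hamming
  .flatMap { case (id, nbrs) => nbrs.map { case (nbrId, dist) => (id, nbrId, dist) } }
  .toDF("id", "neighbor", "distance")
readable.show()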
Here is the code:
//defining functions for the input
import scala.util.Random
import org.apache.spark.mllib.linalg.SparseVector
//object TestHelpers {
def generateIndices(quantity: Int, dimensions: Int) = {
  val indices = new Array[Int](quantity)
  var i = 0
  while (i < quantity) {
    val possible = Random.nextInt(dimensions)
    if (!indices.contains(possible)) {
      indices(i) = possible
      i += 1
    }
  }
  indices
}

def generateValues(quantity: Int) = {
  val values = new Array[Double](quantity)
  var i = 0
  while (i < quantity) {
    values(i) = Random.nextGaussian()
    i += 1
  }
  values
}

def generateRandomPoints(quantity: Int, dimensions: Int, density: Double) = {
  val numElements = math.floor(dimensions * density).toInt
  val points = new Array[SparseVector](quantity)
  var i = 0
  while (i < quantity) {
    val indices = generateIndices(numElements, dimensions)
    val values = generateValues(numElements)
    points(i) = new SparseVector(dimensions, indices, values)
    i += 1
  }
  points
}
//generating the input
val numPoints = 1000
val dimensions = 100
val density = 0.5
val localPoints = generateRandomPoints(numPoints, dimensions,density)
var points = sc.parallelize(localPoints).zipWithIndex.map(_.swap)
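As a quick sanity check on the generated input (just a sketch to confirm the points look reasonable):

// Sanity check (sketch): inspect the first point and its dimensionality
points.take(1).foreach { case (id, v) =>
  println(s"id=$id, size=${v.size}, nonZeros=${v.indices.length}")
}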
//actual model
import com.github.karlhigley.spark.neighbors.ANN
val annModel_hamming = new ANN(dimensions = 1000, measure = "hamming")
  .setTables(4)
  .setSignatureLength(64)
  .train(points)
val neighbors_hamming = annModel_hamming.neighbors(10)
val annModel_euclidean = new ANN(dimensions = 1000, measure = "euclidean")
  .setTables(4)
  .setSignatureLength(64)
  .setBucketWidth(5)
  .train(points)
val neighbors_euclidean = annModel_euclidean.neighbors(10)
val annModel_cosine = new ANN(20, "cosine")
  .setTables(1)
  .setSignatureLength(20)
  .setRandomSeed(3)
  .train(points)
val neighbors_cosine = annModel_cosine.neighbors(10)
When looking at the results for the Hamming distance, I can see the desired output:
neighbors_hamming.collect
res110: Array[(Long, Array[(Long, Double)])] = Array((778,Array((439,38.0), (722,38.0), (968,40.0), (892,42.0), (576,42.0), (375,42.0), (566,42.0), (18,42.0), (574,42.0), (351,44.0))), (386,Array((718,36.0), (354,38.0), (61,40.0), (926,40.0), (210,40.0),...
But for the Euclidean and cosine distances, calling
neighbors_euclidean.collect()
or
neighbors_cosine.collect()
gives me the following error in both cases:
java.lang.IllegalArgumentException: requirement failed: The columns of A don't match the number of elements of x. A: 20, x: 100
Here is the full error:
org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 75.0 failed 4 times, most recent failure: Lost task 1.3 in stage 75.0 (TID 93, sandbox.hortonworks.com): java.lang.IllegalArgumentException: requirement failed: The columns of A don't match the number of elements of x. A: 1000, x: 100
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.mllib.linalg.BLAS$.gemv(BLAS.scala:522)
at org.apache.spark.mllib.linalg.Matrix$class.multiply(Matrices.scala:100)
at org.apache.spark.mllib.linalg.DenseMatrix.multiply(Matrices.scala:248)
at com.github.karlhigley.spark.neighbors.linalg.RandomProjection.project(RandomProjection.scala:21)
at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.signature(ScalarRandomProjectionFunction.scala:28)
at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:42)
at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:18)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:155)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:153)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:153)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:151)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:284)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:275)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:313)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:277)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1433)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1421)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1420)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1420)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:801)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1642)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1590)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:622)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1882)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1953)
at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:934)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
at org.apache.spark.rdd.RDD.collect(RDD.scala:933)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$3d99ae6e19b65c7f617b22f29b431fb$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:223)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$3d99ae6e19b65c7f617b22f29b431fb$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:228)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:230)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:232)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:234)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:236)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:238)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:240)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:242)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:244)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:246)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:248)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:250)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:252)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:254)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:256)
at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:258)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:260)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:262)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:264)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:266)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:268)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:270)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:272)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:274)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:276)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:278)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:280)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:282)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:284)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:286)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:288)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:290)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:292)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:294)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:296)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:298)
at $iwC$$iwC$$iwC.<init>(<console>:300)
at $iwC$$iwC.<init>(<console>:302)
at $iwC.<init>(<console>:304)
at <init>(<console>:306)
at .<init>(<console>:310)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
at sun.reflect.GeneratedMethodAccessor173.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.zeppelin.spark.Utils.invokeMethod(Utils.java:38)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:717)
at org.apache.zeppelin.spark.SparkInterpreter.interpretInput(SparkInterpreter.java:928)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:871)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:864)
at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:94)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:341)
at org.apache.zeppelin.scheduler.Job.run(Job.java:176)
at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:139)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: requirement failed: The columns of A don't match the number of elements of x. A: 1000, x: 100
at scala.Predef$.require(Predef.scala:233)
at org.apache.spark.mllib.linalg.BLAS$.gemv(BLAS.scala:522)
at org.apache.spark.mllib.linalg.Matrix$class.multiply(Matrices.scala:100)
at org.apache.spark.mllib.linalg.DenseMatrix.multiply(Matrices.scala:248)
at com.github.karlhigley.spark.neighbors.linalg.RandomProjection.project(RandomProjection.scala:21)
at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.signature(ScalarRandomProjectionFunction.scala:28)
at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:42)
at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:18)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:155)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:153)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:153)
at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:151)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:284)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:275)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:313)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:277)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)