How to get the LSH ANN output of karlhigley/spark-neighbors with Euclidean or cosine distance

Asked: 2017-06-09 18:49:07

Tags: scala apache-spark locality-sensitive-hash approximate-nn-searching

I am using Locality-Sensitive Hashing from https://github.com/karlhigley/spark-neighbors for approximate nearest-neighbor (ANN) search. I have tried the different distance measures: Hamming, Euclidean, and cosine. But when I actually want to see the results by calling collect(), this only works for the Hamming distance; the other two throw an error on collect(), even though the output type is always the same: org.apache.spark.rdd.RDD[(Long, Array[(Long, Double)])].

How can I get the output in a readable format, e.g. an array, a Spark DataFrame, etc.?
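
For the Hamming case, where collect() works, roughly what I have in mind is the sketch below, flattening the result into one row per neighbor pair (assuming the sqlContext implicits available in spark-shell/Zeppelin; neighbors_hamming is defined further down). I don't know whether this is the right approach:

import sqlContext.implicits._

// flatten RDD[(Long, Array[(Long, Double)])] into (id, neighbor, distance) rows
val neighborsDF = neighbors_hamming
  .flatMap { case (id, nn) => nn.map { case (other, dist) => (id, other, dist) } }
  .toDF("id", "neighbor", "distance")

neighborsDF.show(10)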

Here is the code:

// helper functions for generating the random sparse input vectors
import scala.util.Random

import org.apache.spark.mllib.linalg.SparseVector

// object TestHelpers wrapper commented out so the defs work at the top level of the shell
 def generateIndices(quantity: Int, dimensions: Int) = {
    val indices = new Array[Int](quantity)
    var i = 0
    while (i < quantity) {
      val possible = Random.nextInt(dimensions)
      if (!indices.take(i).contains(possible)) { // only check the slots filled so far; the zero-initialized tail otherwise blocks index 0
        indices(i) = possible
        i += 1
      }
    }
    indices
  }

  def generateValues(quantity: Int) = {
    val values = new Array[Double](quantity)
    var i = 0
    while (i < quantity) {
      values(i) = Random.nextGaussian()
      i += 1
    }
    values
  }

  def generateRandomPoints(quantity: Int, dimensions: Int, density: Double) = {
    val numElements = math.floor(dimensions * density).toInt
    val points = new Array[SparseVector](quantity)
    var i = 0
    while (i < quantity) {
      val indices = generateIndices(numElements, dimensions)
      val values = generateValues(numElements)
      points(i) = new SparseVector(dimensions, indices, values)
      i += 1
    }
    points
  }

//generating the input
val numPoints = 1000
val dimensions = 100
val density = 0.5

val localPoints = generateRandomPoints(numPoints, dimensions, density)
val points = sc.parallelize(localPoints).zipWithIndex.map(_.swap)
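
As a sanity check, the generated vectors really are 100-dimensional:

// declared size of the first generated SparseVector
points.first._2.size   // Int = 100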

//actual model
    val annModel_hamming = new ANN(dimensions = 1000, measure = "hamming").setTables(4).setSignatureLength(64).train(points2)

    val neighbors_hamming = annModel_hamming.neighbors(10)

val annModel_euclidean =
  new ANN(dimensions = 1000, measure = "euclidean")
    .setTables(4)
    .setSignatureLength(64)
    .setBucketWidth(5)
    .train(points)

val neighbors_euclidean = annModel_euclidean.neighbors(10)


val annModel_cosine =
  new ANN(dimensions = 20, measure = "cosine")
    .setTables(1)
    .setSignatureLength(20)
    .setRandomSeed(3)
    .train(points)

val neighbors_cosine = annModel_cosine.neighbors(10)

When looking at the result for the Hamming distance, the desired output can be seen:

neighbors_hamming.collect
res110: Array[(Long, Array[(Long, Double)])] = Array((778,Array((439,38.0), (722,38.0), (968,40.0), (892,42.0), (576,42.0), (375,42.0), (566,42.0), (18,42.0), (574,42.0), (351,44.0))), (386,Array((718,36.0), (354,38.0), (61,40.0), (926,40.0), (210,40.0),... 

But for the Euclidean or cosine distance, neighbors_euclidean.collect() and neighbors_cosine.collect() both fail.

In both cases I receive an error of the following form (this one from the cosine model):

java.lang.IllegalArgumentException: requirement failed: The columns of A don't match the number of elements of x. A: 20, x: 100

Here is the full error (this one from the Euclidean model, where dimensions = 1000):

org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 75.0 failed 4 times, most recent failure: Lost task 1.3 in stage 75.0 (TID 93, sandbox.hortonworks.com): java.lang.IllegalArgumentException: requirement failed: The columns of A don't match the number of elements of x. A: 1000, x: 100
    at scala.Predef$.require(Predef.scala:233)
    at org.apache.spark.mllib.linalg.BLAS$.gemv(BLAS.scala:522)
    at org.apache.spark.mllib.linalg.Matrix$class.multiply(Matrices.scala:100)
    at org.apache.spark.mllib.linalg.DenseMatrix.multiply(Matrices.scala:248)
    at com.github.karlhigley.spark.neighbors.linalg.RandomProjection.project(RandomProjection.scala:21)
    at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.signature(ScalarRandomProjectionFunction.scala:28)
    at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:42)
    at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:18)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:155)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:153)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:153)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:151)
    at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
    at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:284)
    at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
    at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:275)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:313)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:277)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1433)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1421)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1420)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1420)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:801)
    at scala.Option.foreach(Option.scala:236)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:801)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1642)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1601)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1590)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:622)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1882)
    at org.apache.spark.SparkContext.runJob(SparkContext.scala:1953)
    at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:934)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
    at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
    at org.apache.spark.rdd.RDD.withScope(RDD.scala:323)
    at org.apache.spark.rdd.RDD.collect(RDD.scala:933)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$3d99ae6e19b65c7f617b22f29b431fb$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:223)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$3d99ae6e19b65c7f617b22f29b431fb$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:228)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:230)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:232)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:234)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:236)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$ad149dbdbd963d0c9dc9b1d6f07f5e$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:238)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:240)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:242)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:244)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:246)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$6e49527b15a75f3b188beeb1837a4f1$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:248)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:250)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:252)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:254)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:256)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$$$93297bcd59dca476dd569cf51abed168$$$$$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:258)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:260)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:262)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:264)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:266)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:268)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:270)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:272)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:274)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:276)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:278)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:280)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:282)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:284)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:286)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:288)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:290)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:292)
    at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:294)
    at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:296)
    at $iwC$$iwC$$iwC$$iwC.<init>(<console>:298)
    at $iwC$$iwC$$iwC.<init>(<console>:300)
    at $iwC$$iwC.<init>(<console>:302)
    at $iwC.<init>(<console>:304)
    at <init>(<console>:306)
    at .<init>(<console>:310)
    at .<clinit>(<console>)
    at .<init>(<console>:7)
    at .<clinit>(<console>)
    at $print(<console>)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1065)
    at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1346)
    at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:840)
    at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:871)
    at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:819)
    at sun.reflect.GeneratedMethodAccessor173.invoke(Unknown Source)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.zeppelin.spark.Utils.invokeMethod(Utils.java:38)
    at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:717)
    at org.apache.zeppelin.spark.SparkInterpreter.interpretInput(SparkInterpreter.java:928)
    at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:871)
    at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:864)
    at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:94)
    at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:341)
    at org.apache.zeppelin.scheduler.Job.run(Job.java:176)
    at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:139)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
    at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.IllegalArgumentException: requirement failed: The columns of A don't match the number of elements of x. A: 1000, x: 100
    at scala.Predef$.require(Predef.scala:233)
    at org.apache.spark.mllib.linalg.BLAS$.gemv(BLAS.scala:522)
    at org.apache.spark.mllib.linalg.Matrix$class.multiply(Matrices.scala:100)
    at org.apache.spark.mllib.linalg.DenseMatrix.multiply(Matrices.scala:248)
    at com.github.karlhigley.spark.neighbors.linalg.RandomProjection.project(RandomProjection.scala:21)
    at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.signature(ScalarRandomProjectionFunction.scala:28)
    at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:42)
    at com.github.karlhigley.spark.neighbors.lsh.ScalarRandomProjectionFunction.hashTableEntry(ScalarRandomProjectionFunction.scala:18)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:155)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1$$anonfun$apply$14.apply(ANNModel.scala:153)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
    at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
    at scala.collection.mutable.ArrayOps$ofRef.foreach(ArrayOps.scala:108)
    at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
    at scala.collection.mutable.ArrayOps$ofRef.map(ArrayOps.scala:108)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:153)
    at com.github.karlhigley.spark.neighbors.ANNModel$$anonfun$generateHashTable$1.apply(ANNModel.scala:151)
    at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
    at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:284)
    at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
    at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:275)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:313)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:277)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
    at org.apache.spark.scheduler.Task.run(Task.scala:89)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:227)
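
Could the problem simply be that the dimensions argument passed to the ANN constructor (1000 for Euclidean, 20 for cosine) does not match the actual size of my vectors (100)? The requirement failure happens when the random-projection matrix is multiplied with a vector, and presumably the Hamming model doesn't hit it because bit sampling uses no projection matrix. A sketch of what I would try (untested; dimensions = 100 matches the generator above):

val annModel_euclidean_fixed =
  new ANN(dimensions = 100, measure = "euclidean")  // match the actual vector size
    .setTables(4)
    .setSignatureLength(64)
    .setBucketWidth(5)
    .train(points)

val neighbors_euclidean_fixed = annModel_euclidean_fixed.neighbors(10)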

0 Answers