我需要在 k = 2 的情况下求解 k-最近邻问题(约 200,000,000 次),被搜索的参考点集规模约为 500,000 个 10 维向量。kNN 部分是我代码的性能瓶颈,我想加快它的速度。我的第一个想法是用 GPU 实现,但这个实现(this implementation)在我用几块大型 CUDA 显卡测试时,实际上只有在查询批量变大(例如 1000 左右)时才开始显出优势。我的查询批量大小约为 200,所以并不值得这样折腾。KD-Tree 也可行,但我一直避免使用它,因为我的实现要求在批次之间更换参考点集,这意味着我将花费大量时间重建树。
我的下一个想法是简单的多线程。批量 kNN 属于易并行(embarrassingly parallel)问题,所以这应该很直接。我做的第一件事是分别编写批量 kNN 的“朴素(Naive)”和“多线程(Multithreaded)”实现来比较性能。我知道在有 JIT 编译器的情况下做微基准测试并不准确,但 4 到 8 倍的加速应该是显而易见的。结果我发现,尽管在 Core-i7 上运行 4 到 8 个线程,而且每个线程都有足以占用几百毫秒的工作量,我的多线程代码也只比朴素版本快了大约两倍。如果我在这里做错了什么,我不知道错在哪里;如果我没有做错什么,我也不明白为什么性能没有随线程数大致线性扩展。更令人费解的是,在以较大的参考数组规模(500,000)运行多线程测试时,我的任务管理器(我正在运行 Windows)显示各内核的负载为 60–80%。如果它们不是在求解 kNN,那它们在做什么?
如果有人能给我一些提示,说明为什么多线程示例相对于朴素示例的加速比没有大致等于所使用的线程/内核数量,那将对我很有帮助。
编辑:为了回应大家的建议,我在单线程和多线程测试程序上都运行了 VisualVM 分析器。分析器的输出大体上符合预期,但也并非没有奇怪之处。我不得不将参考点数量提高到 5,000,000,以便为每个线程提供足够的工作量,使 VisualVM 的线程监视器能够在不发生混叠(aliasing)的情况下对其进行采样;这样做之后,我注意到了两件事:
首先,所有线程以大致同步的方式开始和停止运行,这是他们应该做的。有一些变化,但我将其归结为VisualVM使用的~1Hz轮询频率。
其次,所有多线程运行时间的平均总和远远长于单线程进程完成一批 200 个查询所用的平均时间。也就是说,所有线程每次都只工作七秒钟左右,但它们似乎比等效的单线程版本慢得多。
以下是朴素和多线程测试:
朴素实现(Naive):
import java.util.Random;
/**
 * Single-threaded brute-force benchmark for batched 2-nearest-neighbour search.
 *
 * <p>Each iteration draws a fresh random reference set and a fresh random query
 * batch, times one exhaustive search over them, and prints the indices of the
 * nearest and second-nearest reference point for every query. The average that
 * is printed at the end is the wall-clock time of one whole batch of queries
 * (the output label reads "s/query").
 */
public class NaiveNNSearchTest {

    /** Number of neighbours kept per query; the search tracks exactly the two closest. */
    private static final int NEIGHBOURS = 2;

    /**
     * Runs the benchmark with the hard-coded problem size and prints the results.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        int numReference = 100000;
        int numQuery = 200;
        int dimension = 10;
        int iterations = 100;
        Random rand = new Random();
        long[] times = new long[iterations];

        System.out.println("Starting...");
        for (int it = 0; it < iterations; it++) {
            // Reference points are generated before query points, as in every iteration.
            double[][] reference = randomPoints(rand, numReference, dimension);
            double[][] query = randomPoints(rand, numQuery, dimension);

            long tick = System.currentTimeMillis();
            int[][] index = findTwoNearest(reference, query);
            long tock = System.currentTimeMillis();
            times[it] = tock - tick;

            // One tab-separated row per neighbour rank, followed by a blank line.
            for (int rank = 0; rank < NEIGHBOURS; rank++) {
                // A dedicated loop variable is used here: reusing the neighbour
                // count as the counter would overwrite it with the batch size.
                for (int q = 0; q < index.length; q++) {
                    System.out.print(index[q][rank] + "\t");
                }
                System.out.println();
            }
            System.out.println();
        }

        double avrTime = 0;
        for (int i = 0; i < times.length; i++) {
            avrTime += times[i];
        }
        avrTime /= (double) iterations;
        avrTime /= 1000.0; // milliseconds to seconds
        System.out.println("Done. Average time = " + avrTime + "s/query");
    }

    /**
     * Finds, for every query vector, the indices of its two closest reference
     * vectors by exhaustive comparison using {@link #dist(double[], double[])}.
     *
     * @param reference the vectors to search; {@code reference[n]} is one vector
     * @param query     the vectors whose neighbours are wanted
     * @return one two-element array per query: {@code [nearest, secondNearest]}
     *         indices into {@code reference}; a slot is {@code -1} when no
     *         reference point qualified for it (for example when fewer than two
     *         reference points are supplied)
     */
    static int[][] findTwoNearest(double[][] reference, double[][] query) {
        int[][] index = new int[query.length][];
        for (int i = 0; i < query.length; i++) {
            double best = Double.POSITIVE_INFINITY;
            double second = Double.POSITIVE_INFINITY;
            int bestIdx = -1;
            int secondIdx = -1;
            for (int j = 0; j < reference.length; j++) {
                double d = dist(query[i], reference[j]);
                if (d < best) {
                    // New closest point: the previous closest becomes the runner-up.
                    second = best;
                    secondIdx = bestIdx;
                    best = d;
                    bestIdx = j;
                } else if (d < second) {
                    second = d;
                    secondIdx = j;
                }
            }
            index[i] = new int[] {bestIdx, secondIdx};
        }
        return index;
    }

    /**
     * Computes the squared Euclidean distance between two vectors.
     *
     * @param p1 first vector; its length determines how many components are compared
     * @param p2 second vector; must have at least {@code p1.length} components
     * @return the sum of squared component differences (no square root is taken)
     */
    public static double dist(double[] p1, double[] p2) {
        double ret = 0;
        for (int i = 0; i < p1.length; i++) {
            double diff = p1[i] - p2[i];
            ret += diff * diff;
        }
        return ret;
    }

    /** Creates {@code count} vectors of {@code dimension} values drawn from {@code rand.nextDouble()}. */
    private static double[][] randomPoints(Random rand, int count, int dimension) {
        double[][] points = new double[count][];
        for (int i = 0; i < count; i++) {
            points[i] = new double[dimension];
            for (int j = 0; j < dimension; j++) {
                points[i][j] = rand.nextDouble();
            }
        }
        return points;
    }
}
多线程:
import static java.lang.Math.floor;
import java.util.Random;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.CyclicBarrier;
/**
 * Multithreaded brute-force benchmark for batched 2-nearest-neighbour search.
 *
 * <p>A fixed set of worker threads is created once. For every batch the main
 * thread publishes the reference and query arrays, releases the workers through
 * a start barrier, waits for them on a stop barrier and then merges their
 * results. The average printed at the end is the wall-clock time of one whole
 * batch of queries (the output label reads "s/query").
 */
public class MultithreadNNSearchTest {

    /**
     * Runs the benchmark with the hard-coded problem size and prints the results.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        int numReference = 100000;
        int numQuery = 200;
        int dimension = 10;
        int iterations = 100;
        Random rand = new Random();
        double[][] reference = new double[numReference][];
        double[][] query = new double[numQuery][];
        long[] times = new long[iterations];

        NNWorker.makeWorkers(numQuery);
        System.out.println("Starting multithread...");
        for (int it = 0; it < iterations; it++) {
            // Fresh random data for every batch; reference points are drawn first.
            fillRandom(rand, reference, dimension);
            fillRandom(rand, query, dimension);

            long tick = System.currentTimeMillis();
            NNWorker.launchWorkers(reference, query);
            long tock = System.currentTimeMillis();
            times[it] = tock - tick;

            // One tab-separated row per neighbour rank, followed by a blank line.
            int[][] index = NNWorker.getIndexes();
            for (int rank = 0; rank < 2; rank++) {
                for (int q = 0; q < index.length; q++) {
                    System.out.print(index[q][rank] + "\t");
                }
                System.out.println();
            }
            System.out.println();
        }

        double avrTime = 0;
        for (int i = 0; i < times.length; i++) {
            avrTime += times[i];
        }
        avrTime /= (double) iterations;
        avrTime /= 1000.0; // milliseconds to seconds
        NNWorker.killWorkers();
        System.out.println("Done. Average time = " + avrTime + "s/query");
    }

    /**
     * Splits {@code numQuery} consecutive query slots into {@code numThreads}
     * contiguous ranges whose sizes differ by at most one. The first
     * {@code numQuery % numThreads} ranges receive the extra slot.
     *
     * @param numQuery   total number of slots to distribute; must not be negative
     * @param numThreads number of ranges to produce; must be at least 1
     * @return an array of {@code numThreads + 1} boundaries; range {@code t}
     *         covers {@code [result[t], result[t + 1])}
     * @throws IllegalArgumentException if either argument is out of range
     */
    static int[] partition(int numQuery, int numThreads) {
        if (numQuery < 0) {
            throw new IllegalArgumentException("numQuery must not be negative: " + numQuery);
        }
        if (numThreads < 1) {
            throw new IllegalArgumentException("numThreads must be at least 1: " + numThreads);
        }
        int base = numQuery / numThreads;
        int extra = numQuery % numThreads;
        int[] bounds = new int[numThreads + 1];
        for (int t = 0; t < numThreads; t++) {
            bounds[t + 1] = bounds[t] + base + (t < extra ? 1 : 0);
        }
        return bounds;
    }

    /** Replaces every row of {@code points} with a new vector of {@code dimension} random values. */
    private static void fillRandom(Random rand, double[][] points, int dimension) {
        for (int i = 0; i < points.length; i++) {
            points[i] = new double[dimension];
            for (int j = 0; j < dimension; j++) {
                points[i][j] = rand.nextDouble();
            }
        }
    }

    /**
     * A worker that searches a fixed, contiguous slice of the query batch.
     *
     * <p>All coordination state is static: the shared input arrays, the merged
     * result array, the two barriers and the shutdown flag. The main thread
     * writes the shared fields only while the workers are parked on a barrier;
     * the barriers' happens-before guarantee is what makes those writes visible
     * to the workers, and the workers' results visible to the main thread.
     */
    private static class NNWorker implements Runnable {
        private static double[][] reference; // The vectors to search. reference[n] = one vector.
        private static double[][] query;     // The vectors whose neighbours we are searching for.
        private static int[][] index;        // Merged result: index[q] = {nearest, secondNearest}.
        public static CyclicBarrier startBarrier, stopBarrier;
        private static boolean alive = true; // Cleared by killWorkers() before the final start-barrier trip.
        private static NNWorker[] workers;

        private int startIndex, stopIndex;   // This worker handles queries [startIndex, stopIndex).
        private double[][] localDist;        // Per-worker result buffers; the original author's stated
        private int[][] localIndex;          // intent was to avoid false sharing between workers.

        /**
         * Creates and starts the worker threads and assigns each a slice of the
         * query batch. The thread count is the smaller of the available
         * processor count and {@code numQuery}, but never less than one.
         *
         * @param numQuery number of queries in every batch that will be launched
         */
        public static void makeWorkers(int numQuery) {
            NNWorker.index = new int[numQuery][];
            int numCores = Runtime.getRuntime().availableProcessors();
            // Adjust this in case you suspect that a lot of time is spent waiting on cache misses.
            int numThreads = Math.max(1, Math.min(numCores, numQuery));
            System.out.println("Using " + numThreads + " threads.");

            int[] bounds = partition(numQuery, numThreads);
            workers = new NNWorker[numThreads];
            // One extra party on each barrier for the coordinating (main) thread.
            startBarrier = new CyclicBarrier(numThreads + 1);
            stopBarrier = new CyclicBarrier(numThreads + 1);

            System.out.print("Thread work assignments: ");
            for (int i = 0; i < numThreads; i++) {
                int start = bounds[i];
                int stop = bounds[i + 1];
                workers[i] = new NNWorker(start, stop);
                Thread thread = new Thread(workers[i]);
                // Daemon threads cannot keep the JVM alive if the main thread fails.
                thread.setDaemon(true);
                thread.start();
                if (i < numThreads - 1) {
                    System.out.print((stop - start) + "\t");
                } else {
                    System.out.println((stop - start));
                }
            }
        }

        /**
         * Runs one batch: publishes the inputs, releases the workers, waits for
         * all of them to finish and merges their results into the shared index
         * array returned by {@link #getIndexes()}.
         *
         * @param ref   the vectors to search
         * @param query the vectors whose neighbours are wanted; its length must
         *              match the {@code numQuery} given to {@link #makeWorkers(int)}
         * @throws IllegalStateException if the calling thread is interrupted or a
         *         barrier is broken, in which case no valid result is available
         */
        public static void launchWorkers(double[][] ref, double[][] query) {
            NNWorker.reference = ref;
            NNWorker.query = query;
            try {
                startBarrier.await();
                stopBarrier.await();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("Interrupted while waiting for the workers", e);
            } catch (BrokenBarrierException e) {
                // Collating after a broken barrier would expose stale or missing results.
                throw new IllegalStateException("A worker failed before the batch completed", e);
            }
            // Collate work in a single thread. Only the indices are of interest here,
            // not the distances.
            for (NNWorker worker : workers) {
                for (int j = worker.startIndex; j < worker.stopIndex; j++) {
                    index[j] = worker.localIndex[j - worker.startIndex].clone();
                }
            }
        }

        /**
         * Asks every worker to exit. Must be called while the workers are waiting
         * on the start barrier, i.e. between batches.
         */
        public static void killWorkers() {
            alive = false;
            try {
                startBarrier.await();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                e.printStackTrace();
            } catch (BrokenBarrierException e) {
                e.printStackTrace();
            }
        }

        /**
         * Returns the merged result of the most recent batch.
         *
         * @return the shared array itself (not a copy); {@code result[q]} holds
         *         the indices of the nearest and second-nearest reference vector
         */
        public static int[][] getIndexes() {
            return index;
        }

        /**
         * @param startIndex first query index handled by this worker (inclusive)
         * @param stopIndex  end of this worker's query range (exclusive)
         */
        public NNWorker(int startIndex, int stopIndex) {
            this.startIndex = startIndex;
            this.stopIndex = stopIndex;
            this.localDist = new double[stopIndex - startIndex][];
            this.localIndex = new int[stopIndex - startIndex][];
        }

        /**
         * Worker loop: wait on the start barrier, search this worker's slice,
         * then wait on the stop barrier. The shutdown flag is examined only
         * right after the start barrier trips; checking it anywhere else could
         * let a worker exit without arriving at the barrier that
         * {@link #killWorkers()} is waiting on.
         */
        @Override
        public void run() {
            while (true) {
                try {
                    startBarrier.await();
                    if (!alive) {
                        return;
                    }
                    // Read the shared inputs once per batch.
                    double[][] refs = reference;
                    double[][] queries = query;
                    for (int i = startIndex; i < stopIndex; i++) {
                        // The neighbour count is hard-coded to two.
                        double best = Double.POSITIVE_INFINITY;
                        double second = Double.POSITIVE_INFINITY;
                        int bestIdx = -1;   // -1 marks "no neighbour found".
                        int secondIdx = -1;
                        for (int j = 0; j < refs.length; j++) {
                            double d = dist(queries[i], refs[j]);
                            if (d < best) {
                                // New closest point: the previous closest becomes the runner-up.
                                second = best;
                                secondIdx = bestIdx;
                                best = d;
                                bestIdx = j;
                            } else if (d < second) {
                                second = d;
                                secondIdx = j;
                            }
                        }
                        // Buffer results in this worker's own arrays until the batch is merged.
                        localDist[i - startIndex] = new double[] {best, second};
                        localIndex[i - startIndex] = new int[] {bestIdx, secondIdx};
                    }
                    stopBarrier.await();
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                } catch (BrokenBarrierException e) {
                    e.printStackTrace();
                    return;
                }
            }
        }

        /**
         * Computes the squared Euclidean distance between two vectors.
         *
         * @param p1 first vector; its length determines how many components are compared
         * @param p2 second vector; must have at least {@code p1.length} components
         * @return the sum of squared component differences (no square root is taken)
         */
        public static double dist(double[] p1, double[] p2) {
            double ret = 0;
            for (int i = 0; i < p1.length; i++) {
                double diff = p1[i] - p2[i];
                ret += diff * diff;
            }
            return ret;
        }
    }
}