如何编写一个Java类来排序100亿个整数,假设我们一次只能将它们的一部分放在内存中。
我已经实现了排序部分,但问题是:我该如何读取这10亿个数值?
如果我要将一部分内容加载到内存中,我将如何对它们进行排序?
如果您能提供源代码来帮助我,我将不胜感激。
提前致谢。
这是我最终的代码,您可以运行它,然后给我一些指导。
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Random;
import java.util.Scanner;
/**
 * External merge sort for a whitespace-separated text file of integers that is too
 * large to be sorted in memory.
 *
 * <p>Phase 1 ({@link #readFile(String)}) reads the input in chunks of at most
 * {@code SORT_FILE_MAX_SIZE} values, sorts each chunk in memory and writes it to its own
 * chunk file. Phase 2 (run from {@link #main(String[])}) streams all chunk files
 * concurrently and performs a k-way merge, so only one value per chunk file is held in
 * memory while the globally sorted output is written.
 *
 * <p>All file locations are the hard-coded {@code g:/...} paths declared below.
 *
 * @project jqcontacts
 * @date Mar 26, 2012
 * @time 11:16:35 AM
 */
public class SortIntegers {
    /** Final, globally sorted output file. */
    private static final String BIG_FILE = "g:/bigFile.txt";
    /** Path prefix of the sorted chunk files; chunk {@code i} is {@code <prefix>i.txt}. */
    private static final String SORT_FILE_PREFIX = "g:/sortFile";
    /** Maximum number of values sorted in memory at once. Demo-sized; use e.g. 1000000 for real data. */
    private static final int SORT_FILE_MAX_SIZE = 10;
    /** Unsorted input file produced by {@link #generateRawIntFile()}. */
    private static final String MAIN_FILE = "g:/rawfile1.txt";
    /** Number of random values generated, and exclusive upper bound of each value. */
    private static final int RAWA_FILE_MAX_SIZE = 100;
    /** Number of leading values {@link #readSortFile(String, List)} reads from a file. */
    private static final int MERGE_BUFFER_INITIAL_SIZE = 5;
    /** Index that the next chunk file written by {@link #readFile(String)} will receive. */
    private static int countFile = 0;

    /**
     * Splits the given file into sorted chunk files.
     *
     * <p>Values are read until the first token that is not an {@code int}. Every
     * {@code SORT_FILE_MAX_SIZE} values are sorted and written to the next chunk file;
     * a final, shorter chunk is written as well so that no trailing values are lost.
     *
     * @param name path of the unsorted input file
     * @throws FileNotFoundException if the input file cannot be opened
     * @throws IllegalStateException if a chunk file cannot be opened or written
     */
    public static void readFile(String name) throws FileNotFoundException {
        Scanner scanner = new Scanner(new File(name));
        try {
            List<Integer> chunk = new ArrayList<Integer>(SORT_FILE_MAX_SIZE);
            while (scanner.hasNextInt()) {
                chunk.add(scanner.nextInt());
                if (chunk.size() >= SORT_FILE_MAX_SIZE) {
                    writeSortedChunk(chunk);
                }
            }
            // The input length is rarely an exact multiple of the chunk size.
            if (!chunk.isEmpty()) {
                writeSortedChunk(chunk);
            }
        } finally {
            scanner.close();
        }
        System.out.println("done!");
    }

    /** Sorts {@code chunk}, writes it to the next chunk file, then clears it and advances {@code countFile}. */
    private static void writeSortedChunk(List<Integer> chunk) {
        Collections.sort(chunk);
        String fileName = SORT_FILE_PREFIX + countFile + ".txt";
        PrintWriter out = openWriter(fileName);
        try {
            for (int value : chunk) {
                writeFile(value, out);
            }
            failOnWriteError(out, fileName);
        } finally {
            out.close();
        }
        chunk.clear();
        ++countFile;
    }

    /**
     * Appends up to {@code MERGE_BUFFER_INITIAL_SIZE} leading values of the given file to
     * {@code list} and then sorts the whole list.
     *
     * <p>Retained for backward compatibility only. {@link #main(String[])} no longer uses
     * it: reading fixed-size windows from every chunk and sorting them together cannot
     * yield a globally sorted result, because a later window of one chunk may hold values
     * smaller than an earlier window of another.
     *
     * @param name path of the file to read
     * @param list list that receives the values; it is modified in place
     * @return the same {@code list} instance, sorted ascending
     * @throws FileNotFoundException if the file cannot be opened
     */
    public static List<Integer> readSortFile(String name, List<Integer> list) throws FileNotFoundException {
        Scanner scanner = new Scanner(new File(name));
        try {
            int read = 0;
            while (read < MERGE_BUFFER_INITIAL_SIZE && scanner.hasNextInt()) {
                list.add(scanner.nextInt());
                read++;
            }
        } finally {
            scanner.close();
        }
        Collections.sort(list);
        return list;
    }

    /**
     * Opens a buffered writer on the given path, replacing any existing file.
     *
     * @throws IllegalStateException if the file cannot be opened for writing
     */
    private static PrintWriter openWriter(String name) {
        try {
            // No auto-flush: every writer is explicitly closed, and flushing per line is slow.
            return new PrintWriter(new BufferedWriter(new FileWriter(new File(name))));
        } catch (IOException e) {
            throw new IllegalStateException("Cannot open " + name + " for writing", e);
        }
    }

    /** Writes one value as its own line, followed by a tab (the format of all files used here). */
    private static void writeFile(int i, PrintWriter out) {
        String line = Integer.toString(i) + "\t";
        out.println(line);
    }

    /** PrintWriter swallows I/O errors (e.g. disk full); surface them instead of truncating silently. */
    private static void failOnWriteError(PrintWriter out, String fileName) {
        if (out.checkError()) {
            throw new IllegalStateException("I/O error while writing " + fileName);
        }
    }

    /**
     * Generates the random input file, splits it into sorted chunks and merges the chunks
     * into {@code BIG_FILE}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        generateRawIntFile();
        // Only merge the chunks produced by this run, even if countFile was advanced earlier.
        int firstChunk = countFile;
        try {
            readFile(MAIN_FILE);
            System.out.println("countFile: " + countFile);
            mergeSortedFiles(firstChunk, countFile, BIG_FILE);
            System.out.println("sorting is finished");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * K-way merges the chunk files with indices {@code from} (inclusive) to {@code to}
     * (exclusive) into {@code target}.
     *
     * <p>A min-heap holds one cursor per chunk, ordered by the cursor's current value.
     * The smallest value is written, its cursor advanced and re-inserted, until every
     * chunk is exhausted. Memory use is one value per chunk; note that every chunk file
     * is open at the same time, so the chunk size must be large enough to keep the number
     * of chunks below the operating system's open-file limit.
     */
    private static void mergeSortedFiles(int from, int to, String target) throws FileNotFoundException {
        PriorityQueue<ChunkCursor> heads = new PriorityQueue<ChunkCursor>(Math.max(1, to - from));
        PrintWriter out = null;
        try {
            for (int i = from; i < to; i++) {
                ChunkCursor cursor = new ChunkCursor(SORT_FILE_PREFIX + i + ".txt");
                if (cursor.advance()) {
                    heads.add(cursor);
                } else {
                    cursor.close();
                }
            }
            out = openWriter(target);
            while (!heads.isEmpty()) {
                ChunkCursor smallest = heads.poll();
                writeFile(smallest.current, out);
                if (smallest.advance()) {
                    heads.add(smallest);
                } else {
                    smallest.close();
                }
            }
            failOnWriteError(out, target);
        } finally {
            // Only non-empty on failure; exhausted cursors were already closed above.
            for (ChunkCursor open : heads) {
                open.close();
            }
            if (out != null) {
                out.close();
            }
        }
    }

    /** Streaming read position in one sorted chunk file, ordered by its current value. */
    private static final class ChunkCursor implements Comparable<ChunkCursor> {
        private final Scanner scanner;
        private int current;

        ChunkCursor(String fileName) throws FileNotFoundException {
            scanner = new Scanner(new File(fileName));
        }

        /** Loads the next value into {@code current}; returns false when the chunk is exhausted. */
        boolean advance() {
            if (!scanner.hasNextInt()) {
                return false;
            }
            current = scanner.nextInt();
            return true;
        }

        void close() {
            scanner.close();
        }

        @Override
        public int compareTo(ChunkCursor other) {
            // Explicit comparison rather than subtraction, which could overflow.
            return current < other.current ? -1 : (current == other.current ? 0 : 1);
        }
    }

    /**
     * Writes {@code RAWA_FILE_MAX_SIZE} random integers in the range
     * {@code [0, RAWA_FILE_MAX_SIZE)} to {@code MAIN_FILE}, one per line.
     *
     * @throws IllegalStateException if the file cannot be opened or written
     */
    public static void generateRawIntFile() {
        Random randomGenerator = new Random();
        PrintWriter out = openWriter(MAIN_FILE);
        try {
            for (int i = 0; i < RAWA_FILE_MAX_SIZE; i++) {
                writeFile(randomGenerator.nextInt(RAWA_FILE_MAX_SIZE), out);
            }
            failOnWriteError(out, MAIN_FILE);
        } finally {
            out.close();
        }
    }
}
答案 0 :(得分:4)
int 只有约 40 亿个可能的取值,因此最有效的方法是统计每个值出现的次数。您可以使用内存映射的 MappedByteBuffer,这样就不必拥有 16 GB 的物理内存。一旦统计完所有值的出现次数,计数自然按值的顺序排列,因此不需要进一步排序。时间复杂度为 O(n),而不是归并排序或快速排序那样的 O(n * log n)。
import sun.nio.ch.DirectBuffer;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
/**
 * Benchmark driver for {@code MassiveCounterStore}: records ten billion values,
 * walks the resulting counts once, and reports elapsed time and heap growth.
 */
public class Sort10Billion {
    /**
     * Runs the benchmark.
     *
     * @param args ignored
     * @throws IOException if the counter store cannot be created
     */
    public static void main(String... args) throws IOException {
        final Runtime rt = Runtime.getRuntime();
        // Heap in use before the store exists, so the report shows only the growth.
        final long usedBefore = rt.totalMemory() - rt.freeMemory();

        final MassiveCounterStore store = new MassiveCounterStore();
        final long startNanos = System.nanoTime();

        final long count = 10 * 1000 * 1000 * 1000L;
        // Feed i / 1019 for i = count .. 1, i.e. non-negative values with many repeats.
        long remaining = count;
        while (remaining > 0) {
            store.incrementIndex((int) (remaining / 1019));
            remaining--;
        }

        // The callback deliberately does nothing; only the traversal itself is timed.
        final NumberCountFunction sink = new NumberCountFunction() {
            @Override
            public void counted(int n, long count) {
                // System.out.println(n + ": " + count);
            }
        };
        store.iterator(sink);

        final long elapsedNanos = System.nanoTime() - startNanos;
        final long usedAfter = rt.totalMemory() - rt.freeMemory();
        System.out.printf("Took %.1f seconds to sort %,d numbers, using %.3f MB%n",
                elapsedNanos / 1e9, count, (usedAfter - usedBefore) / 1e6);
        store.close();
    }
}
/**
 * Callback that receives one (value, occurrence count) pair from
 * {@link MassiveCounterStore#iterator(NumberCountFunction)}.
 */
interface NumberCountFunction {
    /**
     * @param n     the value that was counted
     * @param count number of times {@code n} was recorded; never zero
     */
    void counted(int n, long count);
}

/**
 * Counting-sort store with one 32-bit counter for every possible {@code int} value,
 * backed by a memory-mapped temporary file instead of heap memory.
 *
 * <p>The 2^32 counters (4 bytes each, 2^34 bytes in total) are split into
 * {@link #PARTITIONS} partitions of {@code 2^PARTITION_BITS} bytes. A partition is mapped
 * the first time a value falling into it is recorded, so only the touched part of the
 * value range consumes address space and file pages.
 *
 * <p>The class has no compile-time dependency on JDK-internal types; a
 * {@code sun.nio.ch.DirectBuffer} import is not needed by this code.
 */
class MassiveCounterStore {
    public static final int PARTITION_BITS = 26;
    static final int PARTITIONS = (1 << (34 - PARTITION_BITS)); // 2^32 counters * 4 bytes = 2^34 bytes.
    /** Size of one partition in bytes. */
    private static final int PARTITION_BYTES = 1 << PARTITION_BITS;
    /** Mapped partitions; an entry is null until its partition is first written, and after close(). */
    final MappedByteBuffer[] buffers = new MappedByteBuffer[PARTITIONS];
    final FileChannel channel;
    /** Lowest and highest partition index touched so far; smallest > largest means nothing was recorded. */
    int smallest = PARTITIONS;
    int largest = 0;

    /**
     * Creates the backing temporary file (deleted on JVM exit). Nothing is mapped yet.
     *
     * @throws IOException if the temporary file cannot be created or opened
     */
    public MassiveCounterStore() throws IOException {
        File tmpStore = File.createTempFile("counter", "dat");
        tmpStore.deleteOnExit();
        channel = new RandomAccessFile(tmpStore, "rw").getChannel();
    }

    /**
     * Records one occurrence of {@code n}.
     *
     * @param n the value to count
     * @throws ArithmeticException   if {@code n} has already been recorded 2^32 - 1 times
     * @throws IllegalStateException if the partition cannot be mapped (e.g. after {@link #close()})
     */
    public void incrementIndex(int n) {
        // Adding MIN_VALUE flips the sign bit, mapping the signed range onto 0 .. 2^32-1 in order.
        long l = (n + Integer.MIN_VALUE) & 0xFFFFFFFFL;
        int partition = (int) (l >> (PARTITION_BITS - 2)); // 4 bytes each.
        int index = (int) ((l << 2) & (PARTITION_BYTES - 1));
        MappedByteBuffer buffer = bufferFor(partition);
        int count = buffer.getInt(index);
        if (count == -1) {
            // Counters are read back as unsigned 32-bit; one more increment would wrap to zero.
            throw new ArithmeticException("Counter overflow for value " + n);
        }
        buffer.putInt(index, count + 1);
        if (smallest > partition) smallest = partition;
        if (largest < partition) largest = partition;
    }

    /** Returns the mapped buffer of the given partition, mapping it on first use. */
    private MappedByteBuffer bufferFor(int partition) {
        MappedByteBuffer buffer = buffers[partition];
        if (buffer == null) {
            try {
                buffer = channel.map(FileChannel.MapMode.READ_WRITE,
                        (long) partition << PARTITION_BITS, PARTITION_BYTES);
            } catch (IOException e) {
                throw new IllegalStateException("Cannot map counter partition " + partition, e);
            }
            buffers[partition] = buffer;
        }
        return buffer;
    }

    /**
     * Reports every recorded value with its occurrence count, in ascending value order.
     *
     * @param nfc callback invoked once per distinct recorded value
     */
    public void iterator(NumberCountFunction nfc) {
        for (int p = smallest; p <= largest; p++) {
            MappedByteBuffer buffer = buffers[p];
            if (buffer == null) {
                continue; // partition never written: it holds no counts
            }
            // First value of partition p; the inverse of the mapping in incrementIndex.
            int n = (p << (PARTITION_BITS - 2)) + Integer.MIN_VALUE;
            for (int i = 0; i < PARTITION_BYTES; i += 4) {
                int count = buffer.getInt(i);
                if (count != 0)
                    nfc.counted(n, count & 0xFFFFFFFFL);
                n++;
            }
        }
    }

    /**
     * Closes the backing file and releases all mappings. The store must not be used
     * afterwards. Calling this method more than once is harmless.
     */
    public void close() {
        try {
            channel.close();
        } catch (IOException ignored) {
            // Best effort: the file is temporary and is deleted on exit anyway.
        }
        for (int i = 0; i < buffers.length; i++) {
            MappedByteBuffer buffer = buffers[i];
            if (buffer != null) {
                // Drop the reference first: touching an unmapped buffer would crash the JVM.
                buffers[i] = null;
                unmap(buffer);
            }
        }
    }

    /**
     * Best-effort eager unmapping. Java has no public API for this, so the JDK 9+ entry
     * point {@code sun.misc.Unsafe.invokeCleaner} is tried first and the JDK 8
     * {@code cleaner().clean()} path second, both via reflection. If neither is available
     * the mapping is released when the buffer is garbage collected.
     */
    private static void unmap(MappedByteBuffer buffer) {
        try {
            Class<?> unsafeClass = Class.forName("sun.misc.Unsafe");
            java.lang.reflect.Field theUnsafe = unsafeClass.getDeclaredField("theUnsafe");
            theUnsafe.setAccessible(true);
            Object unsafe = theUnsafe.get(null);
            unsafeClass.getMethod("invokeCleaner", java.nio.ByteBuffer.class).invoke(unsafe, buffer);
            return;
        } catch (Exception ignored) {
            // Not a JDK 9+ runtime (or access denied): fall through to the JDK 8 path.
        }
        try {
            java.lang.reflect.Method cleanerMethod = buffer.getClass().getMethod("cleaner");
            cleanerMethod.setAccessible(true);
            Object cleaner = cleanerMethod.invoke(buffer);
            if (cleaner != null) {
                java.lang.reflect.Method clean = cleaner.getClass().getMethod("clean");
                clean.setAccessible(true);
                clean.invoke(cleaner);
            }
        } catch (Exception ignored) {
            // No supported way to unmap eagerly; leave it to the garbage collector.
        }
    }
}
使用 -XX:-UseTLAB 运行时(该选项可提供更精确的内存使用统计),输出如下:
Took 150.7 seconds to sort 10,000,000,000 numbers, using 0.202 MB
我认为使用202 KB非常好。 ;)
注意:您的性能在很大程度上取决于值的分布,因为这会影响缓存的效率。