编写一个Java类来排序100亿个整数

时间:2012-03-26 05:50:50

标签: java algorithm sorting

如何编写一个Java类来排序100亿个整数,假设我们一次只能将其中的一部分放入内存?

我已经实现了排序,但问题是:如何获取这10亿个数值?

如果我要将一部分内容加载到内存中,我将如何对它们进行排序?

如果您能提供源代码来帮助我,我将不胜感激。

提前致谢。

这是我的最终代码,您可以运行它,然后给我一些指导。

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Random;
import java.util.Scanner;

/**
 * Sorts a text file of whitespace-separated integers that is too large to sort in
 * memory, using a two-phase external merge sort:
 * <ol>
 *   <li>{@link #readFile(String)} splits the input into sorted runs of at most
 *       {@code SORT_FILE_MAX_SIZE} values and writes each run to its own file;</li>
 *   <li>the run files are then combined by a k-way merge that holds only one pending
 *       value per run in memory.</li>
 * </ol>
 *
 * @project jqcontacts
 * @date Mar 26, 2012
 * @time 11:16:35 AM
 */
public class SortIntegers {

    /** Final, fully sorted output file. */
    private static final String BIG_FILE = "g:/bigFile.txt";
    /** Run files are named {@code SORT_FILE_PREFIX + index + ".txt"}. */
    private static final String SORT_FILE_PREFIX = "g:/sortFile";
    /** Maximum number of values sorted in memory at once (use e.g. 1000000 for real data). */
    private static final int SORT_FILE_MAX_SIZE = 10;
    /** Unsorted input file produced by {@link #generateRawIntFile()}. */
    private static final String MAIN_FILE = "g:/rawfile1.txt";
    /** Number of random values written to the demo input file. */
    private static final int RAWA_FILE_MAX_SIZE = 100;
    /** Window size used by the two-argument {@link #readSortFile(String, List)}. */
    private static final int MERGE_BUFFER_INITIAL_SIZE = 5;

    /** Number of run files written so far by {@link #readFile(String)}. */
    private static int countFile = 0;

    /**
     * Phase 1: splits the input file into sorted run files.
     *
     * <p>Values are buffered until {@code SORT_FILE_MAX_SIZE} of them have been read,
     * then sorted and written out. A trailing, partially filled buffer is written as a
     * final (shorter) run so that no value is lost.
     *
     * @param name path of the file holding whitespace-separated integers
     * @throws FileNotFoundException if {@code name} cannot be opened for reading
     */
    public static void readFile(String name) throws FileNotFoundException {
        Scanner scanner = new Scanner(new File(name));
        try {
            List<Integer> run = new ArrayList<Integer>(SORT_FILE_MAX_SIZE);
            while (scanner.hasNextInt()) {
                run.add(scanner.nextInt());
                if (run.size() >= SORT_FILE_MAX_SIZE) {
                    writeSortedRun(run);
                }
            }
            // The input length is rarely an exact multiple of the run size.
            if (!run.isEmpty()) {
                writeSortedRun(run);
            }
        } finally {
            scanner.close();
        }

        System.out.println("done!");
    }

    /** Sorts {@code run}, writes it to the next run file, then empties the list for reuse. */
    private static void writeSortedRun(List<Integer> run) {
        Collections.sort(run);
        PrintWriter out = openWriter(SORT_FILE_PREFIX + countFile + ".txt");
        try {
            for (int value : run) {
                writeFile(value, out);
            }
        } finally {
            out.close();
        }
        run.clear();
        ++countFile;
    }

    /**
     * Appends the first {@code MERGE_BUFFER_INITIAL_SIZE} integers of a file to
     * {@code list} and sorts the list.
     *
     * @param name path of the file to read
     * @param list list to append to; it is sorted in place and returned
     * @return {@code list}, sorted
     * @throws FileNotFoundException if {@code name} cannot be opened for reading
     */
    public static List<Integer> readSortFile(String name, List<Integer> list) throws FileNotFoundException {
        return readSortFile(name, list, 0, MERGE_BUFFER_INITIAL_SIZE);
    }

    /**
     * Appends a window of integers from a file to {@code list} and sorts the list.
     *
     * <p>The first {@code offset} integers are consumed and discarded, so consecutive
     * windows never overlap. Fewer than {@code limit} values are appended when the file
     * ends early.
     *
     * @param name   path of the file to read
     * @param list   list to append to; it is sorted in place and returned
     * @param offset number of leading integers to skip (must not be negative)
     * @param limit  maximum number of integers to append (must not be negative)
     * @return {@code list}, sorted
     * @throws FileNotFoundException    if {@code name} cannot be opened for reading
     * @throws IllegalArgumentException if {@code offset} or {@code limit} is negative
     */
    public static List<Integer> readSortFile(String name, List<Integer> list, int offset, int limit)
            throws FileNotFoundException {
        if (offset < 0 || limit < 0) {
            throw new IllegalArgumentException("offset and limit must be >= 0: " + offset + ", " + limit);
        }
        Scanner scanner = new Scanner(new File(name));
        try {
            for (int skipped = 0; skipped < offset && scanner.hasNextInt(); skipped++) {
                scanner.nextInt();
            }
            for (int taken = 0; taken < limit && scanner.hasNextInt(); taken++) {
                list.add(scanner.nextInt());
            }
        } finally {
            scanner.close();
        }

        Collections.sort(list);
        return list;
    }

    /**
     * Opens a buffered writer on {@code name}, truncating any existing file.
     *
     * @throws IllegalStateException if the file cannot be opened for writing
     */
    private static PrintWriter openWriter(String name) {
        try {
            // No auto-flush: every caller closes the writer, and flushing per line
            // defeats the buffering.
            return new PrintWriter(new BufferedWriter(new FileWriter(new File(name))));
        } catch (IOException e) {
            throw new IllegalStateException("Cannot open " + name + " for writing", e);
        }
    }

    /** Writes one value on its own line, followed by a tab (the format the Scanner reads back). */
    private static void writeFile(int i, PrintWriter out) {
        String line = Integer.toString(i) + "\t";
        out.println(line);
    }

    /**
     * Generates a demo input file, splits it into sorted runs and merges the runs
     * into {@code BIG_FILE}.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        generateRawIntFile();

        try {
            readFile(MAIN_FILE);
            System.out.println("countFile: " + countFile);

            // Phase 2: merge the sorted run files into one sorted file.
            mergeSortedFiles(countFile, BIG_FILE);
            System.out.println("sorting is finished");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /**
     * K-way merge of run files {@code 0 .. fileCount-1} into {@code target}.
     *
     * <p>A min-heap holds the next unread value of every run; the smallest is written
     * out and replaced by the following value from the same run. Memory use is therefore
     * proportional to the number of runs, not to the number of values.
     *
     * <p>NOTE(review): all run files are open at the same time; with very many runs the
     * OS limit on open files may require merging in several passes.
     */
    private static void mergeSortedFiles(int fileCount, String target) throws FileNotFoundException {
        List<Scanner> sources = new ArrayList<Scanner>(fileCount);
        PrintWriter out = null;
        try {
            PriorityQueue<RunHead> heads = new PriorityQueue<RunHead>();
            for (int i = 0; i < fileCount; i++) {
                Scanner source = new Scanner(new File(SORT_FILE_PREFIX + i + ".txt"));
                sources.add(source);
                if (source.hasNextInt()) {
                    heads.add(new RunHead(source.nextInt(), source));
                }
            }

            out = openWriter(target);
            while (!heads.isEmpty()) {
                RunHead smallest = heads.poll();
                writeFile(smallest.value, out);
                if (smallest.source.hasNextInt()) {
                    // Reuse the entry for the next value of the same run.
                    smallest.value = smallest.source.nextInt();
                    heads.add(smallest);
                }
            }
        } finally {
            for (Scanner source : sources) {
                source.close();
            }
            if (out != null) {
                out.close();
            }
        }
    }

    /** The next unread value of one run together with the reader it came from. */
    private static final class RunHead implements Comparable<RunHead> {
        private int value;
        private final Scanner source;

        RunHead(int value, Scanner source) {
            this.value = value;
            this.source = source;
        }

        @Override
        public int compareTo(RunHead other) {
            // Explicit comparison instead of subtraction, which could overflow.
            return value < other.value ? -1 : (value > other.value ? 1 : 0);
        }
    }

    /**
     * Writes {@code RAWA_FILE_MAX_SIZE} random integers in the range
     * {@code [0, RAWA_FILE_MAX_SIZE)} to {@code MAIN_FILE}, one per line.
     */
    public static void generateRawIntFile() {
        Random randomGenerator = new Random();

        PrintWriter out = openWriter(MAIN_FILE);
        try {
            for (int i = 0; i < RAWA_FILE_MAX_SIZE; i++) {
                writeFile(randomGenerator.nextInt(RAWA_FILE_MAX_SIZE), out);
            }
        } finally {
            out.close();
        }
    }

}

1 个答案:

答案 0 :(得分:4)

int 只有约40亿个可能的取值,因此最高效的方法是统计每个值的出现次数。您可以使用内存映射的 MappedByteBuffer,这样就不必拥有16 GB的内存。一旦统计完所有出现次数,计数自然按值的顺序排列,因此无需再做排序。时间复杂度为O(n),而不是归并排序或快速排序那样的O(n * log n)。


import sun.nio.ch.DirectBuffer;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;

/**
 * Benchmark driver: records ten billion values in a {@code MassiveCounterStore},
 * walks the resulting counts in value order and reports elapsed time and heap growth.
 */
public class Sort10Billion {
    /**
     * Runs the benchmark and prints one summary line to standard output.
     *
     * @param args unused
     * @throws IOException if the counter store cannot create or map its backing file
     */
    public static void main(String... args) throws IOException {
        final Runtime rt = Runtime.getRuntime();
        final long heapBefore = rt.totalMemory() - rt.freeMemory();

        final MassiveCounterStore store = new MassiveCounterStore();
        final long startedAt = System.nanoTime();
        final long total = 10 * 1000 * 1000 * 1000L;

        // Feed the values from highest to lowest; dividing by 1019 gives every value
        // roughly a thousand occurrences.
        long remaining = total;
        while (remaining > 0) {
            store.incrementIndex((int) (remaining / 1019));
            remaining--;
        }

        // The callback discards the results on purpose; print n and count here to see them.
        final NumberCountFunction discard = new NumberCountFunction() {
            @Override
            public void counted(int n, long count) {
            }
        };
        store.iterator(discard);

        final long elapsedNanos = System.nanoTime() - startedAt;
        final long heapAfter = rt.totalMemory() - rt.freeMemory();
        final double seconds = elapsedNanos / 1e9;
        final double megabytes = (heapAfter - heapBefore) / 1e6;
        System.out.printf("Took %.1f seconds to sort %,d numbers, using %.3f MB%n", seconds, total, megabytes);
        store.close();
    }
}

/**
 * Callback that receives each recorded value together with its number of occurrences.
 */
interface NumberCountFunction {
    /**
     * Called once per value that has a non-zero counter.
     *
     * @param n     the value
     * @param count the stored 32-bit counter for {@code n}, read as an unsigned number
     */
    void counted(int n, long count);
}

/**
 * Counting-sort storage for {@code int} values: one 4-byte counter per possible value,
 * kept in a memory-mapped temporary file instead of on the heap.
 *
 * <p>The 2^32 counters (2^34 bytes) are split into {@code PARTITIONS} mappings of
 * {@code 2^PARTITION_BITS} bytes each. Values are offset by {@code Integer.MIN_VALUE}
 * so that counter position follows numeric order (negative values first).
 *
 * <p>Counters are 32 bits wide; a value recorded 2^32 times wraps around to zero.
 * The store must not be used after {@link #close()}.
 */
class MassiveCounterStore {
    public static final int PARTITION_BITS = 26;
    static final int PARTITIONS = (1 << (34 - PARTITION_BITS));  // 32-bit * 4 bytes.
    final MappedByteBuffer[] buffers = new MappedByteBuffer[PARTITIONS];
    final FileChannel channel;
    /** Lowest partition touched so far; {@code smallest > largest} means nothing was recorded. */
    int smallest = PARTITIONS;
    /** Highest partition touched so far. */
    int largest = 0;

    /**
     * Creates the temporary backing file and maps every partition.
     *
     * @throws IOException if the file cannot be created or a partition cannot be mapped
     */
    public MassiveCounterStore() throws IOException {
        File tmpStore = File.createTempFile("counter", "dat");
        tmpStore.deleteOnExit();

        channel = new RandomAccessFile(tmpStore, "rw").getChannel();
        boolean mapped = false;
        try {
            for (int i = 0; i < PARTITIONS; i++)
                buffers[i] = channel.map(FileChannel.MapMode.READ_WRITE, (long) i << PARTITION_BITS, 1 << PARTITION_BITS);
            mapped = true;
        } finally {
            // Do not leak the channel or partial mappings when a map() call fails.
            if (!mapped) release();
        }
    }

    /**
     * Adds one occurrence of {@code n}.
     *
     * @param n the value to record
     */
    public void incrementIndex(int n) {
        // Shift the signed range onto 0 .. 2^32-1 so that ordering is preserved.
        long l = (n + Integer.MIN_VALUE) & 0xFFFFFFFFL;
        int partition = (int) (l >> (PARTITION_BITS - 2)); // 4 bytes each.
        int index = (int) ((l << 2) & ((1 << PARTITION_BITS) - 1));
        MappedByteBuffer buffer = buffers[partition];
        int count = buffer.getInt(index);
        buffer.putInt(index, count + 1);
        if (smallest > partition) smallest = partition;
        if (largest < partition) largest = partition;
    }

    /**
     * Reports every recorded value in ascending order together with its count.
     * Only the partitions between the lowest and highest touched one are scanned.
     *
     * @param nfc receives one call per value whose counter is non-zero
     */
    public void iterator(NumberCountFunction nfc) {
        if (smallest > largest) return; // nothing has been recorded yet

        int n = (smallest << (PARTITION_BITS - 2)) + Integer.MIN_VALUE;
        for (int p = smallest; p <= largest; p++) {
            MappedByteBuffer buffer = buffers[p];
            for (int i = 0; i < 1 << PARTITION_BITS; i += 4) {
                int count = buffer.getInt(i);
                if (count != 0)
                    nfc.counted(n, count & 0xFFFFFFFFL);
                n++;
            }
        }
        // n must now be the first value of the partition after the last one scanned
        // (int arithmetic wraps to Integer.MIN_VALUE when that was the final partition).
        assert n == ((largest + 1) << (PARTITION_BITS - 2)) + Integer.MIN_VALUE;
    }

    /**
     * Closes the backing file and releases the mappings. Calling it again is harmless.
     */
    public void close() {
        release();
    }

    /** Closes the channel and unmaps every partition that is still mapped. */
    private void release() {
        try {
            channel.close();
        } catch (IOException ignored) {
            // Best effort: the temporary file is scheduled for deletion on exit anyway.
        }
        for (int i = 0; i < buffers.length; i++) {
            unmap(buffers[i]);
            // Drop the reference so that later use fails with a NullPointerException
            // instead of touching unmapped memory.
            buffers[i] = null;
        }
    }

    /**
     * Best-effort explicit unmap. The JDK has no public API for this, so the internal
     * cleaner is invoked reflectively; this keeps the class compiling on JDKs where
     * {@code sun.nio.ch.DirectBuffer} is not accessible. If neither mechanism is
     * available the mapping is released when the buffer is garbage collected.
     */
    private static void unmap(MappedByteBuffer buffer) {
        if (buffer == null) return;
        try {
            // JDK 9+: sun.misc.Unsafe.invokeCleaner(ByteBuffer)
            Class<?> unsafeClass = Class.forName("sun.misc.Unsafe");
            java.lang.reflect.Field theUnsafe = unsafeClass.getDeclaredField("theUnsafe");
            theUnsafe.setAccessible(true);
            Object unsafe = theUnsafe.get(null);
            unsafeClass.getMethod("invokeCleaner", java.nio.ByteBuffer.class).invoke(unsafe, buffer);
            return;
        } catch (Exception unavailable) {
            // Older JDK or restricted access: fall through to the pre-9 mechanism.
        }
        try {
            // JDK 8 and earlier: buffer.cleaner().clean()
            java.lang.reflect.Method cleanerMethod = buffer.getClass().getMethod("cleaner");
            cleanerMethod.setAccessible(true);
            Object cleaner = cleanerMethod.invoke(buffer);
            if (cleaner != null) {
                java.lang.reflect.Method clean = cleaner.getClass().getMethod("clean");
                clean.setAccessible(true);
                clean.invoke(cleaner);
            }
        } catch (Exception ignored) {
            // No explicit unmap available; rely on garbage collection.
        }
    }
}
使用 -XX:-UseTLAB 运行时(该选项可提供更精确的内存使用统计),输出如下:

Took 150.7 seconds to sort 10,000,000,000 numbers, using 0.202 MB

我认为使用202 KB非常好。 ;)

注意:您的性能在很大程度上取决于值的分布,因为这会影响缓存的效率。