需要将数据分成组,每组不应超过指定数量的重复项

时间:2017-06-13 03:15:18

标签: java mysql sql hadoop

我有一些数据,需要把它们分配到不同的桶中,并且在分配时,每个桶中的重复元素数量不应超过5个。

例如:

build.gradle

在这个例子中,第二个桶中只有5个元素是重复的。我有超过100k的数据,需要遵循上述逻辑。怎么做?

1 个答案:

答案 0 :(得分:1)

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;


public class SeparateData {

    public static void main(String[] args) {

        Integer[] intArr = {1, 2, 3, 4, 5, 35, 7, 8, 9, 10, 1, 2, 3, 4, 5, 73, 26, 19, 15, 1, 2, 3, 4, 5, 6, 33, 21, 
                22, 12, 88, 3, 2, 4, 5, 74, 13, 14, 17, 20, 1, 44, 30, 31, 37, 5, 4, 3, 99, 66, 11, 2, 5, 7, 43, 27};
        separate(intArr, 10, 5);
    }


    /**
     * Distributes the data into buckets of at most {@code countPerBucket}
     * elements so that no bucket receives more than {@code duplicateLimit}
     * duplicated values (and never two equal values), then prints the buckets.
     *
     * @param intArr         input data; left unmodified (a sorted copy is used)
     * @param countPerBucket maximum number of elements per bucket
     * @param duplicateLimit maximum number of duplicated values per bucket
     */
    static void separate(Integer intArr[], int countPerBucket, int duplicateLimit) {
        writeResult(countPerBucket, buildBuckets(intArr, countPerBucket, duplicateLimit));
    }


    /**
     * Builds the buckets without printing them (separated out so the result is
     * testable). Each bucket array has length {@code countPerBucket}; unused
     * trailing slots are {@code null}.
     *
     * @param intArr         input data; left unmodified
     * @param countPerBucket maximum number of elements per bucket
     * @param duplicateLimit maximum number of duplicated values per bucket
     * @return the filled buckets, in order
     */
    static List<Integer[]> buildBuckets(Integer[] intArr, int countPerBucket, int duplicateLimit) {

        // Sort a copy so the caller's array is not mutated as a side effect.
        Integer[] sorted = intArr.clone();
        Arrays.sort(sorted);

        // Count occurrences of every value. LinkedHashMap keeps ascending
        // (insertion) order since the input is sorted.
        // NOTE: the original compared boxed Integers with ==, which is only
        // reliable for cached values (-128..127); use null-check/equals logic.
        LinkedHashMap<Integer, Integer> counts = new LinkedHashMap<Integer, Integer>();
        for (Integer value : sorted) {
            Integer c = counts.get(value);
            counts.put(value, c == null ? 1 : c + 1);
        }

        // Split into unique values and duplicated values (value -> count).
        // The original dedup loop also dropped the element of a length-1 input;
        // counting first avoids that edge case.
        List<Integer> uniques = new ArrayList<Integer>();
        LinkedHashMap<Integer, Integer> duplicates = new LinkedHashMap<Integer, Integer>();
        for (Integer value : counts.keySet()) {
            int c = counts.get(value);
            if (c == 1) {
                uniques.add(value);
            } else {
                duplicates.put(value, c);
            }
        }

        // Slice the duplicated values into groups of at most duplicateLimit
        // distinct values. A value occurring k times appears in k different
        // groups, so no group (hence no bucket) ever holds two equal elements.
        List<List<Integer>> dupGroups = new ArrayList<List<Integer>>();
        Set<Integer> keys = duplicates.keySet();
        while (!keys.isEmpty()) {
            List<Integer> group = new ArrayList<Integer>();
            Iterator<Integer> iterator = keys.iterator();
            while (iterator.hasNext() && group.size() < duplicateLimit) {
                Integer key = iterator.next();
                group.add(key);
                int remaining = duplicates.get(key);
                if (remaining == 1) {
                    iterator.remove();              // last occurrence consumed
                } else {
                    duplicates.put(key, remaining - 1);
                }
            }
            dupGroups.add(group);
        }

        // Separate the data: one duplicate group per bucket, topped up with
        // unique values until the bucket is full.
        List<Integer[]> buckets = new ArrayList<Integer[]>();
        int next = 0; // index of the next unused unique value
        for (List<Integer> group : dupGroups) {
            Integer[] bucket = new Integer[countPerBucket];
            for (int o = 0; o < bucket.length; o++) {
                if (o < group.size()) {
                    bucket[o] = group.get(o);
                } else if (next < uniques.size()) {
                    bucket[o] = uniques.get(next++);
                }
            }
            buckets.add(bucket);
        }

        // Leftover uniques go into extra buckets. The original inner loop had
        // no bounds check here and threw IndexOutOfBoundsException whenever the
        // leftover count was not a multiple of countPerBucket.
        while (next < uniques.size()) {
            Integer[] bucket = new Integer[countPerBucket];
            for (int o = 0; o < bucket.length && next < uniques.size(); o++) {
                bucket[o] = uniques.get(next++);
            }
            buckets.add(bucket);
        }
        return buckets;
    }


    /**
     * Prints the buckets side by side, one column per bucket: a header row of
     * bucket names, then one row per bucket slot. Empty (null) slots are skipped.
     *
     * @param countPerBucket number of slots per bucket (row count)
     * @param buckets        the buckets to print
     */
    private static void writeResult(int countPerBucket, List<Integer[]> buckets) {
        for (int i = 0; i < buckets.size(); i++) {
            System.out.print("bucket" + i + "\t");
        }
        System.out.println();

        for (int k = 0; k < countPerBucket; k++) {
            for (int i = 0; i < buckets.size(); i++) {
                if (k < buckets.get(i).length && buckets.get(i)[k] != null) {
                    System.out.print(buckets.get(i)[k] + "\t");
                }
            }
            System.out.println();
        }
    }
}

输出:

bucket0 bucket1 bucket2 bucket3 bucket4 bucket5 

1   1   1   1   2   5   
2   2   2   2   3   7   
3   3   3   3   4   74  
4   4   4   4   5   88  
5   5   5   5   7   99  
6   12  19  27  37  
8   13  20  30  43  
9   14  21  31  44  
10  15  22  33  66  
11  17  26  35  73