Splitting a JSONArray into smaller JSONArrays

Date: 2015-02-01 13:46:58

Tags: java json

I ran into a situation where an org.json.JSONArray object grows very large, which eventually causes latency and other problems. So we decided to split the JSONArray into smaller chunks. For example, if the JSONArray looks like this:

- [{"alt_party_id_type":"xyz","first_name":"child1ss","status":"1","dob":"2014-10-02 00:00:00.0","last_name":"childSs"},
{"alt_party_id_type":"xyz","first_name":"suga","status":"1","dob":"2014-11-05 00:00:00.0","last_name":"test"},
{"alt_party_id_type":"xyz","first_name":"test4a","status":"1","dob":"2000-11-05 00:00:00.0","last_name":"test4s"},
{"alt_party_id_type":"xyz","first_name":"demo56","status":"0","dob":"2000-11-04 00:00:00.0","last_name":"Demo5"},{"alt_party_id_type":"xyz","first_name":"testsss","status":"1","dob":"1900-01-01 00:00:00.0","last_name":"testssssssssss"},{"alt_party_id_type":"xyz","first_name":"Demo1234","status":"0","dob":"2014-11-21 00:00:00.0","last_name":"Demo1"},{"alt_party_id_type":"xyz","first_name":"demo2433","status":"1","dob":"2014-11-13 00:00:00.0","last_name":"demo222"},{"alt_party_id_type":"xyz","first_name":"demo333","status":"0","dob":"2014-11-12 00:00:00.0","last_name":"demo344"},{"alt_party_id_type":"xyz","first_name":"Student","status":"1","dob":"2001-12-03 00:00:00.0","last_name":"StudentTest"}]

Then I need help splitting this JSONArray into three JSONArrays:

- [{"alt_party_id_type":"xyz","first_name":"child1ss","status":"1","dob":"2014-10-02 00:00:00.0","last_name":"childSs"}, {"alt_party_id_type":"xyz","first_name":"suga","status":"1","dob":"2014-11-05 00:00:00.0","last_name":"test"}, {"alt_party_id_type":"xyz","first_name":"test4a","status":"1","dob":"2000-11-05 00:00:00.0","last_name":"test4s"}]


 - [{"alt_party_id_type":"xyz","first_name":"demo56","status":"0","dob":"2000-11-04 00:00:00.0","last_name":"Demo5"}, {"alt_party_id_type":"xyz","first_name":"testsss","status":"1","dob":"1900-01-01 00:00:00.0","last_name":"testssssssssss"}, {"alt_party_id_type":"xyz","first_name":"Demo1234","status":"0","dob":"2014-11-21 00:00:00.0","last_name":"Demo1"}] 


 - [{"alt_party_id_type":"xyz","first_name":"demo2433","status":"1","dob":"2014-11-13 00:00:00.0","last_name":"demo222"}, {"alt_party_id_type":"xyz","first_name":"demo333","status":"0","dob":"2014-11-12 00:00:00.0","last_name":"demo344"}, {"alt_party_id_type":"xyz","first_name":"Student","status":"1","dob":"2001-12-03 00:00:00.0","last_name":"StudentTest"}]

Can someone please help me? I have tried many approaches but have not been able to do this.

1 Answer:

Answer 0 (score: 2)

When processing large input files, you should use a streaming approach rather than loading the entire document into memory: it reduces the memory footprint, avoids OutOfMemoryError, and lets you start processing while the input is still being read. JSONArray has essentially no support for streaming, so I recommend looking at Jackson's streaming API, GSON's streaming API, or something similar.
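
For comparison, the same splitting idea can be expressed directly with Jackson's streaming API. The following is only a minimal sketch, assuming jackson-core is on the classpath; the class name JacksonSplit and the batch size of 10 are illustrative and not part of the original answer.

import java.io.File;
import com.fasterxml.jackson.core.*;

public class JacksonSplit {

    private static final int BATCH_SIZE = 10;

    public static void main(String[] args) throws Exception {
        JsonFactory factory = new JsonFactory();
        int outputIndex = 0;
        int inBatch = 0;
        JsonGenerator generator = null;
        try (JsonParser parser = factory.createParser(new File(args[0]))) {
            if (parser.nextToken() != JsonToken.START_ARRAY) {
                throw new IllegalStateException("Expected start of JSON array");
            }
            // Each iteration positions the parser on the next array element.
            while (parser.nextToken() != JsonToken.END_ARRAY) {
                if (generator == null) {
                    // Start a new output file and open its JSON array.
                    generator = factory.createGenerator(
                            new File("split-" + outputIndex + ".json"),
                            JsonEncoding.UTF8);
                    generator.writeStartArray();
                }
                // Copy the current element (a whole object) to the output.
                generator.copyCurrentStructure(parser);
                if (++inBatch == BATCH_SIZE) {
                    // Close the current output file once the batch is full.
                    generator.writeEndArray();
                    generator.close();
                    generator = null;
                    inBatch = 0;
                    outputIndex++;
                }
            }
            // Flush any remaining elements that did not fill a whole batch.
            if (generator != null) {
                generator.writeEndArray();
                generator.close();
            }
        }
    }
}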

That said, if you insist on using JSONArray, you can piece together a streaming approach with JSONTokener. Here is a sample program that streams an input file and creates separate JSON documents, each containing at most 10 elements.

import java.io.*;
import java.util.*;
import org.json.*;

public class JsonSplit {

    private static final int BATCH_SIZE = 10;

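    // Write one batch of parsed values to a new file named split-<index>.json.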
    public static void flushFile(List<Object> objects, int d) throws Exception {
        try (FileOutputStream output = new FileOutputStream("split-" + d
            + ".json");
                Writer writer = new OutputStreamWriter(output, "UTF-8")) {
            JSONArray jsonArray = new JSONArray(objects);
            jsonArray.write(writer);
        }
    }

    public static void main(String[] args) throws Exception {
        int outputIndex = 0;
        try (InputStream input = new BufferedInputStream(
                new FileInputStream(args[0]))) {
            JSONTokener tokener = new JSONTokener(input);
            if (tokener.nextClean() != '[') {
                throw tokener.syntaxError("Expected start of JSON array");
            }
            List<Object> jsonObjects = new ArrayList<>();
            while (tokener.nextClean() != ']') {
                // Back up one character, it's part of the next value.
                tokener.back();
                // Read the next value in the array.
                jsonObjects.add(tokener.nextValue());
                // Flush if max objects per file has been reached.
                if (jsonObjects.size() == BATCH_SIZE) {
                    flushFile(jsonObjects, outputIndex);
                    jsonObjects.clear();
                    outputIndex++;
                }
                // Read and discard commas between array elements.
                if (tokener.nextClean() != ',') {
                    tokener.back();
                }
            }
            if (!jsonObjects.isEmpty()) {
                flushFile(jsonObjects, outputIndex);
            }
            // Verify that end of input is reached.
            if (tokener.nextClean() != 0) {
                throw tokener.syntaxError("Expected end of document");
            }
        }

    }

}

To understand why a streaming approach is needed for large files, download or create a huge JSON file and then try running a naive implementation that does not stream. Here is a Perl command that creates a JSON array with 1,000,000 elements and a file size of about 16 MB.

perl -le 'print "["; for (1..1_000_000) {print "," unless $_ == 1; print "{\"id\": " . int(rand(1_000_000)) . "}";} print "]"' > input_huge.json

If you run JsonSplit on this input, it chews through it quickly with a small memory footprint, producing 100,000 files of 10 elements each. Moreover, it starts producing output files immediately after startup.
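
For reference, compiling and running it on that file might look like this (assuming a Unix-like shell and the org.json jar on the classpath; the jar file name below is illustrative):

javac -cp json-20140107.jar JsonSplit.java
java -cp .:json-20140107.jar JsonSplit input_huge.json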

Conversely, if you run the following JsonSplitNaive program, which reads the entire JSON document in one go, it will visibly do nothing for a long time and then abort with an OutOfMemoryError.

import java.io.*;
import java.util.*;
import org.json.*;

public class JsonSplitNaive {

    /*
     * Naive version - do not use, will fail with OutOfMemoryError for
     * huge inputs.
     */

    private static final int BATCH_SIZE = 10;

    public static void flushFile(List<Object> objects, int d) throws Exception {
        try (FileOutputStream output = new FileOutputStream("split-" + d
            + ".json");
                Writer writer = new OutputStreamWriter(output, "UTF-8")) {
            JSONArray jsonArray = new JSONArray(objects);
            jsonArray.write(writer);
        }
    }

    public static void main(String[] args) throws Exception {
        int outputIndex = 0;
        try (InputStream input = new BufferedInputStream(
                new FileInputStream(args[0]))) {
            List<Object> jsonObjects = new ArrayList<>();
            JSONArray jsonArray = new JSONArray(new JSONTokener(input));
            for (int i = 0; i < jsonArray.length(); i++) {
                jsonObjects.add(jsonArray.get(i));
                // Flush if max objects per file has been reached.
                if (jsonObjects.size() == BATCH_SIZE) {
                    flushFile(jsonObjects, outputIndex);
                    jsonObjects.clear();
                    outputIndex++;
                }
            }
            if (!jsonObjects.isEmpty()) {
                flushFile(jsonObjects, outputIndex);
            }
        }
    }

}