如何在阅读和写作方面加快速度?

时间:2017-07-19 15:44:39

标签: java performance stream text-processing bigdata

public class DataMiner  {

// Shared application state. NOTE(review): everything here is static and
// mutable class-wide state, so DataMiner cannot safely run concurrently.
private static BigData app = new BigData();
private static DomainOfConstants doc = new DomainOfConstants();
private static Logger log = Logger.getLogger(DataMiner.class);
private static DBManager conn = new DBManager();
private static java.sql.Connection con = null;
private static AmazonS3  s3Client;
private static Iterator<String> itr;
private static List<String> entries = new ArrayList<String>();
private static S3Object s3Object;
private static ObjectMetadata meta;
public static InputStream dataStream;
// NOTE(review): a 1 KB copy buffer is very small for multi-GB archives;
// 32-64 KB would cut syscall overhead dramatically.
public static byte[] buffer = new byte[1024];
// NOTE(review): the file name is fixed at class-load time from whatever
// PACS id is current then, even though the id is reassigned per entry later.
public static File file = new File(app.getCurrentPacsId()+".txt");



// Loads the list of S3 keys to process from the database, then drives the
// whole download/extract pipeline. NOTE(review): despite the name, this is
// effectively the program's main workflow entry point.
private static void obtainConnection(){
    conn.connection();
    entries = conn.grabDataSet();       // S3 object keys to process
    conn.closeDb();                     // DB is not needed once the key list is in memory
    downloadBucket();
}

/*
 * 
 * The Java heap size limits for Windows are:
 * maximum possible heap size on 32-bit Java: 1.8 GB
 * recommended heap size limit on 32-bit Java: 1.5 GB (or 1.8 GB with /3GB option)
 * 
 * */
/*-------------Download and un-zip backup file-------------*/
/**
 * Connects to S3 and, for each key loaded into {@code entries}, downloads the
 * object and extracts any backup file found inside the zip stream.
 *
 * Fixes over the original: the ZipInputStream was leaked when extraction
 * threw (now try-with-resources); three identical catch arms are collapsed
 * into one multi-catch; the outer IllegalArgumentException used
 * printStackTrace() instead of the logger.
 */
private static void downloadBucket(){

    try {
        app.setAwsCredentials(doc.getAccessKey(), doc.getSecretKey());
        s3Client = AmazonS3ClientBuilder.standard()
                .withCredentials(new AWSStaticCredentialsProvider(app.getAwsCredentials()))
                .withRegion(Regions.US_EAST_1)
                .build();
        System.out.println("Connected to S3");
        for (String key : entries) {
            app.setBucketKey(key);
            app.setCurrentPacsId(key);
            s3Object = s3Client.getObject(new GetObjectRequest(doc.getDesiredBucket(), app.getBucketKey()));
            // try-with-resources guarantees the zip stream (and the underlying
            // S3 object content) is closed even if extraction fails.
            try (ZipInputStream zis = new ZipInputStream(s3Object.getObjectContent())) {
                ZipEntry entry = zis.getNextEntry();
                extractObjects(buffer, s3Client, zis, entry);
            } catch (AmazonServiceException | SdkClientException | IOException e) {
                log.error("Failed to process " + key, e);
            }
        }
        System.out.println("Processing complete");
    } catch (IllegalArgumentException e) {
        log.error(e);
    }
}

public static void extractObjects(byte[] buffer, AmazonS3 s3Client, ZipInputStream zis, ZipEntry entry) throws IOException {
    PipedOutputStream outputStream = null;
    PipedInputStream is = null;
    try {
        while (entry != null) 
        {
            String fileName = entry.getName();
            if (fileName == "lib") {
                fileName = entry.getName();
            }
            boolean containsBackup = fileName.contains(doc.getDesiredFile());

            if (containsBackup == true) {
                System.out.println("A back up file was found");
                long start = System.currentTimeMillis();
                formatSchemaName();
                System.out.println("Extracting :" + app.getCurrentPacsId());
                log.info("Extracting " + app.getCurrentPacsId() + ", 
                compressed: " + entry.getCompressedSize() + " bytes, 
                extracted: " + 
                entry.getSize() + " bytes");
         //ByteArrayOutputStream outputStream = new ByteArrayOutputStream();


                outputStream = new PipedOutputStream();
                is = new PipedInputStream(outputStream);

                int len;
                while ((len = zis.read(buffer)) >= 0) 
                {
                    outputStream.write(buffer, 0, len);
                }
   //InputStream is = new ByteArrayInputStream(outputStream.toByteArray());
                meta = new ObjectMetadata();
                meta.setContentLength(file.length());
                fileName = app.getCurrentPacsId();
                runDataConversion(is,s3Client,fileName);
                recordTime(start);
                is.close();
                outputStream.close();
                System.out.println("Unzip complete");               
            }
            else{
                System.out.println("No back up found");
            }
            entry = zis.getNextEntry();
        }
        zis.closeEntry();
        zis.close();
    } catch (AmazonServiceException e) {
        log.error(e);
    } catch (SdkClientException e) {
        log.error(e);
    }
}


/*------------Formatting the replacement file name---------*/
/**
 * Rewrites the current PACS id from its zip key form to the output .txt name:
 * swaps the ".zip" suffix for ".txt" and drops everything up to and including
 * the first underscore.
 *
 * Fix: the original called substring(indexOf("_")) unconditionally, which
 * throws StringIndexOutOfBoundsException for keys containing no underscore.
 */
private static void formatSchemaName(){
    String s3Key = app.getCurrentPacsId();
    String id = s3Key.replace(".zip", ".txt");
    int underscore = id.indexOf('_');
    if (underscore >= 0) {
        // substring(underscore + 1) == substring(underscore).replaceFirst("_", "")
        id = id.substring(underscore + 1);
    }
    app.setCurrentPacsId(id);
}

/*---------------Process the data file----------------------*/
// Hands the decompressed stream to DataProcessor, which converts it line by
// line and re-uploads the result under the given file name.
private static void runDataConversion(PipedInputStream is, AmazonS3 s3Client, String fileName) {
    new DataProcessor(s3Client).downloadBucket(is, fileName);
}

/*-------Records execution time of program in min/sec------*/
// Prints the wall-clock time elapsed since `start` as "minutes:seconds".
// (The IOException in the signature is never thrown here; kept for callers.)
private static void recordTime(long start) throws IOException {
    final long elapsedMs = System.currentTimeMillis() - start;
    final long minutes = TimeUnit.MILLISECONDS.toMinutes(elapsedMs);
    final long seconds = TimeUnit.MILLISECONDS.toSeconds(elapsedMs);
    System.out.println("Execution speed "+ minutes + ":" + (seconds % 60) +" min/sec\n");
}

这是进行一些文本文件处理的类。当处理高达 3.5 GB 的文件时,代码总体上非常慢,运行一次需要 3 个小时。我尝试在字节流上使用管道流,并在 64 位 JDK 上将 Java 堆大小设置为 -Xms2800M。

public class DataProcessor {

// Shared static state. NOTE(review): because these are class-wide, two
// DataProcessor instances share one S3 client and metadata object — confirm
// single-instance, single-threaded use.
private static AmazonS3 s3Client;
private static ObjectMetadata meta;
private static DomainOfConstants doc = new DomainOfConstants();
private static BigData app = new BigData();
// NOTE(review): output file named from the PACS id current at class-load
// time; duplicates the same field in DataMiner.
public static File file = new File(app.getCurrentPacsId()+".txt");
private static Logger log = Logger.getLogger(DataProcessor.class);

//Construct connection
// NOTE(review): s3Client is declared static, so `this.s3Client = ...` writes
// a class-wide field — every new DataProcessor clobbers the shared client.
// Left as-is because sibling methods read the static field; confirm the
// single-instance assumption before making it an instance field.
public DataProcessor (AmazonS3 s3Client){
    this.s3Client = s3Client;
}

/**
 * Converts the incoming decompressed stream to the output file and uploads it
 * to the destination bucket as "Modified_&lt;fileName&gt;".
 *
 * Fixes over the original: the two identical catch arms are collapsed into a
 * multi-catch, and the redundant printStackTrace() (which duplicated the
 * logger output) is removed.
 */
public void downloadBucket(PipedInputStream is, String fileName) {
    try {
        File converted = dataConversion(is);
        s3Client.putObject(doc.getDestinationBucket(),
                FilenameUtils.getFullPath(doc.getDestinationKey()) + "Modified_"+ fileName,
                converted);
    } catch (AmazonServiceException | SdkClientException e) {
        log.error("Upload of " + fileName + " failed", e);
    }
}

//Setup reading and writing streams
/**
 * Reads the piped stream line by line, running each line through
 * processLine(), and writes the transformed text to {@code file}.
 *
 * Fix: the original's finally block called streamOut.close()/reader.close()
 * without null checks — if constructing either stream threw, the finally
 * block itself raised a NullPointerException. try-with-resources closes both
 * streams safely in all paths.
 *
 * @param stream decompressed backup content produced by the extract stage
 * @return the (static) output file, even if conversion failed partway
 */
public static File dataConversion(PipedInputStream stream) {
    try (BufferedReader reader = new BufferedReader(
                 new InputStreamReader(stream, doc.getFileFormat()));
         BufferedOutputStream streamOut = new BufferedOutputStream(new FileOutputStream(file))) {
        meta = new ObjectMetadata(); // NOTE(review): assigned but never read here — confirm needed
        String line;
        while ((line = reader.readLine()) != null) {
            processLine(reader, streamOut, line);
        }
    } catch (IOException e) {
        log.error(e);
    }
    return file;
}


/*---------------------------------------Data processing------------------------------------------------*/

    /*-----------Process and print lines---------*/
/*-----------Process and print lines---------*/
/**
 * Transforms a single line: INSERT lines go to handleData, USE lines to
 * handleSchemaName, everything else is lower-cased and written out.
 *
 * Fixes over the original:
 * 1. Dropped the inner `while (reader.ready())` read loop. The caller's
 *    readLine() loop already drives iteration; ready() issued an extra
 *    availability check per line and, on a pipe whose writer is momentarily
 *    behind, could return false and silently stop processing early.
 * 2. Removed the separate UnsupportedEncodingException catch — it is a
 *    subclass of IOException, so the second arm covered it already.
 * 3. Removed printStackTrace(), which duplicated the logger output.
 *
 * @param reader   kept for signature compatibility; no longer read here
 * @param streamOut destination for the transformed bytes
 * @param line     the line to transform
 */
private static void processLine(BufferedReader reader, BufferedOutputStream streamOut, String line) {
    try {
        String newLine = System.getProperty("line.separator");

        if (line.contains(doc.getInsert())) {
            handleData(streamOut, line);
        } else if (line.contains(doc.getUse())) {
            handleSchemaName(streamOut, line);
        } else {
            streamOut.write(line.toLowerCase().getBytes(Charset.forName(doc.getFileFormat()).toString()));
            streamOut.write(newLine.getBytes());
        }
    } catch (IOException e) {
        log.error(e);
    }
}

    /*-----------Replace-Schema-Name-----------*/
/**
 * Replaces a USE statement with one targeting the schema for the current
 * PACS id. Fix: the original used line.replace(line, x) — replacing the
 * whole string with the new value — which is just an assignment in disguise.
 *
 * NOTE(review): no line separator is written after the statement, matching
 * the original behavior — confirm that is intended.
 */
private static void handleSchemaName(BufferedOutputStream streamOut, String line) throws IOException {
    String replaced = "USE " + "`" + doc.getSchemaName() + app.getCurrentPacsId() + "`;";
    streamOut.write(replaced.getBytes(Charset.forName(doc.getFileFormat())));
}


    /*--------Avoid-Formating-Data-Portion-of-file--------*/
// Intended purpose (per the header): lower-case the statement keywords of an
// INSERT line while copying the VALUES payload through unchanged.
// NOTE(review): the control flow does not obviously match that intent —
// the unconditional writes at the bottom run once PER TOKEN, so the whole
// lower-cased line appears to be emitted repeatedly, and in the inner loop
// the final token is read but never written (hasMoreTokens is checked before
// `data` is flushed). Left byte-identical; verify against expected output
// before changing.
private static void handleData(BufferedOutputStream streamOut, String line) throws IOException {
    StringTokenizer tk = new StringTokenizer(line);
    while (tk.hasMoreTokens()) {
        String data = tk.nextToken();
        if (data.equals(doc.getValue())) {
            // Token matched the configured VALUES marker: lower-case it,
            // then pass the remaining tokens through untouched.
            streamOut.write(data.toLowerCase().getBytes(Charset.forName(doc.getFileFormat()).toString()));
            data = tk.nextToken();
            while (tk.hasMoreTokens()) {
                streamOut.write(data.getBytes(Charset.forName(doc.getFileFormat())));
                data = tk.nextToken();
            }
        }
        // NOTE(review): getFileFormat().toString() here vs
        // Charset.forName(...).toString() above — both end up as a charset
        // name, but the inconsistent parenthesization looks accidental.
        streamOut.write(line.toLowerCase().getBytes(Charset.forName(doc.getFileFormat().toString())));
        streamOut.write(" ".getBytes(Charset.forName(doc.getFileFormat())));
    }
}

1 个答案:

答案 0 :(得分:2)

  1. 规则1:始终使用更大的缓冲区。1024 字节太小了,试试 32–64K。
  2. 您需要在对管道进行任何写入之前启动管道读取线程。事实上,我很惊讶你没有得到'read end dead'错误。这段代码真的有用吗?
  3. 实际上摆脱了管道流。使用单个线程并随时进行所有处理。
  4. 摆脱 ready() 测试。这是一个额外的系统调用。只需持续读取,直到流结束。
  5. 使用BufferedWriter代替BufferedOutputStream并停止将所有字符串转换为字节(并使用BufferedWriter.newLine()代替系统属性)。