public class DataMiner {
// Shared run state: AWS credentials, the current bucket key and the current PACS id
// are written to and read from this object by the methods below.
private static BigData app = new BigData();
// Configuration source: access/secret keys, bucket names and the backup file name marker.
private static DomainOfConstants doc = new DomainOfConstants();
private static Logger log = Logger.getLogger(DataMiner.class);
// Database helper used by obtainConnection() to load the list of S3 keys.
private static DBManager conn = new DBManager();
// NOTE(review): not referenced by any method visible in this file section.
private static java.sql.Connection con = null;
// Built in downloadBucket() and handed to the conversion step.
private static AmazonS3 s3Client;
// Iterator over `entries`; (re)assigned at the start of downloadBucket().
private static Iterator<String> itr;
// S3 keys to process; replaced by the result of conn.grabDataSet().
private static List<String> entries = new ArrayList<String>();
// The S3 object currently being downloaded.
private static S3Object s3Object;
// Assigned in extractObjects(); not passed to any call in the visible code.
private static ObjectMetadata meta;
// NOTE(review): public and mutable, but not referenced by any method visible here.
public static InputStream dataStream;
// Scratch buffer used when copying zip entry content (1 KB).
public static byte[] buffer = new byte[1024];
// The file name is computed once, at class initialisation, from app.getCurrentPacsId();
// later changes to the PACS id do not affect it. Only file.length() is read in the visible code.
public static File file = new File(app.getCurrentPacsId()+".txt");
/**
 * Loads the list of S3 keys from the database and then processes every key.
 *
 * The database connection is released in a {@code finally} block so that it is
 * not leaked when the query fails.
 */
private static void obtainConnection(){
    conn.connection();
    try {
        entries = conn.grabDataSet();
    } finally {
        // Always give the connection back, even if grabDataSet() throws.
        conn.closeDb();
    }
    downloadBucket();
}
/*
*
* The Java heap size limits for Windows are:
* maximum possible heap size on 32-bit Java: 1.8 GB
* recommended heap size limit on 32-bit Java: 1.5 GB (or 1.8 GB with /3GB option)
*
* */
/*-------------Download and un-zip backup file-------------*/
/**
 * Connects to S3 (region US_EAST_1) and, for every key in {@code entries},
 * downloads the object from the configured bucket, opens it as a zip archive
 * and hands it to {@link #extractObjects}.
 *
 * A failure on one key is logged and processing continues with the next key.
 * The zip stream (and with it the S3 content stream) is always closed.
 */
private static void downloadBucket(){
    try {
        app.setAwsCredentials(doc.getAccessKey(), doc.getSecretKey());
        s3Client = AmazonS3ClientBuilder.standard()
                .withCredentials(new AWSStaticCredentialsProvider(app.getAwsCredentials()))
                .withRegion(Regions.US_EAST_1)
                .build();
        System.out.println("Connected to S3");
        itr = entries.iterator();
        while (itr.hasNext()) {
            app.setBucketKey(itr.next());
            String key = app.getBucketKey();
            app.setCurrentPacsId(key);
            ZipInputStream zis = null;
            try {
                // The download is inside the try so that an S3 error on one key
                // is handled by the catch clauses below instead of aborting the run.
                s3Object = s3Client.getObject(new GetObjectRequest(doc.getDesiredBucket(), key));
                zis = new ZipInputStream(s3Object.getObjectContent());
                ZipEntry entry = zis.getNextEntry();
                extractObjects(buffer, s3Client, zis, entry);
            } catch (AmazonServiceException e) {
                log.error("S3 rejected the request for key " + key, e);
            } catch (SdkClientException e) {
                log.error("S3 client failure for key " + key, e);
            } catch (IOException e) {
                log.error("I/O failure while processing key " + key, e);
            } finally {
                // Closing the zip stream also closes the underlying S3 content stream.
                // Closing an already closed ZipInputStream is a no-op.
                if (zis != null) {
                    try {
                        zis.close();
                    } catch (IOException e) {
                        log.error("Could not close the download stream for key " + key, e);
                    }
                }
            }
        }
        System.out.println("Processing complete");
    } catch (IllegalArgumentException e) {
        log.error("Invalid S3 client configuration", e);
    }
}
public static void extractObjects(byte[] buffer, AmazonS3 s3Client, ZipInputStream zis, ZipEntry entry) throws IOException {
PipedOutputStream outputStream = null;
PipedInputStream is = null;
try {
while (entry != null)
{
String fileName = entry.getName();
if (fileName == "lib") {
fileName = entry.getName();
}
boolean containsBackup = fileName.contains(doc.getDesiredFile());
if (containsBackup == true) {
System.out.println("A back up file was found");
long start = System.currentTimeMillis();
formatSchemaName();
System.out.println("Extracting :" + app.getCurrentPacsId());
log.info("Extracting " + app.getCurrentPacsId() + ",
compressed: " + entry.getCompressedSize() + " bytes,
extracted: " +
entry.getSize() + " bytes");
//ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
outputStream = new PipedOutputStream();
is = new PipedInputStream(outputStream);
int len;
while ((len = zis.read(buffer)) >= 0)
{
outputStream.write(buffer, 0, len);
}
//InputStream is = new ByteArrayInputStream(outputStream.toByteArray());
meta = new ObjectMetadata();
meta.setContentLength(file.length());
fileName = app.getCurrentPacsId();
runDataConversion(is,s3Client,fileName);
recordTime(start);
is.close();
outputStream.close();
System.out.println("Unzip complete");
}
else{
System.out.println("No back up found");
}
entry = zis.getNextEntry();
}
zis.closeEntry();
zis.close();
} catch (AmazonServiceException e) {
log.error(e);
} catch (SdkClientException e) {
log.error(e);
}
}
/*------------Formating the replacment file name---------*/
/**
 * Rewrites the current PACS id: ".zip" becomes ".txt" and everything up to and
 * including the first underscore is removed.
 *
 * An id without an underscore is kept as it is (apart from the extension).
 * This happens, for example, when the method runs a second time on an id it
 * has already rewritten; previously that raised StringIndexOutOfBoundsException.
 */
private static void formatSchemaName(){
    String id = app.getCurrentPacsId().replace(".zip", ".txt");
    int underscore = id.indexOf('_');
    if (underscore >= 0) {
        id = id.substring(underscore + 1);
    }
    app.setCurrentPacsId(id);
}
/*---------------Process the data file----------------------*/
/**
 * Creates a DataProcessor bound to the given S3 client and lets it convert the
 * piped content under the given file name.
 */
private static void runDataConversion(PipedInputStream is, AmazonS3 s3Client, String fileName) {
    new DataProcessor(s3Client).downloadBucket(is, fileName);
}
/*-------Records execution time of program in min/sec------*/
/**
 * Prints the time elapsed since {@code start} as whole minutes and the
 * remaining seconds.
 *
 * @param start start time as returned by System.currentTimeMillis()
 */
private static void recordTime(long start) throws IOException {
    final long elapsedMillis = System.currentTimeMillis() - start;
    final long wholeMinutes = TimeUnit.MILLISECONDS.toMinutes(elapsedMillis);
    final long secondsPart = TimeUnit.MILLISECONDS.toSeconds(elapsedMillis) % 60;
    StringBuilder report = new StringBuilder("Execution speed ");
    report.append(wholeMinutes).append(":").append(secondsPart).append(" min/sec\n");
    System.out.println(report.toString());
}
这是执行文本文件处理的类。处理高达 3.5 GB 的文件时,代码整体非常慢,运行一次需要 3 个小时。我尝试过用管道流代替字节数组流。在 64 位 JDK 上,Java 堆大小设置为 -Xms2800m。
public class DataProcessor {
// S3 client used for the upload. Static, so the value assigned by the constructor
// is shared by every DataProcessor instance.
private static AmazonS3 s3Client;
// Reassigned at the start of each dataConversion() call; not read in the visible code.
private static ObjectMetadata meta;
// Configuration source: destination bucket/key, file encoding, line markers, schema name.
private static DomainOfConstants doc = new DomainOfConstants();
// NOTE(review): this is a BigData instance created here, not the one DataMiner updates.
// Whether getCurrentPacsId() returns the id set by DataMiner depends on how BigData
// stores it; confirm.
private static BigData app = new BigData();
// Local output file of the conversion. The name is computed once, at class
// initialisation, from app.getCurrentPacsId().
public static File file = new File(app.getCurrentPacsId()+".txt");
private static Logger log = Logger.getLogger(DataProcessor.class);
//Construct connection
/**
 * Creates a processor that uploads through the given S3 client.
 *
 * The client is kept in a static field, so it replaces the client used by
 * every other DataProcessor instance as well.
 *
 * @param s3Client client used for the upload
 */
public DataProcessor (AmazonS3 s3Client){
    DataProcessor.s3Client = s3Client;
}
//
/**
 * Converts the piped content into the local output file and uploads that file
 * to the destination bucket. The object key is the directory part of the
 * configured destination key followed by "Modified_" and {@code fileName}.
 *
 * S3 failures are printed and logged; they are not rethrown.
 *
 * @param is       content to convert
 * @param fileName name used for the uploaded object
 */
public void downloadBucket(PipedInputStream is, String fileName) {
    try {
        final File converted = dataConversion(is);
        final String bucket = doc.getDestinationBucket();
        final String targetKey = FilenameUtils.getFullPath(doc.getDestinationKey()) + "Modified_" + fileName;
        s3Client.putObject(bucket, targetKey, converted);
    } catch (AmazonServiceException serviceError) {
        serviceError.printStackTrace();
        log.error(serviceError);
    } catch (SdkClientException clientError) {
        clientError.printStackTrace();
        log.error(clientError);
    }
}
//Setup reading and writing streams
/**
 * Reads the stream line by line in the configured encoding, passes every line
 * to processLine and writes the result to the local output file.
 *
 * I/O failures are logged, not rethrown.
 * NOTE(review): the file is returned even after a failure, so the caller may
 * upload a truncated file; confirm whether that is acceptable.
 *
 * @param stream content to convert; closed before this method returns
 * @return the local output file
 */
public static File dataConversion(PipedInputStream stream) {
    BufferedReader reader = null;
    BufferedOutputStream streamOut = null;
    try {
        reader = new BufferedReader(new InputStreamReader(stream, doc.getFileFormat()));
        streamOut = new BufferedOutputStream(new FileOutputStream(file));
        meta = new ObjectMetadata();
        String line;
        while ((line = reader.readLine()) != null) {
            processLine(reader, streamOut, line);
        }
    } catch (IOException e) {
        log.error("Conversion into " + file + " failed", e);
    } finally {
        // Each stream is closed independently and only if it was opened, so a
        // failed open or a failed close cannot prevent the other from closing.
        if (streamOut != null) {
            try {
                streamOut.close();
            } catch (IOException e) {
                log.error("Could not close " + file, e);
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                log.error("Could not close the input stream", e);
            }
        }
    }
    return file;
}
/*---------------------------------------Data processing------------------------------------------------*/
/*-----------Process and print lines---------*/
/**
 * Writes one input line to the output.
 *
 * Lines containing the insert marker go to handleData, lines containing the
 * use marker go to handleSchemaName, every other line is written in lower case
 * followed by the platform line separator.
 *
 * Exactly one line is handled per call; reading the next line is the caller's
 * job. The earlier {@code reader.ready()} loop skipped the current line
 * whenever the reader had no buffered input, which always lost the last line.
 *
 * @param reader    not used; kept so existing callers compile unchanged
 * @param streamOut destination of the converted text
 * @param line      the line to convert, without its line terminator
 */
private static void processLine(BufferedReader reader, BufferedOutputStream streamOut, String line) {
    try {
        if (line.contains(doc.getInsert())) {
            handleData(streamOut, line);
        } else if (line.contains(doc.getUse())) {
            handleSchemaName(streamOut, line);
        } else {
            Charset charset = Charset.forName(doc.getFileFormat());
            // Locale.ROOT keeps the result independent of the JVM's default locale.
            streamOut.write(line.toLowerCase(java.util.Locale.ROOT).getBytes(charset));
            // The separator is encoded like the rest of the file, not in the platform charset.
            streamOut.write(System.getProperty("line.separator").getBytes(charset));
        }
    } catch (IOException e) {
        log.error("Could not write a converted line", e);
    }
}
/*-----------Replace-Schema-Name-----------*/
/**
 * Writes a USE statement naming the configured schema name followed by the
 * current PACS id, in place of the given line.
 *
 * The statement is built directly; replacing a string with itself always
 * yields the replacement, so the content of {@code line} never influenced the
 * output.
 * NOTE(review): no line terminator is written after the statement, so the next
 * output line starts directly after the semicolon; confirm this is intended.
 *
 * @param streamOut destination of the statement
 * @param line      the original USE line; its content is not used
 * @throws IOException if the write fails
 */
private static void handleSchemaName(BufferedOutputStream streamOut, String line) throws IOException {
    String useStatement = "USE " + "`" + doc.getSchemaName() + app.getCurrentPacsId() + "`;";
    streamOut.write(useStatement.getBytes(Charset.forName(doc.getFileFormat())));
}
/*--------Avoid-Formating-Data-Portion-of-file--------*/
/**
 * Writes an insert line with its statement head in lower case and its data
 * portion untouched.
 *
 * The head ends with the first whitespace-delimited occurrence of the
 * configured value marker. Everything after it is copied verbatim, including
 * its original whitespace. If the marker is not found the line is written
 * unchanged, because the start of the data cannot be determined.
 *
 * The line is written once. The earlier token loop wrote the whole lower-cased
 * line once per token, dropped the last data token and failed when the marker
 * was the final token.
 * NOTE(review): as before, a single space rather than a line terminator is
 * written after the line; confirm this is intended.
 *
 * @param streamOut destination of the converted line
 * @param line      the insert line to convert
 * @throws IOException if the write fails
 */
private static void handleData(BufferedOutputStream streamOut, String line) throws IOException {
    Charset charset = Charset.forName(doc.getFileFormat());
    int headEnd = endOfDelimitedToken(line, doc.getValue());
    if (headEnd < 0) {
        streamOut.write(line.getBytes(charset));
    } else {
        streamOut.write(line.substring(0, headEnd).toLowerCase(java.util.Locale.ROOT).getBytes(charset));
        streamOut.write(line.substring(headEnd).getBytes(charset));
    }
    streamOut.write(" ".getBytes(charset));
}

/**
 * Returns the index just past the first occurrence of {@code token} that is
 * bounded by whitespace or by the ends of {@code line}, or -1 if there is none.
 */
private static int endOfDelimitedToken(String line, String token) {
    if (token == null || token.isEmpty()) {
        return -1;
    }
    int from = 0;
    while (true) {
        int at = line.indexOf(token, from);
        if (at < 0) {
            return -1;
        }
        int end = at + token.length();
        boolean startsToken = at == 0 || Character.isWhitespace(line.charAt(at - 1));
        boolean endsToken = end == line.length() || Character.isWhitespace(line.charAt(end));
        if (startsToken && endsToken) {
            return end;
        }
        from = at + 1;
    }
}
答案 0 :(得分:2)
去掉 ready() 测试。这是一次额外的系统调用。只需一直读取到流结束即可。
使用 BufferedWriter 代替 BufferedOutputStream,并停止将所有字符串转换为字节(并使用 BufferedWriter.newLine() 代替系统属性)。