我想使用Spring Batch程序读取100亿条数据。我实现了分区处理一个大文件。
如何提高性能?
下面是我的配置。
/**
 * Reads {@link RequestDTO} records from the configured flat file, skipping the
 * header row.
 *
 * NOTE(review): this single shared reader re-reads the same file in every
 * partition; the fromId/toId values placed in the ExecutionContext by the
 * partitioner are never consumed here — confirm the intended partitioning
 * strategy (e.g. a @StepScope reader that honors the partition range, or one
 * file per partition).
 *
 * @return the configured reader
 * @throws MalformedURLException if the resource location is invalid
 */
@Bean
public FlatFileItemReader<RequestDTO> reader() throws MalformedURLException {
    FlatFileItemReader<RequestDTO> itemReader = new FlatFileItemReader<>();
    itemReader.setLineMapper(lineMapper());
    itemReader.setResource(new FileSystemResource(fileLocation));
    itemReader.setLinesToSkip(1); // skip the header line
    // FlatFileItemReader is NOT thread-safe. In a step driven by a TaskExecutor
    // (see partitionStep) the reader must not save state: concurrent threads
    // would otherwise race on the read position stored in the ExecutionContext.
    // Trade-off: disabling state makes the step non-restartable from the middle.
    itemReader.setSaveState(false);
    return itemReader;
}
/**
 * Builds the line mapper that converts one delimited line of the input file
 * into a {@link RequestDTO}.
 *
 * @return the configured LineMapper
 */
@Bean
public LineMapper<RequestDTO> lineMapper() {
    // Tokenize each line on the configured separator, keeping columns 0-2.
    DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
    tokenizer.setDelimiter(ProcessorConstants.FILE_SEPERATOR);
    tokenizer.setIncludedFields(0, 1, 2);
    tokenizer.setNames("name", "salary", "company");

    // Bind the named fields onto RequestDTO properties by reflection.
    BeanWrapperFieldSetMapper<RequestDTO> fieldMapper = new BeanWrapperFieldSetMapper<>();
    fieldMapper.setTargetType(RequestDTO.class);

    DefaultLineMapper<RequestDTO> mapper = new DefaultLineMapper<>();
    mapper.setLineTokenizer(tokenizer);
    mapper.setFieldSetMapper(fieldMapper);
    return mapper;
}
/**
 * Registers the {@link CRSItemProcessor} bean used by the slave step.
 *
 * @return a new processor instance
 */
@Bean
public CRSItemProcessor processor() {
    return new CRSItemProcessor();
}
/**
 * Writes processed items through the Spring Data repository.
 *
 * Fixes: the original declared RepositoryItemWriter&lt;AdministrationRequest&gt;
 * while building a RepositoryItemWriterBuilder&lt;CRSAdministrationRequest&gt; —
 * a generic-type mismatch. The return type is now aligned with what
 * slaveStep(RepositoryItemWriter&lt;CRSAdministrationRequest&gt;) expects.
 *
 * Performance: "saveAndFlush" forced a JPA flush after every single item,
 * defeating Hibernate write batching — ruinous at this data volume. "save"
 * lets the persistence context batch inserts per chunk.
 *
 * @param dataSource injected to guarantee the DataSource is initialized first
 *                   (not used directly)
 * @return the configured writer
 */
@Bean
public RepositoryItemWriter<CRSAdministrationRequest> writer(DataSource dataSource) {
    return new RepositoryItemWriterBuilder<CRSAdministrationRequest>()
            .methodName("save")
            .repository(processorBatchDAO)
            .build();
}
/**
 * Thread pool driving the partitioned step.
 *
 * Fixes: the manual afterPropertiesSet() call is removed — Spring invokes
 * InitializingBean callbacks on @Bean instances itself, so the explicit call
 * made initialize() run twice and rebuild the underlying ThreadPoolExecutor.
 * A thread-name prefix is added so partition threads are identifiable in logs
 * and thread dumps.
 *
 * @return the configured executor
 */
@Bean
public TaskExecutor taskExecutor() {
    ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
    taskExecutor.setCorePoolSize(10);
    taskExecutor.setMaxPoolSize(20);
    // The pool only grows past corePoolSize once the queue is FULL, so this
    // small queue is what allows the pool to actually reach maxPoolSize.
    // Total capacity before rejection: maxPoolSize + queueCapacity = 25 tasks,
    // which covers gridSize = 20 partitions.
    taskExecutor.setQueueCapacity(5);
    taskExecutor.setThreadNamePrefix("partition-");
    return taskExecutor;
}
/**
 * Top-level job: a single partitioned step, with a run-id incrementer so the
 * job can be launched repeatedly with fresh job parameters.
 *
 * @param listener completion listener attached to the job
 * @return the configured Job
 */
@Bean(name = "partitionerJob")
public Job partitionerJob(JobCompletionNotificationListener listener)
        throws UnexpectedInputException, MalformedURLException, ParseException {
    return jobBuilderFactory.get("partitionerJob")
            .listener(listener)
            .incrementer(new RunIdIncrementer())
            .start(partitionStep())
            .build();
}
/**
 * Master step: fans the slave step out over 20 partitions, executed in
 * parallel on the task executor.
 *
 * Fixes: the original chained BOTH partitioner(Step) and
 * partitioner(String, Partitioner), relying on the second call quietly
 * overlaying the first. The idiomatic, unambiguous form is
 * partitioner(stepName, partitioner) followed by step(...).
 *
 * @return the configured partitioned step
 */
@Bean
public Step partitionStep() throws UnexpectedInputException, MalformedURLException, ParseException {
    return stepBuilderFactory.get("partitionStep")
            .partitioner("slaveStep", new CustomMultiResourcePartitioner())
            .step(slaveStep(null)) // null is fine: Spring injects the real writer bean
            .gridSize(20)
            .taskExecutor(taskExecutor())
            .build();
}
/**
 * Worker step executed once per partition: reads RequestDTO records in chunks
 * of 1000, processes them, and writes the results via the repository writer.
 *
 * Fixes: the chunk output type was AdministrationRequest while the injected
 * writer handles CRSAdministrationRequest — a generic-type mismatch. The
 * output type now matches the writer parameter.
 *
 * @param writer repository-backed writer injected by Spring
 * @return the configured worker step
 */
@Bean
public Step slaveStep(RepositoryItemWriter<CRSAdministrationRequest> writer)
        throws UnexpectedInputException, MalformedURLException, ParseException {
    return stepBuilderFactory.get("slaveStep")
            .<RequestDTO, CRSAdministrationRequest>chunk(1000)
            .reader(reader())
            .processor(processor())
            .writer(writer)
            .build();
}
下面是自定义分区类。
/**
 * Partitions the workload into contiguous id ranges, one range per partition.
 * Each partition's ExecutionContext carries "fromId", "toId" and a "name" key.
 *
 * Fixes: the original Javadoc claimed this assigned file names to partitions —
 * it never did. The fixed range of 1000 ids per partition is now a constructor
 * parameter (the no-arg constructor keeps 1000 for backward compatibility):
 * with gridSize partitions only gridSize * range ids are covered in total, so
 * for very large inputs the range MUST be sized from the actual row count.
 */
public class CustomMultiResourcePartitioner implements Partitioner {

    /** Number of ids covered by each partition. */
    private final int range;

    /** Keeps the historical default of 1000 ids per partition. */
    public CustomMultiResourcePartitioner() {
        this(1000);
    }

    /**
     * @param range ids per partition; must be positive
     * @throws IllegalArgumentException if {@code range <= 0}
     */
    public CustomMultiResourcePartitioner(int range) {
        if (range <= 0) {
            throw new IllegalArgumentException("range must be positive: " + range);
        }
        this.range = range;
    }

    /**
     * Assign the id range [fromId, toId] of each partition to an
     * {@link ExecutionContext}.
     *
     * @param gridSize number of partitions to create
     * @return one ExecutionContext per partition, keyed "partition1".."partitionN"
     */
    @Override
    public Map<String, ExecutionContext> partition(int gridSize) {
        Map<String, ExecutionContext> result = new HashMap<>(gridSize);
        int fromId = 1;
        int toId = range;
        for (int i = 1; i <= gridSize; i++) {
            ExecutionContext value = new ExecutionContext();
            System.out.println("\nStarting : Thread" + i);
            System.out.println("fromId : " + fromId);
            System.out.println("toId : " + toId);
            value.putInt("fromId", fromId);
            value.putInt("toId", toId);
            // give each thread a name, thread 1,2,3
            value.putString("name", "Thread" + i);
            result.put("partition" + i, value);
            fromId = toId + 1;
            toId += range;
        }
        return result;
    }
}