Spring批处理分区需要大量时间来处理100000条记录

时间:2018-11-16 22:48:07

标签: spring spring-data-jpa spring-batch

我想使用Spring Batch程序读取10万（100,000）条记录。我实现了分区来处理一个大文件。

如何提高性能?

下面是我的配置。

/**
 * Creates the flat-file reader for the input CSV-style file.
 *
 * @return a FlatFileItemReader producing one RequestDTO per data line
 * @throws MalformedURLException declared for API compatibility
 */
@Bean
public FlatFileItemReader<RequestDTO> reader() throws MalformedURLException {
    FlatFileItemReader<RequestDTO> itemReader = new FlatFileItemReader<>();
    // ExecutionContext state saving is not thread-safe, and this single reader
    // instance is shared by every partition of the multi-threaded step — disable it.
    itemReader.setSaveState(false);
    itemReader.setLineMapper(lineMapper());
    itemReader.setResource(new FileSystemResource(fileLocation));
    itemReader.setLinesToSkip(1); // skip the header row
    // NOTE(review): this reader ignores the partitioner's fromId/toId keys, so every
    // partition re-reads the entire file — the main reason partitioning shows no
    // speedup. Consider making this bean @StepScope and binding fromId/toId from the
    // step execution context (e.g. via setCurrentItemCount/setMaxItemCount) so each
    // partition processes only its own slice. TODO confirm against the partitioner.
    return itemReader;
}

/**
 * This is used for mapping values.
 * 
 * @return LineMapper will be returned
 */
/**
 * Builds the line mapper that converts one delimited line into a RequestDTO.
 * Fields 0-2 of each line are bound to the "name", "salary" and "company"
 * properties of the DTO.
 *
 * @return the configured LineMapper
 */
@Bean
public LineMapper<RequestDTO> lineMapper() {
    DelimitedLineTokenizer tokenizer = new DelimitedLineTokenizer();
    tokenizer.setDelimiter(ProcessorConstants.FILE_SEPERATOR);
    tokenizer.setIncludedFields(0, 1, 2);
    tokenizer.setNames("name", "salary", "company");

    BeanWrapperFieldSetMapper<RequestDTO> dtoMapper = new BeanWrapperFieldSetMapper<>();
    dtoMapper.setTargetType(RequestDTO.class);

    DefaultLineMapper<RequestDTO> mapper = new DefaultLineMapper<>();
    mapper.setLineTokenizer(tokenizer);
    mapper.setFieldSetMapper(dtoMapper);
    return mapper;
}

/** Exposes the item processor as a Spring-managed bean. */
@Bean
public CRSItemProcessor processor() {
    CRSItemProcessor itemProcessor = new CRSItemProcessor();
    return itemProcessor;
}
/**
 * Creates the repository-backed item writer.
 *
 * Bug fix: the builder was parameterized with {@code CRSAdministrationRequest}
 * while the method returns {@code RepositoryItemWriter<AdministrationRequest>},
 * which does not compile — the type parameters are now consistent.
 *
 * Performance: "saveAndFlush" forces a JPA flush for every single item, defeating
 * Hibernate write batching; "save" defers the flush to the chunk's transaction
 * commit, which is significantly faster for bulk loads.
 *
 * NOTE(review): the {@code dataSource} parameter is unused; it is kept only for
 * signature compatibility with existing configuration.
 *
 * @param dataSource unused, kept for backward compatibility
 * @return the configured RepositoryItemWriter
 */
@Bean
public RepositoryItemWriter<AdministrationRequest> writer(DataSource dataSource) {
    return new RepositoryItemWriterBuilder<AdministrationRequest>()
            .methodName("save")
            .repository(processorBatchDAO)
            .build();
}




/**
 * Thread pool used to run the partitioned worker steps in parallel.
 *
 * Fix: with corePoolSize=10 and queueCapacity=5 the pool only grows past 10
 * threads once the queue is full, so several of the 20 partitions sit queued
 * instead of running — the core pool is now sized to the grid size (20) so all
 * partitions execute concurrently.
 *
 * @return the configured TaskExecutor
 */
@Bean
public TaskExecutor taskExecutor() {
    ThreadPoolTaskExecutor taskExecutor = new ThreadPoolTaskExecutor();
    taskExecutor.setCorePoolSize(20);  // one thread per partition (gridSize = 20)
    taskExecutor.setMaxPoolSize(20);
    taskExecutor.setQueueCapacity(20); // headroom so submissions are never rejected
    taskExecutor.setThreadNamePrefix("partition-");
    taskExecutor.afterPropertiesSet();
    return taskExecutor;
}


/**
 * Defines the partitioned job: a run-id incrementer so the job can be
 * re-launched, a completion listener, and the master partition step as the
 * single flow element.
 *
 * @param listener notified when the job completes
 * @return the assembled Job
 */
@Bean(name = "partitionerJob")
public Job partitionerJob(JobCompletionNotificationListener listener) throws UnexpectedInputException, MalformedURLException, ParseException {
    Step masterStep = partitionStep();
    return jobBuilderFactory.get("partitionerJob")
            .incrementer(new RunIdIncrementer())
            .listener(listener)
            .start(masterStep)
            .build();
}




/**
 * Master step that fans the work out to "slaveStep" across 20 partitions.
 *
 * Fix: the original chained two conflicting {@code partitioner(...)} calls —
 * {@code .partitioner(slaveStep(null))} followed by
 * {@code .partitioner("slaveStep", ...)}. The canonical form is to register the
 * partitioner first, then the worker step via {@code step(...)}.
 *
 * @return the configured partitioned master Step
 */
@Bean
public Step partitionStep() throws UnexpectedInputException, MalformedURLException, ParseException {
    return stepBuilderFactory.get("partitionStep")
            .partitioner("slaveStep", new CustomMultiResourcePartitioner())
            .step(slaveStep(null)) // proxied @Bean call — Spring injects the real writer
            .gridSize(20)
            .taskExecutor(taskExecutor())
            .build();
}


/**
 * Worker step executed once per partition: reads RequestDTOs in chunks of 1000,
 * processes them, and writes AdministrationRequest entities.
 *
 * Fix: the writer parameter was typed
 * {@code RepositoryItemWriter<CRSAdministrationRequest>}, which is incompatible
 * with the chunk's output type {@code AdministrationRequest} (compile error) and
 * with the {@code writer} bean defined in this configuration — the generic type
 * is now consistent.
 *
 * @param writer injected repository writer for AdministrationRequest entities
 * @return the configured worker Step
 */
@Bean
public Step slaveStep(RepositoryItemWriter<AdministrationRequest> writer) throws UnexpectedInputException, MalformedURLException, ParseException {
  return stepBuilderFactory.get("slaveStep")
    .<RequestDTO, AdministrationRequest>chunk(1000)
    .reader(reader())
    .processor(processor())
    .writer(writer)
    .build();
}

下面是自定义分区类。

public class CustomMultiResourcePartitioner implements Partitioner {

    /** Default number of records each partition covers (historical behaviour). */
    private static final int DEFAULT_RANGE = 1000;

    // Records per partition; configurable so total coverage can match the input size.
    private final int range;

    /** Keeps the original behaviour: 1000 records per partition. */
    public CustomMultiResourcePartitioner() {
        this(DEFAULT_RANGE);
    }

    /**
     * @param range number of records assigned to each partition; must be positive
     * @throws IllegalArgumentException if {@code range <= 0}
     */
    public CustomMultiResourcePartitioner(int range) {
        if (range <= 0) {
            throw new IllegalArgumentException("range must be positive: " + range);
        }
        this.range = range;
    }

    /**
     * Splits the record id space into {@code gridSize} contiguous
     * [fromId, toId] ranges, one ExecutionContext per partition.
     *
     * NOTE(review): with the default range of 1000 and a gridSize of 20 only
     * 20 * 1000 = 20,000 records are ever assigned to a partition — records
     * beyond that are silently left out. Choose range (or gridSize) so that
     * gridSize * range covers the total record count.
     *
     * @see Partitioner#partition(int)
     */
    @Override
    public Map<String, ExecutionContext> partition(int gridSize) {
        Map<String, ExecutionContext> result = new HashMap<>();

        int fromId = 1;
        int toId = range;

        for (int i = 1; i <= gridSize; i++) {
            ExecutionContext value = new ExecutionContext();

            value.putInt("fromId", fromId);
            value.putInt("toId", toId);
            // give each partition's thread a name: Thread1, Thread2, ...
            value.putString("name", "Thread" + i);

            result.put("partition" + i, value);

            fromId = toId + 1;
            toId += range;
        }

        return result;
    }
}

0 个答案:

没有答案