Spring批处理分区:连续运行从站(不是并行)

时间:2014-12-16 16:49:01

标签: java spring parallel-processing spring-batch

我使用 Spring Batch 来执行一些计算。在 reader 中我必须读取将要在 processor/writer 中处理的大批数据,这一过程会占用大量内存(RAM)。所以我尝试用如下的分区器(partitioner)来拆分步骤:

<!-- Master step: delegates to the worker step "MyStep" via a partitioner.
     FIX: <partition>/<handler> must carry the batch: prefix — the batch
     namespace is declared as xmlns:batch, so unprefixed elements fall into
     the default beans namespace and fail schema validation. -->
<batch:step id="MyStep.master" >
    <batch:partition step="MyStep" partitioner="MyPartitioner">
        <batch:handler grid-size="1" task-executor="TaskExecutor" />
    </batch:partition>
</batch:step>

<!-- Worker step executed once per partition -->
<batch:step id="MyStep" >
    <batch:tasklet transaction-manager="transactionManager">
        <batch:chunk reader="MyReader" processor="MyProcessor"
            writer="MyWriter" commit-interval="1000" skip-limit="1000">
            <batch:skippable-exception-classes>
                <batch:include class="...FunctionalException" />
            </batch:skippable-exception-classes>
        </batch:chunk>
    </batch:tasklet>
</batch:step>

<bean id="MyPartitioner" class="...MyPartitioner" scope="step"/>

<!-- FIX: the original start tag was never closed (not well-formed XML).
     NOTE(review): a default ThreadPoolTaskExecutor runs partitions in
     parallel; use a SyncTaskExecutor (or corePoolSize/maxPoolSize = 1)
     to run them sequentially. -->
<bean id="TaskExecutor" class="org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor" />

<!-- Cursor-based reader: streams rows one at a time instead of loading the
     whole result set. step scope is required so each partition gets its own
     reader instance. -->
<bean name="MyReader"
    class="org.springframework.batch.item.database.JdbcCursorItemReader"
    scope="step">
    <property name="dataSource" ref="dataSource" />
    <property name="sql">
        <value>
            <![CDATA[
                SELECT...                   
            ]]>
        </value>
    </property>
    <property name="rowMapper" ref="MyRowMapper" />
</bean>

<!-- Maps each result-set row to a domain object for the reader above -->
<bean id="MyRowMapper" class="...MyRowMapper" />

<!-- c3p0 connection pool; ${...} placeholders are resolved from the
     environment/property files at context startup -->
<bean id="dataSource" class="com.mchange.v2.c3p0.ComboPooledDataSource" destroy-method="close">
    <property name="driverClass" value="org.postgresql.Driver"/>
    <property name="jdbcUrl" value="jdbc:postgresql://${database.host}/${database.name}"/>
    <property name="user" value="${database.user}"/>
    <property name="password" value="${database.password}"/>        
    <property name="acquireIncrement" value="1" />
    <property name="autoCommitOnClose" value="true" />
    <property name="minPoolSize" value="${min.pool.size}" /> <!-- min.pool.size=5  -->
    <property name="maxPoolSize" value="${max.pool.size}" /> <!-- max.pool.size=15  -->
</bean>

但这样分区之后仍然占用大量内存,因为各个步骤(从步骤)是并行执行的。我想要做的是:拆分步骤后让各个分区依次(串行而非并行)执行,以减少内存(RAM)的使用,这可能吗?

1 个答案:

答案 0 :(得分:0)

问题有点老,所以我不确定这现在是否有用,可能你自己解决了。

如果您不在意行的处理顺序,解决方案是:在分区器 bean 中先查询数据库,然后把切分表所需的键区间(start_key, end_key)传递给每个分区。这样每个分区只读取自己那一段数据,可以大幅减少内存(RAM)的使用。

一些警告:

  1. 请注意,分区器bean和阅读器使用的查询必须具有相同的“order by”
  2. 分区读者必须是scope =“step”
  3. 如果您需要按照精确的顺序处理行,请不要使用此方法
  4. 为了调优内存(RAM)占用,请尝试 gridSize 与 taskExecutor 的 maxPoolSize 的不同组合(这也是我的 gridSize 取自 jobParameters 的原因)
  5. 这是一个例子:

    XML配置:

    <?xml version="1.0" encoding="UTF-8"?>
    <beans xmlns="http://www.springframework.org/schema/beans"
        xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:batch="http://www.springframework.org/schema/batch"
        xmlns:context="http://www.springframework.org/schema/context"
        xmlns:aop="http://www.springframework.org/schema/aop" xmlns:util="http://www.springframework.org/schema/util"
        xsi:schemaLocation="http://www.springframework.org/schema/aop http://www.springframework.org/schema/aop/spring-aop-4.0.xsd
            http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-4.0.xsd
            http://www.springframework.org/schema/util http://www.springframework.org/schema/util/spring-util-4.0.xsd
            http://www.springframework.org/schema/batch http://www.springframework.org/schema/batch/spring-batch-3.0.xsd
            http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-4.0.xsd">
    
        <!-- JOB -->
        <batch:job id="printPdf" job-repository="jobRepository"
            restartable="false">
    
            <batch:step id="MyStep">
                <batch:partition step="MyStep.template"
                    partitioner="myPartitioner" handler="partitionHandler">
                </batch:partition>
            </batch:step>
    
        </batch:job>
    
        <!-- Partitioner: queries the table once and hands each partition a
             [START_ID, END_ID] key range via its step execution context -->
        <bean id="myPartitioner" class="foo.MyPartitioner"
            scope="step">
            <property name="jdbcTemplate" ref="myJdbcTemplate" />
            <property name="sql"
                value="Select ...." />
            <property name="rowMap">
                <bean
                    class="foo.MyPartitionHandlerRowMapper" />
            </property>
            <property name="preparedStatementSetter">
                <bean
                    class="org.springframework.batch.core.resource.ListPreparedStatementSetter">
                    <property name="parameters">
                        <list>
                            <value>#{jobParameters['param1']}</value>
                        </list>
                    </property>
                </bean>
            </property>
        </bean>
    
        <!-- gridSize comes from job parameters so memory usage can be tuned
             per run without touching the configuration -->
        <bean id="partitionHandler" scope="step"
            class="org.springframework.batch.core.partition.support.TaskExecutorPartitionHandler">
            <property name="taskExecutor" ref="customTaskExecutor" />
            <property name="gridSize" value="#{jobParameters['gridSize']}" />
            <property name="step" ref="MyStep.template" />
        </bean>
    
        <bean id="customTaskExecutor"
            class="org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor">
            <property name="corePoolSize" value="8" />
            <property name="maxPoolSize" value="8" />
            <property name="waitForTasksToCompleteOnShutdown" value="true" />
            <property name="awaitTerminationSeconds" value="120" />
        </bean>
    
        <!-- FIX: id was misspelled "MyStep.tempate", which never matched the
             "MyStep.template" references in the partition element and the
             partitionHandler bean above -->
        <batch:step id="MyStep.template">
            <batch:tasklet transaction-manager="transactionManager">
                <batch:chunk commit-interval="2500" reader="myReader"
                    processor="myProcessor" writer="myWriter" skip-limit="2500">
                <batch:skippable-exception-classes>
                    <batch:include class="...FunctionalException" />
                </batch:skippable-exception-classes>
                </batch:chunk>
            </batch:tasklet>
        </batch:step>
    
        <!-- Beans -->
    
        <!-- Processors -->
        <bean id="myProcessor" class="foo.MyProcessor"
            scope="step">
        </bean>
    
        <bean id="classLoaderVerifier"
            class="it.addvalue.pkjwd.services.genbean.GenericStockKeysForNoDuplicate" />
    
        <!-- Readers: step scope is mandatory so the #{stepExecutionContext[...]}
             expressions resolve against each partition's own context -->
        <bean id="myReader"
            class="org.springframework.batch.item.database.JdbcCursorItemReader"
            scope="step">
            <property name="dataSource" ref="myDataSouce" />
            <!-- FIX: a literal '<' is illegal inside an attribute value; the
                 comparison operators must be escaped for well-formed XML -->
            <property name="sql"
                value="select ... from ... where ID &gt;= ? and ID &lt;= ?" />
            <property name="rowMapper">
                <bean class="foo.MyReaderPartitionedRowMapper" />
            </property>
            <property name="preparedStatementSetter">
                <bean
                    class="org.springframework.batch.core.resource.ListPreparedStatementSetter">
                    <property name="parameters">
                        <list>
                            <value>#{stepExecutionContext['START_ID']}</value>
                            <value>#{stepExecutionContext['END_ID']}</value>
                        </list>
                    </property>
                </bean>
            </property>
        </bean>
    
        <!-- Writers -->
        <bean id="myWriter"
            class="org.springframework.batch.item.database.JdbcBatchItemWriter">
            <property name="assertUpdates" value="false" />
            <property name="itemPreparedStatementSetter">
                <bean class="foo.MyWriterStatementSetters" />
            </property>
            <property name="sql"
                value="insert ..." />
            <property name="dataSource" ref="myDataSouce" />
        </bean>
    
    </beans>
    

    您的Partitioner Bean将如下所示:

    package foo;
    
    import foo.model.MyTable;
    
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    
    import org.apache.commons.lang.StringUtils;
    import org.springframework.batch.core.partition.support.Partitioner;
    import org.springframework.batch.item.ExecutionContext;
    import org.springframework.jdbc.core.JdbcTemplate;
    import org.springframework.jdbc.core.PreparedStatementSetter;
    import org.springframework.jdbc.core.RowMapper;
    
    /**
     * Partitioner that runs one up-front query and hands every partition a
     * [START_ID, END_ID] key range through its ExecutionContext, so each
     * slave reader streams only its own slice of the table.
     */
    public class MyPartitioner implements Partitioner
    {
        private JdbcTemplate jdbcTemplate;
    
        // FIX: field, getter and setter now agree on a single row type.
        // The original declared the field as RowMapper<foo.model.MyTable>
        // but the setter took RowMapper<PkjwdPolizzePartition>, which does
        // not compile.
        private RowMapper<MyTable> rowMap;
    
        private String sql;
    
        private PreparedStatementSetter preparedStatementSetter;
    
        public JdbcTemplate getJdbcTemplate()
        {
            return jdbcTemplate;
        }
    
        public void setJdbcTemplate(JdbcTemplate jdbcTemplate)
        {
            this.jdbcTemplate = jdbcTemplate;
        }
    
        public RowMapper<MyTable> getRowMap()
        {
            return rowMap;
        }
    
        public void setRowMap(RowMapper<MyTable> rowMap)
        {
            this.rowMap = rowMap;
        }
    
        public String getSql()
        {
            return sql;
        }
    
        public void setSql(String sql)
        {
            this.sql = sql;
        }
    
        public PreparedStatementSetter getPreparedStatementSetter()
        {
            return preparedStatementSetter;
        }
    
        public void setPreparedStatementSetter(PreparedStatementSetter preparedStatementSetter)
        {
            this.preparedStatementSetter = preparedStatementSetter;
        }
    
        /**
         * Splits the ordered key list into at most {@code gridSize} contiguous
         * ranges. Leftover rows (total % gridSize) are spread over the first
         * partitions instead of being lumped into the last one.
         *
         * FIX (1): the original loop never terminated when gridSize exceeded
         * the row count (rowsPerPartition == 0 meant endPos never advanced).
         * FIX (2): exceptions were swallowed with printStackTrace and an empty
         * map was returned, silently launching zero partitions; data-access
         * failures (unchecked DataAccessException) now propagate to fail the job.
         */
        @Override
        public Map<String, ExecutionContext> partition(int gridSize)
        {
            Map<String, ExecutionContext> map = new HashMap<String, ExecutionContext>();
    
            List<MyTable> rows = jdbcTemplate.query(sql, preparedStatementSetter, rowMap);
    
            int total = rows.size();
            if (total == 0 || gridSize <= 0)
            {
                return map; // nothing to partition
            }
    
            int base = total / gridSize;          // rows every partition gets
            int remainder = total % gridSize;     // first 'remainder' partitions get one extra
            int padLength = ("" + gridSize).length();
    
            int startPos = 0;
            for (int i = 0; i < gridSize && startPos < total; i++)
            {
                int size = base + (i < remainder ? 1 : 0);
                int endPos = startPos + size - 1;
    
                ExecutionContext context = new ExecutionContext();
                context.put("START_ID", rows.get(startPos).getId());
                context.put("END_ID", rows.get(endPos).getId());
    
                // zero-pad the index so partition names sort lexicographically
                map.put("PART_" + StringUtils.leftPad("" + i, padLength, '0'), context);
    
                startPos = endPos + 1;
            }
    
            return map;
        }
    
    }