在Spring XD中自动重试失败的作业

时间:2015-01-05 09:08:02

标签: spring-batch spring-xd

我正在寻找一种标准模式,用于在Spring XD中按配置的次数、并在指定的延迟之后自动重试失败的作业。具体来说,我有一个由cron流定期触发的HTTP项目阅读器作业。有时我们会看到HTTP项目阅读器由于网络抖动而失败,因此我们希望作业能够自动重试。

我尝试使用JobExecutionListener来捕获作业失败的情况,但棘手的部分在于如何真正重试失败的作业。我可以通过向XD管理控制器发送HTTP PUT请求来实现(例如 http://xd-server:9393/jobs/executions/2?restart=true),这样可以成功重试该作业。但是,我希望能够:

  • 在重试前指定延迟
  • 在XD中留下某种审计记录,以表明作业将在X秒后重试。

添加延迟可以在JobExecutionListener中完成,但这需要启动一个带延迟的线程,而该线程实际上无法从XD容器中追踪,因此很难看出某个作业是否即将被重试。

看起来需要一个专门用于延迟重试作业的作业定义,才能在XD容器中留下可追踪的记录。

有人可以为此建议一种模式吗?

1 个答案:

答案 0 :(得分:0)

以下是我最终采用的解决方案:

创建了一个作业执行监听器

public class RestartableBatchJobExecutionListener extends JobExecutionListener {

    private Logger logger = LoggerFactory.getLogger(this.getClass());

    public final static String JOB_RESTARTER_NAME = "jobRestarter";

    /**
     * A list of valid exceptions that are permissible to restart the job on
     */
    private List<Class<Throwable>> exceptionsToRestartOn = new ArrayList<Class<Throwable>>();

    /**
     * The maximum number of times the job can be re-launched before failing
     */
    private int maxRestartAttempts = 0; 

    /**
     * The amount of time to wait in milliseconds before restarting a job
     */
    private long restartDelayMs = 0;

    /**
     * Map of all the jobs against how many times they have been attempted to restart
     */ 
    private HashMap<Long,Integer> jobInstanceRestartCount = new HashMap<Long,Integer>();

    @Autowired(required=false)
    @Qualifier("aynchJobLauncher")
    JobLauncher aynchJobLauncher;

    @Autowired(required=false)
    @Qualifier("jobRegistry")
    JobLocator jobLocator;

    /*
     * (non-Javadoc)
     * @see org.springframework.batch.core.JobExecutionListener#afterJob(org.springframework.batch.core.JobExecution)
     */
    @Override
    public void afterJob(JobExecution jobExecution) {

        super.afterJob(jobExecution);

        // Check if we can restart if the job has failed
        if( jobExecution.getExitStatus().equals(ExitStatus.FAILED) )
        {
            applyRetryPolicy(jobExecution);
        }
    }

    /**
     * Executes the restart policy if one has been specified
     */
    private void applyRetryPolicy(JobExecution jobExecution)
    {
        String jobName = jobExecution.getJobInstance().getJobName();
        Long instanceId = jobExecution.getJobInstance().getInstanceId();

        if( exceptionsToRestartOn.size() > 0 && maxRestartAttempts > 0 )
        {
            // Check if the job has failed for a restartable exception
            List<Throwable> failedOnExceptions = jobExecution.getAllFailureExceptions();
            for( Throwable reason : failedOnExceptions )
            {
                if( exceptionsToRestartOn.contains(reason.getClass()) || 
                    exceptionsToRestartOn.contains(reason.getCause().getClass()) )
                {
                    // Get our restart count for this job instance
                    Integer restartCount = jobInstanceRestartCount.get(instanceId);
                    if( restartCount == null )
                    {
                        restartCount = 0;
                    }

                    // Only restart if we haven't reached our limit
                    if( ++restartCount < maxRestartAttempts )
                    {
                        try
                        {
                            reLaunchJob(jobExecution, reason, restartCount);
                            jobInstanceRestartCount.put(instanceId, restartCount);
                        }
                        catch (Exception e)
                        {
                            String message = "The following error occurred while attempting to re-run job " + jobName + ":" + e.getMessage(); 
                            logger.error(message,e);
                            throw new RuntimeException( message,e);                         
                        }
                    }
                    else
                    {
                        logger.error("Failed to successfully execute jobInstanceId {} of job {} after reaching the maximum restart limit of {}. Abandoning job",instanceId,jobName,maxRestartAttempts );                        
                        try
                        {
                            jobExecution.setStatus(BatchStatus.ABANDONED);
                        }
                        catch (Exception e)
                        {
                            throw new RuntimeException( "The following error occurred while attempting to abandon job " + jobName + ":" + e.getMessage(),e);                            
                        }
                    }
                    break;
                }
            }
        }
    }

    /**
     * Re-launches the configured job with the current job execution details
     * @param jobExecution
     * @param reason
     * @throws JobParametersInvalidException 
     * @throws JobInstanceAlreadyCompleteException 
     * @throws JobRestartException 
     * @throws JobExecutionAlreadyRunningException 
     */
    private void reLaunchJob( JobExecution jobExecution, Throwable reason, int restartCount ) throws JobExecutionAlreadyRunningException, JobRestartException, JobInstanceAlreadyCompleteException, JobParametersInvalidException
    {
        try
        {
            Job jobRestarter = jobLocator.getJob(JOB_RESTARTER_NAME);
            JobParameters jobParameters =new JobParametersBuilder().
                                        addLong("delay",(long)restartDelayMs).
                                        addLong("jobExecutionId", jobExecution.getId()).
                                        addString("jobName", jobExecution.getJobInstance().getJobName())
                                        .toJobParameters();

            logger.info("Re-launching job with name {} due to exception {}. Attempt {} of {}", jobExecution.getJobInstance().getJobName(), reason, restartCount, maxRestartAttempts);

            aynchJobLauncher.run(jobRestarter, jobParameters);
        }
        catch (NoSuchJobException e)
        {
            throw new RuntimeException("Failed to find the job restarter with name=" + JOB_RESTARTER_NAME + " in container context",e);
        }
    }
}

然后在模块定义中,我将这个作业监听器添加到作业中:

<!-- Batch job made of one chunk-oriented step, with a job listener attached -->
<batch:job id="job">
    <batch:listeners>
        <!-- References the bean with id "jobExecutionListener" declared in this module definition -->
        <batch:listener ref="jobExecutionListener" />
    </batch:listeners>
    <batch:step id="doReadWriteStuff" >
        <batch:tasklet>
            <!-- Reads with bean "itemReader" and writes with bean "itemWriter"; commit interval is 3 -->
            <batch:chunk reader="itemReader" writer="itemWriter"
                commit-interval="3">
            </batch:chunk>
        </batch:tasklet>
    </batch:step>
</batch:job>

<!-- Specific job execution listener that attempts to restart failed jobs -->
<bean id="jobExecutionListener"
    class="com.mycorp.RestartableBatchJobExecutionListener">
    <!-- Restart limit checked by the listener before it launches the restarter job -->
    <property name="maxRestartAttempts" value="3"></property>
    <!-- Milliseconds; passed on to the restarter job as its "delay" job parameter -->
    <property name="restartDelayMs" value="60000"></property>
    <!-- Fully qualified exception class names; the listener compares them with the class of each failure exception and of its direct cause -->
    <property name="exceptionsToRestartOn">
        <list>
            <value>com.mycorp.ExceptionIWantToRestartOn</value>
        </list>
    </property>
</bean>

<!-- 
Specific job launcher that restarts jobs in a separate thread. This is important as the delayedRestartJob
fails on the HTTP call otherwise!
-->
<!--
NOTE(review): only maxPoolSize is set on the executor. With the default core pool size and
queue capacity of ThreadPoolTaskExecutor the pool may never grow beyond one thread; confirm
against the Spring documentation and set corePoolSize or queueCapacity if more threads are wanted.
-->
<bean id="executor" class="org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor">
    <property name="maxPoolSize" value="10"></property>
</bean>
<!-- The bean id (including its spelling) must match the @Qualifier("aynchJobLauncher") used by the listener -->
<bean id="aynchJobLauncher"
    class="com.mycorp.AsyncJobLauncher">
    <property name="jobRepository" ref="jobRepository" />
    <property name="taskExecutor" ref="executor" />     
</bean>

AsyncJobLauncher:

/**
 * Job launcher whose {@code run} method carries the {@code @Async} annotation
 * and otherwise defers entirely to {@code SimpleJobLauncher}.
 */
public class AsyncJobLauncher extends SimpleJobLauncher
{
    /**
     * Launches the given job by delegating to the superclass implementation.
     *
     * @param job the job to launch
     * @param jobParameters the parameters for this launch
     * @return whatever the superclass launch returns
     */
    @Async
    @Override
    public JobExecution run(final Job job, final JobParameters jobParameters)
            throws JobParametersInvalidException, JobInstanceAlreadyCompleteException, JobRestartException,
            JobExecutionAlreadyRunningException
    {
        final JobExecution launched = super.run(job, jobParameters);
        return launched;
    }
}

然后我有一个单独的处理器模块,仅用于在延迟后重新启动作业(这允许我们从spring XD ui或db进行审计):

delayedJobRestart.xml:

<!-- Two-step job: step "sleep" runs first and then hands over to step "restartJob" -->
<batch:job id="delayedRestartJob">
    <!-- Waits using the "sleepTasklet" bean -->
    <batch:step id="sleep" next="restartJob">
        <batch:tasklet ref="sleepTasklet" />
    </batch:step>
    <!-- Issues the restart request using the "jobRestarter" bean -->
    <batch:step id="restartJob">
        <batch:tasklet ref="jobRestarter" />
    </batch:step>
</batch:job>

<!-- Step-scoped so the expression can read job parameters: uses the "delay" job parameter when present, otherwise the "delay" module option -->
<bean id="sleepTasklet" class="com.mycorp.SleepTasklet" scope="step">
    <property name="delayMs" value="#{jobParameters['delay'] != null ? jobParameters['delay'] : '${delay}'}" />
</bean>

<!--
Step-scoped tasklet that sends an HTTP PUT to the XD admin "restart" URL of a job execution.
The execution id comes from the "jobExecutionId" job parameter when present, otherwise from the
"jobExecutionId" module option. init() builds the HTTP client after the properties are set.
-->
<bean id="jobRestarter" class="com.mycorp.HttpRequestTasklet" init-method="init" scope="step">
    <property name="uri" value="http://${xd.admin.ui.host}:${xd.admin.ui.port}/jobs/executions/#{jobParameters['jobExecutionId'] != null ? jobParameters['jobExecutionId'] : '${jobExecutionId}'}?restart=true" />
    <property name="method" value="PUT" />
</bean>

delayedJobProperties:

# Module options for the delayed restart job. "jobExecutionId" and "delay" are the
# fallback values referenced as placeholders in delayedJobRestart.xml when the
# matching job parameter is absent.

# Job execution ID
options.jobExecutionId.type=Long
options.jobExecutionId.description=The job execution ID of the job to be restarted

# Job execution name
options.jobName.type=String
options.jobName.description=The name of the job to be restarted. This is more for monitoring purposes 

# Delay
options.delay.type=Long
options.delay.description=The delay in milliseconds this job will wait until triggering the restart
options.delay.default=10000

以及配套的辅助bean:

SleepTasklet:

/**
 * Tasklet that puts the executing thread to sleep for a configured number of
 * milliseconds and then reports the step as finished.
 */
public class SleepTasklet implements Tasklet
{
    private static Logger logger = LoggerFactory.getLogger(SleepTasklet.class);

    /** Length of the pause in milliseconds; zero until a value is set. */
    private long delayMs;

    /**
     * @param delayMs length of the pause in milliseconds
     */
    public void setDelayMs(long delayMs)
    {
        this.delayMs = delayMs;
    }

    /**
     * @return length of the pause in milliseconds
     */
    public long getDelayMs()
    {
        return this.delayMs;
    }

    /**
     * Sleeps for the configured delay.
     *
     * @param contribution not used
     * @param chunkContext not used
     * @return {@code RepeatStatus.FINISHED} once the sleep has completed
     * @throws Exception if the sleep is interrupted or the delay is rejected by {@code Thread.sleep}
     */
    @Override
    public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception
    {
        final long pauseMs = this.delayMs;

        logger.debug("Pausing current job for {}ms", pauseMs);
        Thread.sleep(pauseMs);

        return RepeatStatus.FINISHED;
    }
}

HttpRequestTasklet:

public class HttpRequestTasklet implements Tasklet
{
    private HttpClient httpClient = null;

    private static final Logger LOGGER = LoggerFactory.getLogger(HttpRequestTasklet.class);

    private String uri;

    private String method;

    /**
     * Initialise HTTP connection.
     * @throws Exception
     */
    public void init() throws Exception 
    {
        // Create client
        RequestConfig config = RequestConfig.custom()
                .setCircularRedirectsAllowed(true)
                .setRedirectsEnabled(true)
                .setExpectContinueEnabled(true)
                .setRelativeRedirectsAllowed(true)
                .build();

        httpClient = HttpClientBuilder.create()
                .setRedirectStrategy(new LaxRedirectStrategy())
                .setDefaultRequestConfig(config)
                .setMaxConnTotal(1)
                .build();
    }

    @Override
    public RepeatStatus execute(StepContribution contribution, ChunkContext chunkContext) throws Exception
    {
        if (LOGGER.isDebugEnabled()) LOGGER.debug("Attempt HTTP {} from '" + uri + "'...",method);

        HttpUriRequest request = null;
        switch( method.toUpperCase() )
        {
            case "GET":
                request = new HttpGet(uri);
                break;
            case "POST":
                request = new HttpPost(uri);
                break;
            case "PUT":
                request = new HttpPut(uri);
                break;
            default:
                throw new RuntimeException("Http request method " + method + " not supported");
        }

        HttpResponse response = httpClient.execute(request);

        // Check response status and, if valid wrap with InputStreamReader

        StatusLine status = response.getStatusLine();

        if (status.getStatusCode() != HttpStatus.SC_OK) 
        {
            throw new Exception("Failed to get data from '" + uri + "': " + status.getReasonPhrase());
        } 

        if (LOGGER.isDebugEnabled()) LOGGER.debug("Successfully issued request");

        return RepeatStatus.FINISHED;
    }

    public String getUri()
    {
        return uri;
    }

    public void setUri(String uri)
    {
        this.uri = uri;
    }

    public String getMethod()
    {
        return method;
    }

    public void setMethod(String method)
    {
        this.method = method;
    }

    public HttpClient getHttpClient()
    {
        return httpClient;
    }

    public void setHttpClient(HttpClient httpClient)
    {
        this.httpClient = httpClient;
    }
}

最后,当所有内容构建并部署完成后,创建这两个作业(注意,重启器作业的名称必须定义为"jobRestarter"):

job create --name myJob --definition "MyJobModule " --deploy true
job create --name jobRestarter --definition "delayedRestartJob" --deploy true

有点费解,但似乎有效。