通过spark-submit进行容错/错误处理

时间:2016-03-21 10:19:32

标签: apache-spark apache-kafka pyspark fault-tolerance

我有一个 Spark 作业,我正在使用以下命令运行:

<!DOCTYPE html>

<body>

<style>
/* Default fill colour for every <path> element on the page. */
path { fill: #CCC; }
/* Elements with class "line": no fill, 5px black stroke. The class selector
   is more specific than the bare "path" rule above, so fill: none wins. */
.line { fill: none; stroke: #000; stroke-width: 5px;}
</style>

<!-- Wrapper element; the script below reads its clientWidth to size the chart. -->
<div id="chart-container">
    <!-- SVG element the script selects by id ("chart") and draws into. -->
    <svg id="chart"></svg>
</div>

<!-- Third-party libraries used by the inline script: Moment.js 2.12.0 and D3 v3.
     Both URLs are protocol-relative (they start with "//"). -->
<script src="//cdnjs.cloudflare.com/ajax/libs/moment.js/2.12.0/moment.min.js"></script>
<script src="//d3js.org/d3.v3.min.js"></script>

<script>

    // Draws a line chart of `chartData` (value over time) into the <svg id="chart">
    // element, using D3 v3 for scales/axes/line and Moment.js to build sample dates.

    // Day-month-year layout shared by both libraries, e.g. "21-Mar-16".
    // Moment.js and D3 v3 use different pattern syntaxes for the same layout.
    const MOMENT_DATE_FORMAT = 'DD-MMM-YY';
    const d3DateFormat = d3.time.format('%d-%b-%y');

    const now = moment();

    // Returns the date `days` days before `now` as a DD-MMM-YY string.
    // moment(now) clones first, because subtract() mutates the moment it is called on.
    const daysAgo = (days) => moment(now).subtract(days, 'days').format(MOMENT_DATE_FORMAT);

    // Sample data, ordered oldest to newest; the x domain below relies on that order.
    const chartData = [
        { timestamp: daysAgo(27), value: 40 },
        { timestamp: daysAgo(25), value: 36 },
        { timestamp: daysAgo(24), value: 33 },
        { timestamp: daysAgo(21), value: 35 },
        { timestamp: daysAgo(20), value: 35 },
        { timestamp: daysAgo(18), value: 33 },
        { timestamp: daysAgo(17), value: 33 },
        { timestamp: daysAgo(16), value: 33 },
        { timestamp: daysAgo(15), value: 32 },
        { timestamp: daysAgo(13), value: 35 },
        { timestamp: daysAgo(11), value: 31 },
        { timestamp: daysAgo(10), value: 28 },
        { timestamp: daysAgo(9), value: 32 },
        { timestamp: daysAgo(8), value: 30 },
        { timestamp: daysAgo(7), value: 33 },
        { timestamp: daysAgo(6), value: 36 }
    ];

    // Data could have a shorter date range of e.g. 1 or 2 weeks.
    // Ideally we still want to display 'week 1, 2, 3, 4' etc. in the axis;
    // alternatively display dates instead. Example of such a shorter dataset:
    // const chartData = [
    //     { timestamp: daysAgo(27), value: 40 },
    //     { timestamp: daysAgo(25), value: 36 },
    //     { timestamp: daysAgo(24), value: 33 },
    //     { timestamp: daysAgo(21), value: 35 },
    //     { timestamp: daysAgo(20), value: 35 }
    // ];

    const lastObj = chartData[chartData.length - 1];
    const lastObjTimestamp = lastObj.timestamp;
    // Parse with the explicit format: moment(string) without a format only
    // recognises ISO 8601 and otherwise falls back to the browser-dependent
    // Date constructor. The result is zero or negative (last entry minus now).
    const lastAndNow = moment(lastObjTimestamp, MOMENT_DATE_FORMAT).diff(now, 'days');
    console.log(`difference between last entry ${lastObjTimestamp} and today: ${lastAndNow}`);

    // Debug output; guarded so datasets with fewer than six entries do not throw.
    if (chartData.length > 5) {
        console.log('no5 ', chartData[5].timestamp);
    }

    // Chart geometry: the drawing area is the wrapper width minus one margin,
    // and 500px minus a top and bottom margin high.
    const chartWrapperDomId = 'chart-container';
    const chartDomId = 'chart';
    const chartWrapperWidth = document.getElementById(chartWrapperDomId).clientWidth;
    const margin = 40;
    const width = chartWrapperWidth - margin;
    const height = 500 - margin * 2;

    // Convert the timestamp strings to Date objects (in place) and coerce the
    // values to numbers, so the scales and the line generator get real dates/numbers.
    chartData.forEach((d) => {
        d.timestamp = d3DateFormat.parse(d.timestamp);
        d.value = Number(d.value);
    });

    // x axis: time scale spanning the first to the last data point.
    const xMin = chartData[0].timestamp;
    const xMax = chartData[chartData.length - 1].timestamp;
    const xScale = d3.time.scale()
        .domain([xMin, xMax])
        .range([0, width]);

    // y axis: linear scale over the data's value extent. nice() must run AFTER
    // the domain is set; it rounds the current domain, so calling it earlier
    // would only round the default domain that is replaced here.
    const yScale = d3.scale.linear()
        .range([height, 0])
        .domain(d3.extent(chartData, (d) => d.value))
        .nice();

    const xAxis = d3.svg.axis()
        .scale(xScale)
        .orient('bottom')
        .tickFormat(d3.time.format('%d-%b'));
        // Alternatives tried for the tick labels:
        //.tickFormat(d3.time.format('%b'))
        //.tickFormat(d3.time.format('%W'));
        //.ticks(5);

    const yAxis = d3.svg.axis()
        .scale(yScale)
        .orient('left');

    // Maps each data point to pixel coordinates through the two scales.
    const line = d3.svg.line()
        .x((d) => xScale(d.timestamp))
        .y((d) => yScale(d.value));

    // Size the <svg> to the drawing area plus margins and draw inside a <g>
    // that is shifted by the margin on both axes.
    const svg = d3.select(`#${chartDomId}`)
        .attr('width', width + margin * 2)
        .attr('height', height + margin * 2)
      .append('g')
        .attr('transform', `translate(${margin},${margin})`);

    // x axis sits at the bottom of the drawing area.
    svg.append('g')
        .attr('class', 'axis x-axis')
        .attr('transform', `translate(0,${height})`)
        .call(xAxis);

    // y axis, plus a rotated <text> element positioned for an axis label.
    // No label text is set on it.
    svg.append('g')
        .attr('class', 'axis y-axis')
        .call(yAxis)
      .append('text')
        .attr('transform', 'rotate(-90)')
        .attr('y', 6)
        .attr('dy', '.71em')
        .style('text-anchor', 'end');

    // The data line itself; styled by the ".line" CSS rule.
    svg.append('path')
        .datum(chartData)
        .attr('class', 'line')
        .attr('d', line);

</script>

</body>

然而,这个作业有时会因为某种原因失败,比如每次连续运行大约 2 天之后就会失败,此后我必须手动重新启动它。

这对我的目的来说非常低效,因为我不断从kafka读取数据并将其保存到cassandra。

Spark 针对这种容错提供了哪些特性?也许可以让 spark-submit 自动重新启动作业?或者有更聪明的办法?我试着用谷歌搜索过,但这方面的信息非常少。

P.S. - 我正在使用 sudo ./bin/spark-submit --jars lib/spark-streaming-kafka-assembly_2.10-1.4.1.jar \ --packages TargetHolding:pyspark-cassandra:0.2.4 \ examples/src/main/python/final/kafka-sparkstreaming-cassandra.py

我希望收到一些好主意!

谢谢!

0 个答案:

没有答案