My Spark application doesn't always return data. Why?

Time: 2016-02-24 10:03:09

Tags: apache-spark spark-streaming datastax datastax-enterprise

This is my Spark application, which uses Spark Streaming. The problem is that I don't get any data out of it. After I start the application it keeps running (a batch fires roughly every 5 minutes), but I never receive any data. I've also attached a screenshot from my Spark UI, which shows no failures, only empty squares.

When I split the different time buckets (1 hour, 3 hours, 6 hours, etc.) into separate Spark applications, everything works fine; each of those jobs looks roughly like the skeleton below. But as soon as I try to put everything into a single Spark application, it stops every few minutes and doesn't give me any error message either.
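
For reference, this is roughly what one of the standalone per-window jobs looks like. The names, the exact window size, and the two helper functions are illustrative stand-ins for the parsing and Cassandra writes shown in the full code further down, not my real code:

    # Rough skeleton of one of the standalone per-window jobs that does work
    # (the 1-hour variant). key_by_article_id and write_one_hour_bucket are
    # placeholders for the parsing and Cassandra writes in the full code below.
    from pyspark import SparkContext, SparkConf
    from pyspark.streaming import StreamingContext
    from pyspark.streaming.kafka import KafkaUtils

    def key_by_article_id(msg):
        # placeholder for the read_json / TransformInData / key-by-id steps below
        return (msg[0], msg[1])

    def write_one_hour_bucket(rdd):
        # placeholder for the Cassandra inserts done by joinstream, 1-hour bucket only
        print rdd.take(5)

    conf = SparkConf().setAppName('statistics-1h')
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc, 300)   # same 5-minute batch interval as the full job

    kafkaParams = {"metadata.broker.list": "localhost:9092"}
    articles = KafkaUtils.createDirectStream(ssc, ['topic1'], kafkaParams).map(key_by_article_id)
    axes = KafkaUtils.createDirectStream(ssc, ['topic2'], kafkaParams).map(key_by_article_id)

    # a single window per application: the last hour of data, sliding every 5 minutes
    statistics = articles.window(60*60, 5*60).cogroup(axes.window(60*60, 5*60))
    statistics.foreachRDD(write_one_hour_bucket)

    ssc.start()
    ssc.awaitTermination()

And here is the combined application that stops returning data: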

    # -*- coding: utf-8 -*-
    import sys
    import json
    from pyspark import SparkContext, SparkConf, rddsampler
    from pyspark.streaming import StreamingContext
    from pyspark.streaming.kafka import OffsetRange, KafkaUtils, TopicAndPartition
    from datetime import datetime, timedelta 
    from dateutil.parser import parse 

    from cassandra.cluster import Cluster
    from cassandra import ConsistencyLevel
    import pytz
    tz = pytz.timezone('')


    appname = str(sys.argv[1])
    source = str(sys.argv[2])





    # Cassandra session used for channel lookups
    cluster = Cluster(['localhost'])
    session = cluster.connect('keyspace')
    channel_lookup_stmt = session.prepare("SELECT * FROM channels WHERE id = ?")
    channel_lookup_stmt.consistency_level = ConsistencyLevel.QUORUM


    # separate session used for writing the per-bucket statistics rows
    session_statis = cluster.connect('keyspace')

    def read_json(x):
        # parse a raw Kafka message value as JSON; return 0 so bad records can be filtered out
        try:
            y = json.loads(x)
        except:
            y = 0
        return y

    def TransformInData(x):
        # pull the list of articles out of the message body; return 0 on any parse error
        try:
            body = json.loads(x['body'])
            return (body['articles'])
        except:
            return 0


    def axesTransformData(x):
        # pull the axes payload out of the message body; return 0 on any parse error
        try:
            body = json.loads(x['body'])
            return (body)
        except:
            return 0
    def axesFlatMap(rdd):
        axess = rdd.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x))    
        for axes in axess.collect():
            print 'axes value is =========:',axes


    def windowOperation(rdd):
        articles = rdd.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x))
        return articles.filter(lambda x:parse(x['created_at'])>datetime.now(tz)-timedelta(minutes=10)).map(lambda x:(x['id'],x))
        '''
        for article in article_window:
            if parse(article[1]['created_at'])>datetime.now(tz)-timedelta(minutes=10):
                print 'article============:',article[0],parse(article[1]['created_at']),datetime.now(tz)-timedelta(minutes=10)
        '''


    def joinstream(rdd):
        # for each windowed batch: clear the old rows for this source, then re-insert
        # one row per article and per time bucket (1h / 3h / 6h / 12h / 24h)
    #     print 'joinstream value is ================:',
        session_statis.execute('DELETE FROM statistics WHERE source = %s', [source])
        timeone=datetime.now(tz)-timedelta(hours=1)
        timethree = datetime.now(tz)-timedelta(hours=3)
        timesix = datetime.now(tz)-timedelta(hours=6)
        timetwelve = datetime.now(tz)-timedelta(hours=12)
        timetwentyfour = datetime.now(tz)-timedelta(hours=24)
    #     timethreeday = datetime.now(tz)-timedelta(hours=72)
    #     timeweek = datetime.now(tz)-timedelta(hours=168)
        print 'timeone:%s  timethree:%s timesix:%s timetwelve:%s timetwentyfour:%s' % (timeone,timethree,timesix,timetwelve,timetwentyfour)
        for dr in rdd.distinct().collect():
    #         print 'dr values =================',dr
            for d in dr[1][0].__iter__():
                axes_list = []
                channel = session.execute(channel_lookup_stmt,[d['channel']])
                category=''
                name = ''
                if channel:
                    category = channel[0].category
                    name = channel[0].name
                for daxes in dr[1][1].__iter__():
                    axes_list.append(daxes)
                if len(axes_list)>0:
                    attitudes = max(axes_list,key=lambda x:x['attitudes'])['attitudes']
                    comments = max(axes_list,key=lambda x:x['comments'])['comments']
                    reposts = max(axes_list,key=lambda x:x['reposts'])['reposts']
                    if comments>0 and parse(d['created_at'])>=timeone:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '1', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                    if comments>0 and parse(d['created_at'])>=timethree:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '3', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                    if comments>0 and parse(d['created_at'])>=timesix:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '6', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                    if comments>0 and parse(d['created_at'])>=timetwelve:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '12', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                    if comments>0 and parse(d['created_at'])>=timetwentyfour:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '24', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                else:
                    if d['axes']['comments']>0 and parse(d['created_at'])>=timeone:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '1', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                    if d['axes']['comments']>0 and parse(d['created_at'])>=timethree:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '3', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                    if d['axes']['comments']>0 and parse(d['created_at'])>=timesix:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '6', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                    if d['axes']['comments']>0 and parse(d['created_at'])>=timetwelve:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '12', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                    if d['axes']['comments']>0 and parse(d['created_at'])>=timetwentyfour:
                        session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '24', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))

    conf = SparkConf().setAppName(appname)
    sc = SparkContext(conf=conf)
    ssc = StreamingContext(sc,300)    # 300-second (5-minute) batch interval
    topic = 'topic1'
    kafkaParams = {"metadata.broker.list": "localhost:9092"}


    article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
    article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
    # axes topic: integrate the articles with their axes counters
    axes_topic = 'topic2'
    axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
    axes_join_stream = axes_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(str(x['id']),x))


    # join the two streams over a 24-hour window sliding every 5 minutes
    statistics = article_join_stream.window(24*60*60,5*60).cogroup(axes_join_stream.window(24*60*60,5*60))
    statistics.transform(joinstream).pprint()


    ssc.start()    # start the computation
    ssc.awaitTermination()
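
Both streams are windowed over the last 24 hours with a 5-minute slide, so my understanding is that every 5 minutes joinstream should receive the cogrouped records keyed by article id, with the article documents in dr[1][0] and the axes updates in dr[1][1]. To make the shape I expect explicit, here is a tiny standalone sketch on plain RDDs with made-up data (not my real messages):

    # Standalone sketch of the (article_id, (articles, axes)) shape I expect inside
    # joinstream -- made-up records, plain batch RDDs instead of the streams above.
    from pyspark import SparkContext

    sc = SparkContext(appName='cogroup_shape_demo')

    articles = sc.parallelize([('a1', {'id': 'a1', 'title': 'some title'})])
    axes = sc.parallelize([('a1', {'id': 'a1', 'comments': 3}),
                           ('a1', {'id': 'a1', 'comments': 5})])

    for article_id, (article_group, axes_group) in articles.cogroup(axes).collect():
        # each side is an iterable, which is why joinstream walks dr[1][0] and dr[1][1]
        print article_id, list(article_group), list(axes_group)

    sc.stop()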

The red squares should contain successful data just like the other ones, but they show neither a failure nor any data.

(Spark UI screenshot)

Thanks for your help!

0 Answers:

No answers yet