This is my Spark application that uses Spark Streaming. The problem is that I am not getting any data: after I start the application it keeps running, and a new batch fires roughly every 5 minutes, but nothing ever comes through. I have also attached a screenshot of my Spark UI, which shows no failures, only empty squares.
When I split the different time windows (1 hour, 3 hours, 6 hours, etc.) into separate Spark applications, everything works fine. But as soon as I try to put it all into one Spark application, it stalls every few minutes and never gives me any error message.
# -*- coding: utf-8 -*-
import sys
import json
from pyspark import SparkContext, SparkConf, rddsampler
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import OffsetRange, KafkaUtils, TopicAndPartition
from datetime import datetime, timedelta
from dateutil.parser import parse
from cassandra.cluster import Cluster
from cassandra import ConsistencyLevel
import pytz
tz = pytz.timezone('')
appname = str(sys.argv[1])
source = str(sys.argv[2])
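# Cassandra setup: one session for channel lookups (QUORUM reads) and a
# second session on the same keyspace for writing the statistics rows.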
cluster = Cluster(['localhost'])
session = cluster.connect('keyspace')
channel_lookup_stmt = session.prepare("SELECT * FROM channels WHERE id = ?")
channel_lookup_stmt.consistency_level = ConsistencyLevel.QUORUM
session_statis = cluster.connect('keyspace')
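# Parse a raw Kafka message value; return 0 on invalid JSON so bad records
# can be filtered out downstream.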
def read_json(x):
    try:
        y = json.loads(x)
    except:
        y = 0
    return y
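# Extract the list of articles from a message body; 0 marks an unparsable record.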
def TransformInData(x):
    try:
        body = json.loads(x['body'])
        return (body['articles'])
    except:
        return 0
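# Extract the axes payload (the attitudes/comments/reposts counters used below);
# 0 marks an unparsable record.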
def axesTransformData(x):
    try:
        body = json.loads(x['body'])
        return (body)
    except:
        return 0
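# Debug helper: collects the parsed axes records on the driver and prints them
# (defined but not used in the streaming graph below).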
def axesFlatMap(rdd):
    axess = rdd.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x))
    for axes in axess.collect():
        print 'axes value is =========:',axes
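# Keeps only articles created within the last 10 minutes, keyed by id
# (also defined but not used below).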
def windowOperation(rdd):
    articles = rdd.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x))
    return articles.filter(lambda x:parse(x['created_at'])>datetime.now(tz)-timedelta(minutes=10)).map(lambda x:(x['id'],x))
    '''
    for article in article_window:
        if parse(article[1]['created_at'])>datetime.now(tz)-timedelta(minutes=10):
            print 'article============:',article[0],parse(article[1]['created_at']),datetime.now(tz)-timedelta(minutes=10)
    '''
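# For every (article_id, (articles, axes)) pair of the cogrouped window, look up
# the channel in Cassandra and, after clearing the previous rows for this source,
# insert one statistics row per time bucket (1, 3, 6, 12 and 24 hours) whenever
# the comment count is positive.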
def joinstream(rdd):
    # print 'joinstream value is ================:',
    session_statis.execute('DELETE FROM statistics WHERE source = %s', [source])
    timeone = datetime.now(tz)-timedelta(hours=1)
    timethree = datetime.now(tz)-timedelta(hours=3)
    timesix = datetime.now(tz)-timedelta(hours=6)
    timetwelve = datetime.now(tz)-timedelta(hours=12)
    timetwentyfour = datetime.now(tz)-timedelta(hours=24)
    # timethreeday = datetime.now(tz)-timedelta(hours=72)
    # timeweek = datetime.now(tz)-timedelta(hours=168)
    print 'timeone:%s timethree:%s timesix:%s timetwelve:%s timetwentyfour:%s' % (timeone,timethree,timesix,timetwelve,timetwentyfour)
    for dr in rdd.distinct().collect():
        # print 'dr values =================',dr
        for d in dr[1][0].__iter__():
            axes_list = []
            channel = session.execute(channel_lookup_stmt,[d['channel']])
            category = ''
            name = ''
            if channel:
                category = channel[0].category
                name = channel[0].name
            for daxes in dr[1][1].__iter__():
                axes_list.append(daxes)
            if len(axes_list)>0:
                attitudes = max(axes_list,key=lambda x:x['attitudes'])['attitudes']
                comments = max(axes_list,key=lambda x:x['comments'])['comments']
                reposts = max(axes_list,key=lambda x:x['reposts'])['reposts']
                if comments>0 and parse(d['created_at'])>=timeone:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '1', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                if comments>0 and parse(d['created_at'])>=timethree:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '3', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                if comments>0 and parse(d['created_at'])>=timesix:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '6', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                if comments>0 and parse(d['created_at'])>=timetwelve:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '12', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
                if comments>0 and parse(d['created_at'])>=timetwentyfour:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '24', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, attitudes, comments, reposts, comments))
            else:
                if d['axes']['comments']>0 and parse(d['created_at'])>=timeone:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '1', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                if d['axes']['comments']>0 and parse(d['created_at'])>=timethree:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '3', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                if d['axes']['comments']>0 and parse(d['created_at'])>=timesix:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '6', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                if d['axes']['comments']>0 and parse(d['created_at'])>=timetwelve:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '12', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
                if d['axes']['comments']>0 and parse(d['created_at'])>=timetwentyfour:
                    session_statis.execute('INSERT INTO statistics(col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11, col12, col13, col14, col15) values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)', (source, '24', d['id'], d['title'], d['thumbnail'], d['url'], parse(d['created_at']), category, d['genre'], name, 0, d['axes']['likes'], d['axes']['comments'], d['axes']['shares'], d['axes']['comments']))
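# Driver setup: 300-second batches and direct Kafka streams for the article and
# axes topics.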
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc,300)
topic = 'topic1'
kafkaParams = {"metadata.broker.list": "localhost:9092"}
article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
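# Articles keyed by id (utf-8 encoded) so they can be cogrouped with the axes stream.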
article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
# Axes topic: engagement data (axes) that gets joined with the articles
axes_topic = 'topic2'
axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
axes_join_stream = axes_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(str(x['id']),x))
#join
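# 24-hour windows sliding every 5 minutes on both streams, cogrouped by article id;
# joinstream runs on each windowed RDD and pprint() is the output operation that
# triggers it.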
statistics = article_join_stream.window(24*60*60,5*60).cogroup(axes_join_stream.window(24*60*60,5*60))
statistics.transform(joinstream).pprint()
ssc.start()  # Start the computation
ssc.awaitTermination()
The red squares should contain successful data just like the other ones, but they neither show a failure nor give me any data.
Thanks for your help!