I am apparently facing a shuffle-read problem.
My PySpark script runs on a Hadoop cluster (1 edge node and 12 data nodes), with YARN as the resource manager and Spark 1.6.2.
### ini file containing the Spark conf
spark.app.name = MY_PYSPARK_APP
spark.master = yarn-client
spark.yarn.queue = agr_queue
spark.executor.instances = 24
spark.executor.memory = 14g
spark.executor.cores = 3
#spark.storage.memoryFraction = 0.5
#spark.sql.shuffle.partitions = 2001
#spark.sql.shuffle.partitions = 1000
spark.sql.shuffle.partitions = 100
spark.shuffle.memoryFraction=0.5
spark.memory.offHeap.enabled = True
spark.serializer = org.apache.spark.serializer.KryoSerializer
#spark.driver.memory = 14g
spark.driver.maxResultSize = 20g
spark.python.worker.memory = 14g
spark.akka.heartbeat.interval = 100
spark.yarn.executor.memoryOverhead=2000
spark.yarn.driver.memoryOverhead=2000
spark.scheduler.mode = FIFO
spark.sql.tungsten.enabled = True
spark.default.parallelism = 200
spark.speculation = True
spark.speculation.interval = 1000ms
spark.speculation.multiplier = 2.0
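For reference, since SparkConf() below is created without arguments, these properties still have to reach Spark somehow; a minimal sketch of the two usual options, assuming a hypothetical file name spark_app.conf:

# Option 1: hand the file to spark-submit so SparkConf() picks the values up
#   spark-submit --properties-file spark_app.conf my_script.py
# Option 2: parse the ini-style file and apply it to the SparkConf explicitly
def load_props(path):
    props = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#') and '=' in line:
                key, value = line.split('=', 1)
                props.append((key.strip(), value.strip()))
    return props

# sconf = SparkConf().setAll(load_props("spark_app.conf"))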
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

sconf = SparkConf()
sc = SparkContext(conf=sconf)
hctx = HiveContext(sc)
dataframe1 = hctx.sql("SELECT * FROM DB1.TABLE1")
dataframe2 = hctx.sql("SELECT * FROM DB2.TABLE2")
df = dataframe1.join(dataframe2, conditions)
# No major problem at this count() step
# it returns 550,000,000 rows
df.count()
# 288 elements in List_dtm_t
List_dtm_t = ['00:00:00', '00:05:00', ... '23:45:00', '23:50:00', '23:55:00']
dat_tm_bdcst = sc.broadcast(List_dtm_t)
global dat_tm_bdcst
def mapper(row):
    # For each 5-minute slot in the broadcast list that overlaps the interval
    # [row[0], row[1]], emit one ((key columns), (measures)) pair.
    import datetime

    def ts_minus_5(tmstmp):
        return tmstmp - datetime.timedelta(minutes=5)

    lst_tuple = ()
    poids = row[9]
    for dtm in dat_tm_bdcst.value:
        t_minus = ts_minus_5(dtm)
        if (row[0] <= dtm) & (row[1] > t_minus):
            v1 = str(dtm)
            v2 = str(t_minus)
            v3 = row[2]
            v4 = row[3]
            v5 = row[4]
            v6 = row[5]
            v7 = row[6]
            v8 = row[7]
            v9 = row[8]
            v10 = row[10]
            # weight scaled by the overlap (in seconds) between the slot and the interval
            v11 = poids * (min(dtm, row[1]) - max(t_minus, row[0])).total_seconds()
            v12 = poids
            # weight counted only if the slot end falls inside the interval
            v13 = poids if row[0] <= dtm <= row[1] else 0
            lst_tuple += (((v1, v2, v3, v4, v5, v6, v7, v8, v9, v10), (v11, v12, v13)),)
    return lst_tuple
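To make the emitted structure concrete, here is a small local sanity check of mapper. It assumes that both the broadcast list and row[0]/row[1] hold datetime objects (the timedelta arithmetic above would not work on the raw '00:00:00' strings); FakeBroadcast and sample_row are hypothetical stand-ins, not the real data, and rebinding dat_tm_bdcst this way should only be done in a local test.

import datetime
from collections import namedtuple

# hypothetical stand-in for sc.broadcast(...) so mapper can run without a cluster
FakeBroadcast = namedtuple("FakeBroadcast", "value")
dat_tm_bdcst = FakeBroadcast(value=[datetime.datetime(2016, 1, 1, 0, 0)
                                    + datetime.timedelta(minutes=5 * i) for i in range(4)])

sample_row = (datetime.datetime(2016, 1, 1, 0, 2, 30),   # row[0]: interval start
              datetime.datetime(2016, 1, 1, 0, 12, 0),   # row[1]: interval end
              'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8',  # row[2..8]: key columns
              2.0,                                        # row[9]: poids (weight)
              'c10')                                      # row[10]: last key column

for key, measures in mapper(sample_row):
    print(key[:2], measures)
# one pair per 5-minute slot overlapping [row[0], row[1]]:
# measures = (poids * overlap_seconds, poids, poids if the slot end lies inside the interval else 0)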
global list_to_row
def list_to_row(keys, values):
    # values is the (key_tuple, measures_tuple) pair coming out of reduceByKey
    from pyspark.sql import Row
    row_dict = dict(zip(keys, values[0] + values[1]))
    return Row(**row_dict)
# element-wise sum of the three measures (v11, v12, v13)
f_reduce = lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2])
# This flatMap takes an extremely long time
# It usually ends in failure (KO) because tasks are retried more than 3 times,
# or some shuffle files get lost
mapped_df = df.limit(10000000)\
.flatMap(mapper)
reduced_rdd = mapped_df.reduceByKey(f_reduce)
reduced_rdd.count()
# header: list of the 13 output column names (10 key columns + 3 measures)
list_of_rows = reduced_rdd.map(lambda x: list_to_row(header, x))
df_to_exp = hctx.createDataFrame(list_of_rows)
## register as tempTable df_to_exp then write it into Hive
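A minimal sketch of that last step with the Spark 1.6 HiveContext API, assuming a hypothetical target table DB_OUT.TABLE_OUT:

df_to_exp.registerTempTable("df_to_exp_tmp")
hctx.sql("INSERT OVERWRITE TABLE DB_OUT.TABLE_OUT SELECT * FROM df_to_exp_tmp")
# or, equivalently, through the DataFrameWriter:
# df_to_exp.write.mode("overwrite").saveAsTable("DB_OUT.TABLE_OUT")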
I have tried several different approaches, and I am looking for a solution that reaches the final goal and speeds up the whole process.
Two screenshots from the Spark UI:
We can see the reduceByKey stage (I am not sure whether it represents only the reduce tasks; is there really just 1 task?!), and the shuffle read / records counter grows far too slowly (about 300,000 out of 100 million after 13 minutes).
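For reference, my understanding is that the reduce-side task count comes either from spark.default.parallelism (200 above) or from an explicit numPartitions argument to reduceByKey; the line below is only a sketch of that variant, not what the job currently runs.

# explicit partition count instead of relying on spark.default.parallelism
reduced_rdd = mapped_df.reduceByKey(f_reduce, numPartitions=200)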
Hoping someone can help, thanks!