I wrote a small piece of code in PySpark to generate quantiles over a set of columns, and I am calling this function via concurrent.futures because I want the work to run on two sets of columns in parallel.
But instead of only the function submitted to the ThreadPoolExecutor running, the whole code gets executed three times.
I am calling the function generate_advisor_quartiles() from the main() method of another Python program.
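For reference, this is the submit/as_completed pattern I am trying to use (a minimal standalone sketch with placeholder work, no Spark involved):

from concurrent.futures import ThreadPoolExecutor, as_completed

def work(label, n):
    # placeholder for generate_Quantiles(df, attr_list, spark_context)
    return label, sum(range(n))

with ThreadPoolExecutor(max_workers=2) as executor:
    futures = [executor.submit(work, "first_half", 10),
               executor.submit(work, "second_half", 20)]
    for future in as_completed(futures):
        print("completed", future.result())

My actual code is below.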
from src.utils import sql_service, apa_constant as constant
from pyspark.sql.functions import monotonicallyIncreasingId
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext
from pyspark.sql.functions import lit
from pyspark.sql.types import *
from concurrent.futures import ThreadPoolExecutor, as_completed
from pyspark.ml.feature import Bucketizer
import numpy as np
import pandas as pd
import os
def generate_Quantiles(df, attr_list, spark_context):
    jdf = df._jdf
    quantileList = []
    sqlContext = SQLContext(spark_context)
    fields = [StructField('attribute', StringType(), True),
              StructField('col1', DoubleType(), True),
              StructField('col2', DoubleType(), True),
              StructField('col3', DoubleType(), True),
              StructField('col4', DoubleType(), True),
              StructField('col5', DoubleType(), True)]
    schema = StructType(fields)
    for var in attr_list:
        # Compute the five quantile boundaries (min, Q1, median, Q3, max) on the JVM side
        bindt = spark_context._jvm.com.dstsystems.apa.util.DFQuantileFunction.approxQuantile(
            jdf, [var.col_name], [0.0, 0.25, 0.5, 0.75, 1.0], 0.0)
        q0, q1, q2, q3, q4 = bindt[0][0], bindt[0][1], bindt[0][2], bindt[0][3], bindt[0][4]
        quantileList.append([q0, q1, q2, q3, q4])
        # Build the Bucketizer splits: deduplicated quantiles plus -inf/+inf/NaN sentinels
        bindt = sorted(set(bindt[0]))
        bindt = [-float("inf")] + bindt + [float("inf"), float("NaN")]
        bucketizer = Bucketizer().setInputCol(var.col_name).setOutputCol("{}_quantile".format(var.col_name)).setSplits(bindt)
        df = bucketizer.transform(df)
        # Invert the bucket index so the top quartile gets the highest score
        df = df.withColumn("{}_quantile".format(var.col_name),
                           lit(4.0) - df["{}_quantile".format(var.col_name)])
        df = df.drop(var.col_name)  # drop() returns a new DataFrame; the result must be reassigned
    quantileRDD = spark_context.parallelize(quantileList)
    quantileDF = sqlContext.createDataFrame(quantileRDD, schema)
    # Force evaluation before returning
    df.count()
    quantileDF.count()
    return df, quantileDF
def generate_advisor_quartiles(spark_context, hive_context, log, **kwargs):
    log.info("Started - Generate adviser quartile reports")
    sql = """describe dbName.tablename"""  # .format(kwargs['sem_db'])
    op = hive_context.sql(sql)
    # Attach an ordinal position so the describe output can be ordered
    res = op.withColumn("ordinal_position", monotonicallyIncreasingId())
    res.registerTempTable('attribs')
    id_lst = hive_context.sql(
        "select col_name from attribs where ordinal_position <= 24 order by ordinal_position").collect()
    sql = "select %s from %s.tablename " % (", ".join(str(v.col_name) for v in id_lst), kwargs['sem_db'])
    id_tbl = hive_context.sql(sql)
    attr_lst = hive_context.sql(
        """select col_name from attribs where ordinal_position > 24 AND col_name not like '%vehicle%'
           AND col_name not like '%cluster_num%'
           AND col_name not like '%value_seg%' order by ordinal_position limit 2""").collect()
    vhcl_lst = hive_context.sql(
        """select col_name from attribs where ordinal_position > 24
           AND ( col_name like '%vehicle%'
                 OR col_name IN ('cluster_num', 'value_seg')
               ) order by ordinal_position""").collect()
    sqltemp = "select %s from %s.Tablename" % (", ".join(['entity_id'] + [str(vhcl.col_name) for vhcl in vhcl_lst]), kwargs['sem_db'])
    id_tbl = hive_context.sql(sqltemp)
    # Split the attribute list in half so each half can be processed in parallel
    attr_lst1 = attr_lst[:len(attr_lst) // 2]
    attr_lst2 = attr_lst[len(attr_lst) // 2:]
    # sqltemp = "select cast(entity_id as decimal(38,20)) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as decimal(38,20))" for attr in attr_lst), kwargs['sem_db'])
    sqltemp1 = "select cast(entity_id as double) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as double)" for attr in attr_lst1), kwargs['sem_db'])
    sqltemp2 = "select cast(entity_id as double) , %s from %s.tablename where ud_rep = 1" % (", ".join("cast(" + str(attr.col_name) + " as double)" for attr in attr_lst2), kwargs['sem_db'])
    df1 = hive_context.sql(sqltemp1)
    df1 = df1.replace(0, np.nan)
    df2 = hive_context.sql(sqltemp2)
    df2 = df2.replace(0, np.nan)
    with ThreadPoolExecutor(max_workers=2) as executor2:
        # Each submit is meant to run generate_Quantiles on one half of the columns
        result1 = executor2.submit(generate_Quantiles, df1, attr_lst1, spark_context)
        result2 = executor2.submit(generate_Quantiles, df2, attr_lst2, spark_context)
        future_list = [result1, result2]
        for future in as_completed(future_list):
            print("completed")
        df1, df2 = result1.result()
        df3, df4 = result2.result()
        finalQuantiles = df1.join(df3, "entity_id", "inner")
        quantilValuesDF = df2.union(df4)
        finalQuantiles.show()
        quantilValuesDF.show()
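And this is roughly how the function is invoked from the other program's main() method (a minimal sketch; the module path, app name, and sem_db value are placeholders, not my real ones):

import logging
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext
# hypothetical module path; the real import points at the file above
from src.reports.quartiles import generate_advisor_quartiles

def main():
    conf = SparkConf().setAppName("advisor_quartiles")
    spark_context = SparkContext(conf=conf)
    hive_context = HiveContext(spark_context)
    log = logging.getLogger(__name__)
    generate_advisor_quartiles(spark_context, hive_context, log, sem_db="dbName")

if __name__ == "__main__":
    main()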