我正在尝试使用pyspark构建一个flask-spark应用程序。出于测试目的,我将在url中给出一个sqlstring,它将在Spark脚本中处理,并将json数据作为response返回给浏览器。这是我的代码。
import re
from tkinter import*
from pyspark.sql import SparkSession
from flask import Flask
#from pyspark import SparkConf,SparkContext
#conf=SparkConf().setMaster('local').setAppName("TestValue")
#sc=SparkContext(conf=conf)
#sqlContext=HiveContext(sc)
#from pyspark.sql import Row
import json
app=Flask(__name__)
spark=SparkSession.builder.config("spark.sql.warehouse.dir", "C:\spark\spark-warehouse").appName("TestApp").enableHiveSupport().getOrCreate()
print("Success")
#sqlstring="SELECT lflow1.LeaseType as LeaseType, lflow1.Status as Status, lflow1.Property as property, lflow1.City as City, lesflow2.DealType as DealType, lesflow2.Area as Area, lflow1.Did as DID, lesflow2.MID as MID from lflow1, lesflow2 WHERE lflow1.Did = lesflow2.MID"
@app.route('/<sqlval>')
def queryBuilder(sqlval):
df=spark.sql(sqlval)
#df.show()
resultlist = df.toJSON().collect()
dumpdata = re.sub(r"\'", "", str(resultlist))
jsondata = json.dumps(dumpdata)
#print(jsondata)
return jsondata
#return df
#queryBuilder(sqlstring)
if __name__ == '__main__':
app.run(debug = True)
master=Tk()
entryval=Entry(master)
entryval.grid(row=0,column=1)
Button(master,text='Quit',command=master.quit).grid(row=3,column=1,sticky=W,pady=50)
mainloop()
这里sqlstring将来自localhost中的浏览器url并访问querybuilder函数,该函数将json数据作为响应返回给浏览器。当我运行spark提交时,此错误已发生
py4j.protocol.Py4JJavaError:调用None.org.apache.spark.api.java.JavaSparkContext时发生错误。 :org.apache.spark.SparkException:此JVM中只能运行一个SparkContext(请参阅SPARK-2243)。要忽略此错误,请设置spark.driver.allowMultipleContexts = true。当前运行的SparkContext创建于: org.apache.spark.api.java.JavaSparkContext(JavaSparkContext.scala:58)。
可能是我在这里缺少一些配置属性。但是我无法弄明白。我已经阅读了相关的帖子,他们提到了火花就足够了,我不需要sqlcontext但是你能帮我解决这个问题,如火花,这意味着 我怎么可能配置SparkSession以避免此问题 .Kindly帮我弄明白。谢谢
这是我的SqlCOntext样式代码
import re
from tkinter import*
import json
from pyspark.sql import HiveContext
#from pyspark.sql import SparkSession
from flask import Flask
from pyspark import SparkConf,SparkContext
conf=SparkConf("spark.driver.allowMultipleContexts = true").setMaster('local').setAppName("TestValue")
sc=SparkContext(conf=conf)
sqlContext=HiveContext(sc)
#from pyspark.sql import Row
app=Flask(__name__)
#spark=SparkSession.builder.config("spark.sql.warehouse.dir", "C:\spark\spark-warehouse").appName("TestApp").enableHiveSupport().getOrCreate()
print("Success")
#sqlstring="SELECT lflow1.LeaseType as LeaseType, lflow1.Status as Status, lflow1.Property as property, lflow1.City as City, lesflow2.DealType as DealType, lesflow2.Area as Area, lflow1.Did as DID, lesflow2.MID as MID from lflow1, lesflow2 WHERE lflow1.Did = lesflow2.MID"
@app.route('/<sqlval>')
def queryBuilder(sqlval):
df=sqlContext.sql(sqlval)
#df.show()
resultlist = df.toJSON().collect()
dumpdata = re.sub(r"\'", "", str(resultlist))
jsondata = json.dumps(dumpdata)
#print(jsondata)
return jsondata
#return df
#queryBuilder(sqlstring)
if __name__ == '__main__':
app.run(debug = True)
master=Tk()
entryval=Entry(master)
entryval.grid(row=0,column=1)
Button(master,text='Quit',command=master.quit).grid(row=3,column=1,sticky=W,pady=50)
mainloop()
我得到同样的错误