The following program throws the error NameError: name 'spark' is not defined.
Traceback (most recent call last):
  File "pgm_latest.py", line 232, in <module>
    sconf =SparkConf().set(spark.dynamicAllocation.enabled,true)
        .set(spark.dynamicAllocation.maxExecutors,300)
        .set(spark.shuffle.service.enabled,true)
        .set(spark.shuffle.spill.compress,true)
NameError: name 'spark' is not defined
spark-submit --driver-memory 12g --master yarn-cluster --executor-memory 6g --executor-cores 3 pgm_latest.py
Code:
#!/usr/bin/python
import sys
import os
from datetime import *
from time import *
from pyspark.sql import *
from pyspark import SparkContext
from pyspark import SparkConf
sc = SparkContext()
sqlCtx= HiveContext(sc)
sqlCtx.sql('SET spark.sql.autoBroadcastJoinThreshold=104857600')
sqlCtx.sql('SET Tungsten=true')
sqlCtx.sql('SET spark.sql.shuffle.partitions=500')
sqlCtx.sql('SET spark.sql.inMemoryColumnarStorage.compressed=true')
sqlCtx.sql('SET spark.sql.inMemoryColumnarStorage.batchSize=12000')
sqlCtx.sql('SET spark.sql.parquet.cacheMetadata=true')
sqlCtx.sql('SET spark.sql.parquet.filterPushdown=true')
sqlCtx.sql('SET spark.sql.hive.convertMetastoreParquet=true')
sqlCtx.sql('SET spark.sql.parquet.binaryAsString=true')
sqlCtx.sql('SET spark.sql.parquet.compression.codec=snappy')
sqlCtx.sql('SET spark.sql.hive.convertMetastoreParquet=true')
## Main functionality
def main(sc):
    # ... job logic goes here (omitted in the question) ...
    pass

if __name__ == '__main__':
    # Configure OPTIONS
    sconf = SparkConf() \
        .set("spark.dynamicAllocation.enabled", "true") \
        .set("spark.dynamicAllocation.maxExecutors", 300) \
        .set("spark.shuffle.service.enabled", "true") \
        .set("spark.shuffle.spill.compress", "true")
    sc = SparkContext(conf=sconf)
    # Execute Main functionality
    main(sc)
    sc.stop()
Answer 0 (score: 1)
I think you are using a Spark version older than 2.x.
Instead of

spark.createDataFrame(..)

use the following:

df = sqlContext.createDataFrame(...)
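A minimal sketch of what this looks like in Spark 1.x, assuming the usual entry points (SparkContext plus SQLContext/HiveContext; there is no SparkSession named spark in those versions) and made-up sample data:

from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()            # entry point in Spark 1.x
sqlContext = SQLContext(sc)    # or HiveContext(sc) if you need Hive support

# DataFrames are created through the SQLContext, not through a `spark` session
df = sqlContext.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'value'])
df.show()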
Answer 1 (score: 0)
The findspark module will come in handy here. Install it with:
python -m pip install findspark
Make sure the SPARK_HOME environment variable is set.
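If it is not set yet, one option is to set it from Python before calling findspark.init(); the path below is only a placeholder, point it at your own installation directory:

import os

# placeholder path -- replace with your actual Spark installation directory
os.environ['SPARK_HOME'] = '/opt/spark'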
Usage:
import findspark
findspark.init()
import pyspark # Call this only after findspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)
print(spark)
Answer 2 (score: 0)
Alternatively, if you know where Spark is installed, for example:
/home/user/spark/spark-2.4.0-bin-hadoop2.7/
├── LICENSE
├── NOTICE
├── R
├── README.md
├── RELEASE
├── bin
├── conf
├── data
├── examples
├── jars
├── kubernetes
├── licenses
├── python
├── sbin
└── yarn
you can explicitly pass the path of that Spark installation to the .init method:
#pyspark
findspark.init("/home/user/spark/spark-2.4.0-bin-hadoop2.7/")
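Once init() succeeds, the usual imports and session creation work; a minimal sketch, assuming a Spark 2.x build at that path:

import findspark
findspark.init("/home/user/spark/spark-2.4.0-bin-hadoop2.7/")

from pyspark.sql import SparkSession

# getOrCreate() reuses an existing session or starts a new local one
spark = SparkSession.builder.master("local[*]").appName("demo").getOrCreate()
print(spark.version)
spark.stop()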