Jupyter Notebook hangs on basic Spark Context creation

Time: 2019-03-18 14:48:20

Tags: pyspark

I am trying to put together a toy program that creates a Spark context and runs a few simple models. My code is as follows:

from pyspark.sql import SQLContext
from pyspark import SparkContext,SparkConf
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer
from pyspark.sql.types import IntegerType
from pyspark.sql.types import DoubleType
from pyspark.sql.types import *
from pyspark.sql.functions import unix_timestamp
from pyspark.sql.functions import isnan, when, count, col,trim
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import warnings
import pandas as pd
from pandas.plotting import scatter_matrix
import os.path
import re
from pyspark.sql.functions import udf, col
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import when, lit
import pyspark.sql.functions as f
import logging
import sys

conf = SparkConf().setAppName("myname-app").setMaster('local').set("spark.network.timeout", "600s")
conf.set("spark.executor.heartbeatInterval","500s")
conf.set("spark.driver.cores",4)

warnings.filterwarnings('ignore')
%matplotlib inline

sc = SparkContext.getOrCreate(conf=conf)
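
As an aside, a quick way to confirm which configuration keys actually got set (and to catch typos such as a mis-quoted or mis-capitalised `spark.network.timeout`) is to dump the conf; a minimal sketch:

    # Print every key/value pair currently held by the SparkConf,
    # so misspelled config keys are easy to spot.
    for key, value in conf.getAll():
        print(key, "=", value)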

At this point, code that was previously running fine now hangs. I have not changed anything. Here is the error message.

Yes, I had to interrupt it from the keyboard.

KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-11-f29246ef897b> in <module>()
1 sc = SparkContext.getOrCreate(conf=conf)

~/anaconda3/lib/python3.6/site-packages/pyspark/context.py in    getOrCreate(cls, conf)
332         with SparkContext._lock:
333             if SparkContext._active_spark_context is None:
334                 SparkContext(conf=conf or SparkConf())
335             return SparkContext._active_spark_context
336 

~/anaconda3/lib/python3.6/site-packages/pyspark/context.py in __init__(self,     master, appName, sparkHome, pyFiles, environment, batchSize, serializer, conf,   gateway, jsc, profiler_cls)
113         """
114         self._callsite = first_spark_call() or CallSite(None, None,  None)
115         SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)
116         try:
117             self._do_init(master, appName, sparkHome, pyFiles,   environment, batchSize, serializer,

~/anaconda3/lib/python3.6/site-packages/pyspark/context.py in   _ensure_initialized(cls, instance, gateway, conf)
281         with SparkContext._lock:
282             if not SparkContext._gateway:
283                 SparkContext._gateway = gateway or launch_gateway(conf)
284                 SparkContext._jvm = SparkContext._gateway.jvm
285 

~/anaconda3/lib/python3.6/site-packages/pyspark/java_gateway.py in  launch_gateway(conf)
 85         while gateway_port is None and proc.poll() is None:
 86             timeout = 1  # (seconds)
 87             readable, _, _ = select.select([callback_socket], [], [],  timeout)
 88             if callback_socket in readable:
 89                 gateway_connection = callback_socket.accept()[0]

KeyboardInterrupt: 

In the logs, I get the following:

 Exception in thread "main" java.util.NoSuchElementException: key not found: _PYSPARK_DRIVER_CONN_INFO_PATH
    at scala.collection.MapLike$class.default(MapLike.scala:228)
    at scala.collection.AbstractMap.default(Map.scala:59)
    at scala.collection.MapLike$class.apply(MapLike.scala:141)
    at scala.collection.AbstractMap.apply(Map.scala:59)
    at  org.apache.spark.api.python.PythonGatewayServer$.main(PythonGatewayServer.scala:64)
    at org.apache.spark.api.python.PythonGatewayServer.main(PythonGatewayServer.scala)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:498)
    at org.apache.spark.deploy.JavaMainApplication.start(SparkApplication.scala:52)
    at org.apache.spark.deploy.SparkSubmit.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:849)
    at org.apache.spark.deploy.SparkSubmit.doRunMain$1(SparkSubmit.scala:167)
    at org.apache.spark.deploy.SparkSubmit.submit(SparkSubmit.scala:195)
    at org.apache.spark.deploy.SparkSubmit.doSubmit(SparkSubmit.scala:86)
    at org.apache.spark.deploy.SparkSubmit$$anon$2.doSubmit(SparkSubmit.scala:924)
    at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:933)
    at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala) 

I have no idea how this suddenly started happening.

I am running Spark 2.4.0 and Python 3.
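
Given that the JVM side fails on `_PYSPARK_DRIVER_CONN_INFO_PATH` while the Python traceback still goes through the older callback-socket handshake, this looks like it could be a version mismatch between the pyspark package installed under Anaconda and the Spark 2.4.0 installation that launches the gateway. A minimal check, assuming the external installation is picked up via SPARK_HOME:

    import os
    import subprocess

    import pyspark

    # Version of the pyspark Python package the notebook imports
    print("pyspark package version:", pyspark.__version__)

    # Spark installation that actually launches the JVM gateway, if SPARK_HOME is set
    spark_home = os.environ.get("SPARK_HOME")
    print("SPARK_HOME:", spark_home)
    if spark_home:
        # spark-submit --version reports the version of that installation;
        # the two versions should match (here, both 2.4.0).
        subprocess.run([os.path.join(spark_home, "bin", "spark-submit"), "--version"])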

0 Answers:

There are no answers yet.