如何使用iPython从Spark连接到Cassandra?
我参照 here 中的代码做了一些修改,代码如下:
import os
import sys

# Path for spark source folder.
# Raw strings keep the Windows backslashes literal: "\A", "\s" and "\p" are
# not valid escape sequences, and newer Python versions warn about them.
os.environ['SPARK_HOME'] = r"C:\Apache\spark-1.4.1"

# Append pyspark to Python Path (must happen before the pyspark imports below)
sys.path.append(r"C:\Apache\spark-1.4.1\python")

from pyspark import SparkContext
from pyspark.sql import SQLContext

# Cassandra connection settings
host = 'localhost'
keyspace = 'demo'
cf = 'users'  # column family to read

sc = SparkContext(appName="CassandraInputFormat")

# Hadoop job configuration handed to the Cassandra input format
conf = {
    "cassandra.input.thrift.address": host,
    "cassandra.input.thrift.port": "9160",
    "cassandra.input.keyspace": keyspace,
    "cassandra.input.columnfamily": cf,
    "cassandra.input.partitioner.class": "Murmur3Partitioner",
    "cassandra.input.page.row.size": "3",
}

# The input format and the key/value converters are JVM classes referenced by
# name; the JVM started by pyspark must have the jars that contain them on its
# classpath, otherwise this call raises java.lang.ClassNotFoundException.
cass_rdd = sc.newAPIHadoopRDD(
    "org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat",
    "java.util.Map",
    "java.util.Map",
    keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
    valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
    conf=conf)

# Pull every row back to the driver
output = cass_rdd.collect()
当我在 IPython Notebook 中运行它时,得到了下面的错误:
--------------------------------------------------------------------------- Py4JJavaError Traceback (most recent call last) <ipython-input-15-458818fce35c> in <module>()
29 keyConverter="org.apache.spark.examples.pythonconverters.CassandraCQLKeyConverter",
30 valueConverter="org.apache.spark.examples.pythonconverters.CassandraCQLValueConverter",
---> 31 conf=conf)
32
33 output = cass_rdd.collect()
C:\Apache\spark-1.4.1\python\pyspark\context.pyc in newAPIHadoopRDD(self, inputFormatClass, keyClass, valueClass, keyConverter, valueConverter, conf, batchSize)
599 jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormatClass, keyClass,
600 valueClass, keyConverter, valueConverter,
--> 601 jconf, batchSize)
602 return RDD(jrdd, self)
603
C:\Users\lauthiamkok\inotebook\lib\site-packages\py4j-0.9-py2.7.egg\py4j\java_gateway.pyc in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
C:\Users\lauthiamkok\inotebook\lib\site-packages\py4j-0.9-py2.7.egg\py4j\protocol.pyc in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling {0}{1}{2}.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
还有更多......
> Py4JJavaError: An error occurred while calling
> z:org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD. :
> java.lang.ClassNotFoundException:
> org.apache.cassandra.hadoop.cql3.CqlPagingInputFormat at
> java.net.URLClassLoader.findClass(Unknown Source) at
> java.lang.ClassLoader.loadClass(Unknown Source) at
> java.lang.ClassLoader.loadClass(Unknown Source) at
> java.lang.Class.forName0(Native Method) at
> java.lang.Class.forName(Unknown Source) at
> org.apache.spark.util.Utils$.classForName(Utils.scala:179) at
> org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDDFromClassNames(PythonRDD.scala:520)
> at
> org.apache.spark.api.python.PythonRDD$.newAPIHadoopRDD(PythonRDD.scala:503)
> at
> org.apache.spark.api.python.PythonRDD.newAPIHadoopRDD(PythonRDD.scala)
> at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at
> sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source) at
> sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source) at
> java.lang.reflect.Method.invoke(Unknown Source) at
> py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231) at
> py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379) at
> py4j.Gateway.invoke(Gateway.java:259) at
> py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
> at py4j.commands.CallCommand.execute(CallCommand.java:79) at
> py4j.GatewayConnection.run(GatewayConnection.java:207) at
> java.lang.Thread.run(Unknown Source)
有人知道我遗漏了什么吗?
答案 0 :(得分:0)
原因看起来是:java.lang.ClassNotFoundException
,这意味着没有找到所需的 jar 包,jar 包的路径没有正确设置。
您可以从命令行运行示例吗?
Run with example jar:
# Submit the example script with the examples jar on the driver classpath,
# so the JVM can load the Cassandra input format and converter classes.
./bin/spark-submit --driver-class-path /path/to/example/jar \
/path/to/examples/cassandra_inputformat.py <host> <keyspace> <cf>