我正在尝试将csv文件的内容设为CoordinateMatrix:
数据集在这里http://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt 我转换为csv文件。
# Build a small example CoordinateMatrix from explicit MatrixEntry objects,
# transpose it, and convert it to an IndexedRowMatrix.
from pyspark.mllib.linalg.distributed import RowMatrix, IndexedRowMatrix, CoordinateMatrix, MatrixEntry

# Each MatrixEntry is (row index, column index, value).
entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2), MatrixEntry(2, 1, 3.7)])
# Parenthesized print: valid on both Python 2 and 3 and consistent with the
# print(...) call below (was the Python-2-only statement `print type(entries)`).
print(type(entries))
mat = CoordinateMatrix(entries)
mat_transposed = mat.transpose()
mat_transposed = mat_transposed.toIndexedRowMatrix()
print(mat_transposed.numRows())
# # Load CSV File to a PySpark DataFrame
#
# Load the CSV file as a plain text RDD and strip the header row.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
#df= sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('dataset.csv')
data = sc.textFile("dataset.csv")
#myrdd = sc.textFile("dataset.csv").map(lambda line: line.split(","))
# Tuple-unpacking lambda parameters (`lambda (row, index):`) were removed in
# Python 3 (PEP 3113); index the (value, index) pair instead.
noHeaderRDD = data.zipWithIndex().filter(lambda pair: pair[1] > 0).keys()
# NOTE: CoordinateMatrix(noHeaderRDD) fails with
# "Cannot convert type <type 'unicode'> into MatrixEntry" — each RDD element
# is still an unparsed CSV line (a string). The lines must first be split and
# converted to (i, j, value) tuples or MatrixEntry objects.
#mat_data = CoordinateMatrix(noHeaderRDD)
我想把我的csv文件的内容放到坐标矩阵(CoordinateMatrix)中,但在尝试构造CoordinateMatrix之后出现了如下错误:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-52-718d73b097b2> in <module>()
----> 1 mat_data = CoordinateMatrix(noHeaderRDD)
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\mllib\linalg\distributed.pyc in __init__(self, entries, numRows, numCols)
797 # each be easily serialized. We will convert back to
798 # MatrixEntry inputs on the Scala side.
--> 799 java_matrix = callMLlibFunc("createCoordinateMatrix", entries.toDF(),
800 long(numRows), long(numCols))
801 elif (isinstance(entries, JavaObject)
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\session.pyc in toDF(self, schema, sampleRatio)
55 [Row(name=u'Alice', age=1)]
56 """
---> 57 return sparkSession.createDataFrame(self, schema, sampleRatio)
58
59 RDD.toDF = toDF
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\session.pyc in createDataFrame(self, data, schema, samplingRatio, verifySchema)
533
534 if isinstance(data, RDD):
--> 535 rdd, schema = self._createFromRDD(data.map(prepare), schema, samplingRatio)
536 else:
537 rdd, schema = self._createFromLocal(map(prepare, data), schema)
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\session.pyc in _createFromRDD(self, rdd, schema, samplingRatio)
373 """
374 if schema is None or isinstance(schema, (list, tuple)):
--> 375 struct = self._inferSchema(rdd, samplingRatio)
376 converter = _create_converter(struct)
377 rdd = rdd.map(converter)
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\session.pyc in _inferSchema(self, rdd, samplingRatio)
344 :return: :class:`pyspark.sql.types.StructType`
345 """
--> 346 first = rdd.first()
347 if not first:
348 raise ValueError("The first row in RDD is empty, "
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\rdd.pyc in first(self)
1359 ValueError: RDD is empty
1360 """
-> 1361 rs = self.take(1)
1362 if rs:
1363 return rs[0]
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\rdd.pyc in take(self, num)
1341
1342 p = range(partsScanned, min(partsScanned + numPartsToTry, totalParts))
-> 1343 res = self.context.runJob(self, takeUpToNumLeft, p)
1344
1345 items += res
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\context.pyc in runJob(self, rdd, partitionFunc, partitions, allowLocal)
990 # SparkContext#runJob.
991 mappedRDD = rdd.mapPartitions(partitionFunc)
--> 992 port = self._jvm.PythonRDD.runJob(self._jsc.sc(), mappedRDD._jrdd, partitions)
993 return list(_load_from_socket(port, mappedRDD._jrdd_deserializer))
994
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\java_gateway.py in __call__(self, *args)
1131 answer = self.gateway_client.send_command(command)
1132 return_value = get_return_value(
-> 1133 answer, self.gateway_client, self.target_id, self.name)
1134
1135 for temp_arg in temp_args:
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\sql\utils.pyc in deco(*a, **kw)
61 def deco(*a, **kw):
62 try:
---> 63 return f(*a, **kw)
64 except py4j.protocol.Py4JJavaError as e:
65 s = e.java_exception.toString()
D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\py4j-0.10.4-src.zip\py4j\protocol.py in get_return_value(answer, gateway_client, target_id, name)
317 raise Py4JJavaError(
318 "An error occurred while calling {0}{1}{2}.\n".
--> 319 format(target_id, ".", name), value)
320 else:
321 raise Py4JError(
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 16.0 failed 1 times, most recent failure: Lost task 0.0 in stage 16.0 (TID 27, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 177, in main
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 172, in process
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\rdd.py", line 1339, in takeUpToNumLeft
yield next(iterator)
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\mllib\linalg\distributed.py", line 748, in _convert_to_matrix_entry
raise TypeError("Cannot convert type %s into MatrixEntry" % type(entry))
TypeError: Cannot convert type <type 'unicode'> into MatrixEntry
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:446)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:280)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:214)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 177, in main
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\worker.py", line 172, in process
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\serializers.py", line 268, in dump_stream
vs = list(itertools.islice(iterator, batch))
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\pyspark\rdd.py", line 1339, in takeUpToNumLeft
yield next(iterator)
File "D:\opt\spark\spark-2.2.0-bin-hadoop2.7\python\lib\pyspark.zip\pyspark\mllib\linalg\distributed.py", line 748, in _convert_to_matrix_entry
raise TypeError("Cannot convert type %s into MatrixEntry" % type(entry))
TypeError: Cannot convert type <type 'unicode'> into MatrixEntry
at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:108)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
... 1 more
有什么办法吗?似乎我需要将我的csv文件的内容作为MatrixEntry对象?
TypeError: Cannot convert type <type 'unicode'> into MatrixEntry
答案 0（得分：1）
TL;DR 您的代码失败,是因为您向 CoordinateMatrix 传入了未解析的字符串。您需要先把数据解析成可以转换为 MatrixEntry 的形式。
如果dataset.csv
看起来像:
i,j,v
1,2,3
2,1,0
你可以:
from pyspark.sql import SparkSession

# Reuse the active Spark session (or create one if none exists).
spark = SparkSession.builder.getOrCreate()

# Read the CSV with an explicit schema so every row is already typed as
# (long, long, double) and can be converted into a MatrixEntry on the
# Scala side.
reader = spark.read.option("header", "true").schema("i long, j long, v double")
entries_df = reader.csv("dataset.csv")

# CoordinateMatrix accepts the typed RDD of Row(i, j, v) tuples directly.
mat = CoordinateMatrix(entries_df.rdd)
在 Spark 2.3 之前(DDL 字符串形式的 schema 尚不可用),请将:
.schema("i long, j long, v double")
替换为:
from pyspark.sql.types import *
# Programmatic StructType equivalent of the DDL string schema; needed on
# Spark versions before 2.3, where DataFrameReader.schema did not accept a
# DDL-formatted string. (Fragment: replaces the .schema(...) call above.)
.schema(StructType([
    StructField("i", LongType()),
    StructField("j", LongType()),
    StructField("v", DoubleType())
]))