I am trying to create a pyspark dataframe from a pandas dataframe.
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType
a_dict = {0: [(0, 9.821), (1, 82.185)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([ StructField("row_num", LongType(), True),StructField("val", list(a_str), True)]) # error
a_df = spark.createDataFrame(a_pd, schema=my_schema)
Error:
AssertionError: dataType [StructField(id,IntegerType,true), StructField(prob,DoubleType,true)] should be an instance of <class 'pyspark.sql.types.DataType'>
How do I define a valid schema for the "val" column, a list of tuples of (int, double), so that pyspark can understand it?
Thanks
Answer 0 (score: 0)
The schema fails because list(a_str) is a plain Python list of StructFields, not a DataType. For a column that holds a list of values you must use ArrayType, here wrapping the struct. Below is sample code you can copy and run.
import pandas as pd
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType, ArrayType
a_dict = {0: [(0, 9.821), (1, 82.185)],
1: [(0, 9.821), (1, 8.10), (3, 2.385)],
2: [(0, 9.821), (1, 1.4485), (4, 5.15), (5, 6.104)]}
a_pd = pd.DataFrame.from_dict(a_dict.items())
a_pd.columns = ["row_num", "val"]
print(a_pd.head())
a_str = StructType([StructField("id", IntegerType(), True), StructField("prob", DoubleType(), True)])
my_schema = StructType([StructField("row_num", LongType(), True), StructField("val", ArrayType(a_str), True)])  # ArrayType wraps the (id, prob) struct
a_df = spark.createDataFrame(a_pd, schema=my_schema)
a_df.show(truncate=False)
a_df.printSchema()
Output:
+-------+------------------------------------------------+
|row_num|val |
+-------+------------------------------------------------+
|0 |[[0, 9.821], [1, 82.185]] |
|1 |[[0, 9.821], [1, 8.1], [3, 2.385]] |
|2 |[[0, 9.821], [1, 1.4485], [4, 5.15], [5, 6.104]]|
+-------+------------------------------------------------+
root
|-- row_num: long (nullable = true)
|-- val: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- id: integer (nullable = true)
| | |-- prob: double (nullable = true)
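As a follow-up, once a_df is built with this schema you can flatten the nested column with explode if you need one row per (id, prob) pair. This is a minimal sketch assuming the a_df created above; the names "pair" and "pairs" are just illustrative:
from pyspark.sql.functions import col, explode
# Each element of the "val" array becomes its own row, then the struct
# fields are pulled up into top-level columns.
pairs = (a_df
         .select("row_num", explode("val").alias("pair"))
         .select("row_num", col("pair.id").alias("id"), col("pair.prob").alias("prob")))
pairs.show()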