我有一个架构如下的数据框：
root
 |-- state: struct (nullable = true)
 |    |-- fld: integer (nullable = true)
我想在 state 结构中添加列，即得到类似这样的架构：
root
 |-- state: struct (nullable = true)
 |    |-- fld: integer (nullable = true)
 |    |-- a: integer (nullable = true)
但这样尝试之后我得到的却是：
root
 |-- state: struct (nullable = true)
 |    |-- fld: integer (nullable = true)
 |-- state.a: integer (nullable = true)
答案 0 :(得分:6)
以下是一种不使用 udf 的方法：
# create example dataframe
import pyspark.sql.functions as f
# BUG FIX: StructType / StructField / IntegerType were used below without
# being imported, which raises NameError.
from pyspark.sql.types import StructType, StructField, IntegerType

data = [
    ({'fld': 0},)
]

# Schema: one struct column `state` containing a single integer field `fld`.
schema = StructType(
    [
        StructField(
            'state',
            StructType(
                [StructField('fld', IntegerType())]
            )
        )
    ]
)

# NOTE(review): `sqlCtx` (legacy SQLContext) must already exist in the
# session; on Spark 2+ the usual entry point is `spark.createDataFrame`.
df = sqlCtx.createDataFrame(data, schema)
df.printSchema()
#root
# |-- state: struct (nullable = true)
# | |-- fld: integer (nullable = true)
现在使用withColumn()
并使用lit()
和alias()
添加新字段。
# Rebuild the `state` struct: keep the existing `fld` field and attach a new
# literal field `a` (lit() + alias() give it a value and a name).
val = 1
new_state = f.struct(
    f.col('state')['fld'].alias('fld'),
    f.lit(val).alias('a'),
)
df_new = df.withColumn('state', new_state)
df_new.printSchema()
#root
# |-- state: struct (nullable = false)
# | |-- fld: integer (nullable = true)
# | |-- a: integer (nullable = false)
如果嵌套结构中有很多字段,则可以使用列表推导,使用df.schema["state"].dataType.names
获取字段名称。例如:
# When the struct has many fields, enumerate them from the schema instead of
# spelling each one out by hand.
val = 1
s_fields = df.schema["state"].dataType.names  # ['fld']
kept = [f.col('state')[name].alias(name) for name in s_fields]
df_new = df.withColumn('state', f.struct(*kept, f.lit(val).alias('a')))
df_new.printSchema()
#root
# |-- state: struct (nullable = false)
# | |-- fld: integer (nullable = true)
# | |-- a: integer (nullable = false)
参考
答案 1 :(得分:3)
尽管答案为时已晚,但对于pyspark 2.x.x版,以下支持。
假设有问题,dfOld
已包含state
和fld
。
from pyspark.sql.functions import col, lit, struct

# BUG FIX: withColumn() requires a Column as its second argument (a bare
# string raises an error on modern Spark), and DataFrames are immutable, so
# the result must be assigned back.
dfOld = dfOld.withColumn("a", lit("value"))
# Re-wrap `state.fld` together with the new top-level column `a` into `state`.
dfNew = dfOld.select(
    "level1Field1",
    "level1Field2",
    struct(col("state.fld").alias("fld"), col("a")).alias("state"),
)
参考:https://medium.com/@mrpowers/adding-structtype-columns-to-spark-dataframes-b44125409803
答案 2 :(得分:1)
使用如下转换:
import pyspark.sql.functions as f

# Expand every existing field of `state` with "state.*", append the new
# literal field `a`, and re-wrap everything into the `state` struct.
new_state = f.struct(
    f.col("state.*"),
    f.lit(123).alias("a")
)
df = df.withColumn("state", new_state)
答案 3 :(得分:0)
from pyspark.sql.functions import *
from pyspark.sql.types import *
def add_field_in_dataframe(nfield, df, dt):
    """Add a null field of type ``dt`` at dotted path ``nfield`` in ``df``.

    Works for arbitrarily deep paths (e.g. "state.sub.a") by rebuilding each
    enclosing struct from the innermost level outwards.

    BUG FIX: the original used Python-2 ``print`` statements, which are a
    SyntaxError on Python 3; the debug prints have been removed.

    Parameters:
        nfield: dotted path of the field to add, e.g. "state.a".
        df:     the source DataFrame.
        dt:     Spark data type (or type name) for the new field.

    Returns:
        A new DataFrame with the field added (value is NULL).
    """
    fields = nfield.split(".")
    n = len(fields)
    addField = fields[0]
    # Trivial case: a plain top-level column.
    if n == 1:
        return df.withColumn(addField, lit(None).cast(dt))

    # Innermost level: rebuild the deepest struct with the new field appended.
    nestedField = ".".join(fields[:-1])
    sfields = df.select(nestedField).schema[fields[-2]].dataType.names
    ac = col(nestedField)
    if n == 2:
        nc = struct(*([ac[c].alias(c) for c in sfields]
                      + [lit(None).cast(dt).alias(fields[-1])]))
    else:
        nc = struct(*([ac[c].alias(c) for c in sfields]
                      + [lit(None).cast(dt).alias(fields[-1])])).alias(fields[-2])

    # Walk back up the path, re-wrapping each parent struct around the
    # already-rebuilt child (`nc`), skipping the child's old copy.
    n = n - 1
    while n > 1:
        fields = fields[:-1]
        nestedField = ".".join(fields[:-1])
        sfields = df.select(nestedField).schema[fields[-2]].dataType.names
        sfields = [s for s in sfields if s != fields[-1]]
        ac = col(".".join(fields[:-1]))
        if n > 2:
            nc = struct(*([ac[c].alias(c) for c in sfields] + [nc])).alias(fields[-2])
        else:
            nc = struct(*([ac[c].alias(c) for c in sfields] + [nc]))
        n = n - 1
    return df.withColumn(addField, nc)
答案 4 :(得分:0)
这是实现 没有 udf的一种方法。
初始化示例数据框:
@api_view(('POST',))
@csrf_exempt
# BUG FIX: @renderer_classes expects an iterable of renderer classes;
# `renderer_classes(JSONRenderer,)` passed the bare class (the trailing comma
# in a call does NOT make a tuple), which fails when DRF iterates it.
@renderer_classes((JSONRenderer,))
def project_image_alternative_form_submit_ajax(request, object_id):
    """Handle an AJAX POST for a ProjectImage and return a JSON message.

    NOTE(review): this Django/DRF snippet appears unrelated to the
    surrounding Spark answer — likely pasted here by mistake.
    """
    # .first() returns None when no row matches; the result is currently
    # unused — presumably a placeholder for real processing.
    project_image = ProjectImage.objects.filter(pk=object_id).first()
    response_json = {
        'message': 'Image ...',
    }
    return Response(response_json, status=status.HTTP_200_OK)
userImg.sprite = Sprite.Create(iUserProfile.image, 0, 0, new Vector2(50,50), Vector.zero)
# Build an example dataframe from inline JSON.
# BUG FIX: a stray "Spark" token before the assignment was a syntax error.
nested_df1 = (spark.read.json(sc.parallelize(["""[
{ "state": {"fld": 1} },
{ "state": {"fld": 2}}
]"""])))
nested_df1.printSchema()
.read.json 默认将所有整数导入为 long：
root
 |-- state: struct (nullable = true)
 |    |-- fld: long (nullable = true)
如果 state.fld 必须是 int，则需要强制转换。
from pyspark.sql import functions as F

# BUG FIX: cast() must come BEFORE alias(); casting after alias() discards
# the name (the original's printed schema showed the auto-generated field
# name `col1` instead of `fld`).
nested_df1 = (nested_df1
    .select(F.struct(F.col("state.fld").cast('int').alias("fld")).alias("state")))
nested_df1.printSchema()
root
|-- state: struct (nullable = false)
| |-- col1: integer (nullable = true)
nested_df1.show() 的输出：
使用+-----+
|state|
+-----+
| [1]|
| [2]|
+-----+
使用 .select("parent.child") 从现有结构中获取所需的嵌套列，创建新列，然后将旧列与新列重新包装在 struct 中。
val_a = 3
# BUG FIX: the dataframe built above is `nested_df1`; `nested_df` was an
# undefined name (NameError).
nested_df2 = (nested_df1
    .select(
        # Re-wrap the existing `state.fld` together with the new literal
        # field `a` into a fresh `state` struct.
        F.struct(
            F.col("state.fld"),
            F.lit(val_a).alias("a")
        ).alias("state")
    )
)
nested_df2.printSchema()
root
 |-- state: struct (nullable = false)
 |    |-- fld: integer (nullable = true)
 |    |-- a: integer (nullable = false)
如果需要，可以用 "parent.*" 将结构展开。
nested_df2.show()
+------+
| state|
+------+
|[1, 3]|
|[2, 3]|
+------+
"parent.*"
# Flatten the struct with "state.*" to inspect its fields at the top level.
nested_df2.select("state.*").printSchema()