我有一个嵌套的json文件,其结构如下:
{
"value": {
"employee": {
"employeeid": "1234",
"employeename": "ABCD",
"contactNumber": [
{
"type": "Work",
"phoneNumber": "1234567890"
},
{
"type": "Home",
"phoneNumber": "0987654321"
}
] }}}
我需要展平json中存在的此数组,并使用胶水将其写入Amazon RDS中的表。我尝试创建以下内容:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "test", table_name = "employee", transformation_ctx = "datasource0")
relationalized_json = datasource0.relationalize(root_table_name = "root", staging_path = args["TempDir"])
root_df = relationalized_json.select('root')
applymapping1 = ApplyMapping.apply(frame = root_df, mappings= [("`value.employee.employeeid`","string", "employeeid","string"),("`value.employee.employeename`","string", "employeename","string"), ("`value.employee.employeename.contactNumber.value.type`","string", "type","string"),("`value.employee.employeename.contactNumber.value.phoneNumber`","string", "phoneNumber","string")]transformation_ctx = "applymapping1")
datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = applymapping1, catalog_connection = "test", connection_options = {"dbtable": "employee", "database": "testdb"}, transformation_ctx = "datasink1")
它正在填充所有列的值,但类型和电话号码除外。 类型和电话号码填充为空。 请让我在这里知道这个问题吗?
答案 0 :(得分:0)
我修改了这样的代码:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import *
from pyspark.sql import functions as F
## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "test", table_name = "taskjson_taskorderassignment_0429_2020_05_01_json", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "test", table_name = "employee", transformation_ctx = "datasource0")
## @type: ApplyMapping
## @args: [mapping = [("id", "string", "id", "string"), ("type", "string", "type", "string"), ("datacontenttype", "string", "datacontenttype", "string"), ("time", "string", "time", "string"), ("specversion", "string", "specversion", "string"), ("source", "string", "source", "string"), ("data.taskOrder.taskOrderIdentifier", "string", "`data.taskOrder.taskOrderIdentifier`", "string"), ("data.taskOrder.taskOrderTypeCode", "string", "`data.taskOrder.taskOrderTypeCode`", "string"), ("data.taskOrder.taskOrderRequestDateTime", "string", "`data.taskOrder.taskOrderRequestDateTime`", "string"), ("data.taskOrder.taskOrderRequestedPickupTime", "string", "`data.taskOrder.taskOrderRequestedPickupTime`", "string"), ("data.taskOrder.taskOrderRequestedDeliveryTime", "string", "`data.taskOrder.taskOrderRequestedDeliveryTime`", "string"), ("data.taskOrder.taskOrderRequestedCounselingDate", "string", "`data.taskOrder.taskOrderRequestedCounselingDate`", "string"), ("data.taskOrder.taskOrderPackagingDate", "string", "`data.taskOrder.taskOrderPackagingDate`", "string"), ("data.taskOrder.movePriorityCode", "string", "`data.taskOrder.movePriorityCode`", "string"), ("data.taskOrder.isSafeMove", "boolean", "`data.taskOrder.isSafeMove`", "boolean"), ("data.taskOrder.restrictionIndicator", "boolean", "`data.taskOrder.restrictionIndicator`", "boolean"), ("data.taskOrder.taskOrderRequester", "string", "`data.taskOrder.taskOrderRequester`", "string"), ("data.taskOrder.taskOrderApprovedbyUser", "string", "`data.taskOrder.taskOrderApprovedbyUser`", "string"), ("data.taskOrder.taskOrderApprovedDateTime", "string", "`data.taskOrder.taskOrderApprovedDateTime`", "string"), ("data.taskOrder.taskOrderStatus", "string", "`data.taskOrder.taskOrderStatus`", "string"), ("data.taskOrder.taskOrderNotes", "string", "`data.taskOrder.taskOrderNotes`", "string"), ("data.taskOrder.isVIP", "boolean", "`data.taskOrder.isVIP`", "boolean"), ("data.taskOrder.isBlueBark", "boolean", "`data.taskOrder.isBlueBark`", "boolean"), ("data.taskOrder.isEntitlementOrdered", "boolean", "`data.taskOrder.isEntitlementOrdered`", "boolean"), ("data.taskOrder.entitelmentData", "array", "`data.taskOrder.entitelmentData`", "string"), ("data.taskOrder.serviceMember.serviceMemberIdentifier", "string", "`data.taskOrder.serviceMember.serviceMemberIdentifier`", "string"), ("data.taskOrder.serviceMember.serviceMemberTitle", "string", "`data.taskOrder.serviceMember.serviceMemberTitle`", "string"), ("data.taskOrder.serviceMember.serviceMemberFirstName", "string", "`data.taskOrder.serviceMember.serviceMemberFirstName`", "string"), ("data.taskOrder.serviceMember.serviceMemberMiddleName", "string", "`data.taskOrder.serviceMember.serviceMemberMiddleName`", "string"), ("data.taskOrder.serviceMember.serviceMemberLastName", "string", "`data.taskOrder.serviceMember.serviceMemberLastName`", "string"), ("data.taskOrder.serviceMember.serviceMemberlastFourSSN", "string", "`data.taskOrder.serviceMember.serviceMemberlastFourSSN`", "string"), ("data.taskOrder.serviceMember.serviceMemberGender", "string", "`data.taskOrder.serviceMember.serviceMemberGender`", "string"), ("data.taskOrder.serviceMember.serviceUserDODIdentifier", "string", "`data.taskOrder.serviceMember.serviceUserDODIdentifier`", "string"), ("data.taskOrder.serviceMember.serviceMemberRank", "string", "`data.taskOrder.serviceMember.serviceMemberRank`", "string"), ("data.taskOrder.serviceMember.contactInfo.email", "string", "`data.taskOrder.serviceMember.contactInfo.email`", "string"), ("data.taskOrder.serviceMember.contactInfo.alternateEmail", "string", "`data.taskOrder.serviceMember.contactInfo.alternateEmail`", "string"), ("data.taskOrder.serviceMember.contactInfo.primaryCommunicationPref", "string", "`data.taskOrder.serviceMember.contactInfo.primaryCommunicationPref`", "string"), ("data.taskOrder.serviceMember.contactInfo.secondaryCommunicationPref", "string", "`data.taskOrder.serviceMember.contactInfo.secondaryCommunicationPref`", "string"), ("data.taskOrder.serviceMember.contactInfo.contactNumber", "array", "`data.taskOrder.serviceMember.contactInfo.contactNumber`", "string"), ("data.taskOrder.authorizedRepresentative.authRepIdentifier", "string", "`data.taskOrder.authorizedRepresentative.authRepIdentifier`", "string"), ("data.taskOrder.authorizedRepresentative.authRepTitle", "string", "`data.taskOrder.authorizedRepresentative.authRepTitle`", "string"), ("data.taskOrder.authorizedRepresentative.authRepFirstName", "string", "`data.taskOrder.authorizedRepresentative.authRepFirstName`", "string"), ("data.taskOrder.authorizedRepresentative.authRepMiddleName", "string", "`data.taskOrder.authorizedRepresentative.authRepMiddleName`", "string"), ("data.taskOrder.authorizedRepresentative.authRepLastName", "string", "`data.taskOrder.authorizedRepresentative.authRepLastName`", "string"), ("data.taskOrder.authorizedRepresentative.authRepEffectiveStartDate", "string", "`data.taskOrder.authorizedRepresentative.authRepEffectiveStartDate`", "string"), ("data.taskOrder.authorizedRepresentative.authRepEffectiveEndDate", "string", "`data.taskOrder.authorizedRepresentative.authRepEffectiveEndDate`", "string"), ("data.taskOrder.authorizedRepresentative.authRepGender", "string", "`data.taskOrder.authorizedRepresentative.authRepGender`", "string"), ("data.taskOrder.authorizedRepresentative.contactInfo.email", "string", "`data.taskOrder.authorizedRepresentative.contactInfo.email`", "string"), ("data.taskOrder.authorizedRepresentative.contactInfo.alternateEmail", "string", "`data.taskOrder.authorizedRepresentative.contactInfo.alternateEmail`", "string"), ("data.taskOrder.authorizedRepresentative.contactInfo.primaryCommunicationPref", "string", "`data.taskOrder.authorizedRepresentative.contactInfo.primaryCommunicationPref`", "string"), ("data.taskOrder.authorizedRepresentative.contactInfo.secondaryCommunicationPref", "string", "`data.taskOrder.authorizedRepresentative.contactInfo.secondaryCommunicationPref`", "string"), ("data.taskOrder.authorizedRepresentative.contactInfo.contactNumber", "array", "`data.taskOrder.authorizedRepresentative.contactInfo.contactNumber`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerIdentifier", "string", "`data.taskOrder.orderingOfficer.orderingOfficerIdentifier`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerTitle", "string", "`data.taskOrder.orderingOfficer.orderingOfficerTitle`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerFirstName", "string", "`data.taskOrder.orderingOfficer.orderingOfficerFirstName`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerMiddleName", "string", "`data.taskOrder.orderingOfficer.orderingOfficerMiddleName`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerLastName", "string", "`data.taskOrder.orderingOfficer.orderingOfficerLastName`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerGender", "string", "`data.taskOrder.orderingOfficer.orderingOfficerGender`", "string"), ("data.taskOrder.orderingOfficer.orderingOfficerDODIdentifier", "string", "`data.taskOrder.orderingOfficer.orderingOfficerDODIdentifier`", "string"), ("data.taskOrder.orderingOfficer.contactInfo.email", "string", "`data.taskOrder.orderingOfficer.contactInfo.email`", "string"), ("data.taskOrder.orderingOfficer.contactInfo.alternateEmail", "string", "`data.taskOrder.orderingOfficer.contactInfo.alternateEmail`", "string"), ("data.taskOrder.orderingOfficer.contactInfo.primaryCommunicationPref", "string", "`data.taskOrder.orderingOfficer.contactInfo.primaryCommunicationPref`", "string"), ("data.taskOrder.orderingOfficer.contactInfo.secondaryCommunicationPref", "string", "`data.taskOrder.orderingOfficer.contactInfo.secondaryCommunicationPref`", "string"), ("data.taskOrder.orderingOfficer.contactInfo.contactNumber", "array", "`data.taskOrder.orderingOfficer.contactInfo.contactNumber`", "string"), ("data.taskOrder.addressInfo.destinationAddress.addressLine1", "string", "`data.taskOrder.addressInfo.destinationAddress.addressLine1`", "string"), ("data.taskOrder.addressInfo.destinationAddress.addressLine2", "string", "`data.taskOrder.addressInfo.destinationAddress.addressLine2`", "string"), ("data.taskOrder.addressInfo.destinationAddress.addressLine3", "string", "`data.taskOrder.addressInfo.destinationAddress.addressLine3`", "string"), ("data.taskOrder.addressInfo.destinationAddress.longitude", "string", "`data.taskOrder.addressInfo.destinationAddress.longitude`", "string"), ("data.taskOrder.addressInfo.destinationAddress.latitude", "string", "`data.taskOrder.addressInfo.destinationAddress.latitude`", "string"), ("data.taskOrder.addressInfo.destinationAddress.city", "string", "`data.taskOrder.addressInfo.destinationAddress.city`", "string"), ("data.taskOrder.addressInfo.destinationAddress.county", "string", "`data.taskOrder.addressInfo.destinationAddress.county`", "string"), ("data.taskOrder.addressInfo.destinationAddress.stateOrProvince", "string", "`data.taskOrder.addressInfo.destinationAddress.stateOrProvince`", "string"), ("data.taskOrder.addressInfo.destinationAddress.postalCode", "string", "`data.taskOrder.addressInfo.destinationAddress.postalCode`", "string"), ("data.taskOrder.addressInfo.destinationAddress.countryCode", "string", "`data.taskOrder.addressInfo.destinationAddress.countryCode`", "string"), ("data.taskOrder.addressInfo.originAddress.addressLine1", "string", "`data.taskOrder.addressInfo.originAddress.addressLine1`", "string"), ("data.taskOrder.addressInfo.originAddress.addressLine2", "string", "`data.taskOrder.addressInfo.originAddress.addressLine2`", "string"), ("data.taskOrder.addressInfo.originAddress.addressLine3", "string", "`data.taskOrder.addressInfo.originAddress.addressLine3`", "string"), ("data.taskOrder.addressInfo.originAddress.longitude", "string", "`data.taskOrder.addressInfo.originAddress.longitude`", "string"), ("data.taskOrder.addressInfo.originAddress.latitude", "string", "`data.taskOrder.addressInfo.originAddress.latitude`", "string"), ("data.taskOrder.addressInfo.originAddress.city", "string", "`data.taskOrder.addressInfo.originAddress.city`", "string"), ("data.taskOrder.addressInfo.originAddress.county", "string", "`data.taskOrder.addressInfo.originAddress.county`", "string"), ("data.taskOrder.addressInfo.originAddress.stateOrProvince", "string", "`data.taskOrder.addressInfo.originAddress.stateOrProvince`", "string"), ("data.taskOrder.addressInfo.originAddress.postalCode", "string", "`data.taskOrder.addressInfo.originAddress.postalCode`", "string"), ("data.taskOrder.addressInfo.originAddress.countryCode", "string", "`data.taskOrder.addressInfo.originAddress.countryCode`", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
df=datasource0.toDF()
schema = ArrayType(MapType(StringType(),StringType()))
root_df=df.select("value.employee.employeeid","value.employee.employeename",\
"value.employee.contactNumber")\
.withColumn("contactNumber",F.explode(F.from_json(F.regexp_replace\
(F.regexp_replace("contactNumber","([\\w-]+)", "\"$1\"")\
,"\=",":"),schema)))\
.select("employeeid","employeename","contactNumber.type","contactNumber.phoneNumber")\
.show(truncate=False)
applymapping1 = ApplyMapping.apply(frame = root_df, mappings = [("`employeeid`", "string", "employeeid", "string"),
("`employeename`", "string", "employeename", "string"),
("`contactNumber.type`", "string", "type", "string"),
("`contactNumber.phoneNumber`", "string", "phone", "string"),
], transformation_ctx = "applymapping1")
datasink1 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = applymapping1, catalog_connection = "test", connection_options = {"dbtable": "employee", "database": "testdb"}, transformation_ctx = "datasink1")
job.commit()
但是我遇到以下错误:
pyspark.sql.utils.AnalysisException:'由于以下原因,无法解析\'regexp_replace(contactNumber
,\'([\\ w-] +)\',\'“ $ 1” \')\'数据类型不匹配:参数1需要字符串类型,但是\'contactNumber
\'是array>类型。;; \ n \'Project [taskOrderIdentifier#14,explode(jsontostructs(ArrayType(MapType(MapType(StringType,StringType, ,true),true),regexp_replace(regexp_replace(contactNumber#15,([\ w-] +),“ $ 1”),\ =,:),Some(UTC)))AS contactNumber。
请告知这里有什么问题。