我要一个接一个地处理多个json记录。我的代码读取多个json并将它们存储到dataframe中。现在我想从数据帧中逐行处理json文档。当我从数据帧中获取行时,我需要再次将该行转换为数据帧,并对此进行一些操作。我被困在将类'pyspark.sql.types.Row'对象转换为dataframe。
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").option("uri","mongodb://127.0.0.1/mydatabase.sample").load()
A = u(funcRowIter,df.schema)
z = df.withColumn("new_column",A(struct([df[x] for x in df.columns])))
z.show()
def funcRowIter(rows):
print type(rows)
if(rows is not None):
rdf = sqlContext.createDataFrame(rows)
rdf.show()
return rows
帮我把类'pyspark.sql.types.Row'对象转换为dataframe。我的行对象是巨大的json文件。
这是我试图从mongodb中读取的json
{
"Feed": {
"feedBody": {
"Reservation": {
"recordLocatorID": "X23344",
"pnrCreateDate": "2018-09-24T23:00:00.000",
"lastUpdateTimestamp": "2018-09-26T14:51:01.643",
"pnrReservationSystemSequenceID": "1643",
"pnrPurgeDate": "2018-10-11",
"passengerCount": "1",
"reservationSystemCode": "1X",
"passengerList": {
"passenger": {
"passengerID": "2",
"lastUpdateTimestamp": "2018-09-24T18:00:54.835",
"dateOfBirth": "1993-10-02",
"givenName": "fgdfg",
"surName": "fgdfg",
"gender": "M",
"infantIndicator": "true",
"seatCount": "1",
"reservationSystemCustomerID": "dfgdfg",
"passengerTypeCode": "dfgfd",
"groupDepositIndicator": "false",
"passengerTicketDocList": {
"passengerTicketDoc": {
"ticketDocID": "45",
"lastUpdateTimestamp": "2018-09-24T18:01:01.149",
"ticketNumber": "43434343434",
"ticketType": "T",
"ticketIndicator": "E",
"status": "T",
"issuanceDate": "2010-09-20",
"chargeAmount": "0.74",
"currency": "USD"
}
}
}
}
}
}
}
}
这是输出的行
Row(Feed=Row(
feedBody=Row(
Reservation=Row(
recordLocatorID=u'X23344',
pnrCreateDate=u'2018-09-24T23:00:00.000',
lastUpdateTimestamp=u'2018-09-26T14:51:01.643',
pnrReservationSystemSequenceID=u'1643',
pnrPurgeDate=u'2018-10-11',
passengerCount=u'1',
reservationSystemCode=u'1X',
passengerList=Row(
passenger=Row(
passengerID=u'2',
lastUpdateTimestamp=u'2018-09-24T18:00:54.835',
dateOfBirth=u'1993-10-02',
givenName=u'fgdfg',
surName=u'fgdfg',
gender=u'M',
infantIndicator=u'true',
seatCount=u'1',
reservationSystemCustomerID=u'dfgdfg',
passengerTypeCode=u'dfgfd',
groupDepositIndicator=u'false',
passengerTicketDocList=Row(
passengerTicketDoc=Row(
ticketDocID=u'45',
lastUpdateTimestamp=u'2018-09-24T18:01:01.149',
ticketNumber=u'43434343434',
ticketType=u'T',
ticketIndicator=u'E',
status=u'T',
issuanceDate=u'2010-09-20',
chargeAmount=u'0.74',
currency=u'USD'))))))), _id=Row(oid=u'5bc0cc8c2ec34dd42a44fc2f'))