从嵌套的数据框列中删除换行符

时间:2019-04-11 10:04:54

标签: apache-spark apache-spark-sql

我有一个数据框,其架构(schema)如下:

root
 |-- AppUsers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Email: string (nullable = true)
 |    |    |-- FirstName: string (nullable = true)
 |    |    |-- LastName: string (nullable = true)
 |    |    |-- UserName: string (nullable = true)
 |-- BusinessLines: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |-- Campaigns: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- BusinessLineId: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- StartDate: date (nullable = true)
 |    |    |-- EndDate: date (nullable = true)
 |    |    |-- Imported: boolean (nullable = true)
 |    |    |-- IsClosed: string (nullable = true)
 |-- CampaignDomains: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- DomainId: integer (nullable = true)
 |-- CampaignDomainEntityComments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- DomainId: integer (nullable = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- Comment: string (nullable = true)
 |-- CampaignEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- ClosedDate: date (nullable = true)
 |    |    |-- ClosedBy: string (nullable = true)
 |-- CampaignDomainEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- DomainId: integer (nullable = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- Status: string (nullable = true)
 |    |    |-- ValidationDate: date (nullable = true)
 |    |    |-- ValidatedBy: string (nullable = true)
 |-- Domains: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- BusinessLineId: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |-- Entities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- BasesClient: string (nullable = true)
 |    |    |-- BusinessLineId: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Pole: string (nullable = true)
 |    |    |-- PoleCode: string (nullable = true)
 |    |    |-- PoleLabel: string (nullable = true)
 |    |    |-- Transactions: string (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- ELRId: string (nullable = true)
 |    |    |-- ELRDescription: string (nullable = true)
 |    |    |-- UOId: string (nullable = true)
 |    |    |-- UODescription: string (nullable = true)
 |-- Groups: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- BusinessLine: integer (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- IsCampaign: boolean (nullable = true)
 |-- GroupEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- GroupId: integer (nullable = true)
 |-- Indicators: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- AccessLevel: string (nullable = true)
 |    |    |-- CanBeCopied: boolean (nullable = true)
 |    |    |-- Definition: string (nullable = true)
 |    |    |-- ModeReporting: string (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Perimeter: string (nullable = true)
 |    |    |-- PeriodTypeEN: string (nullable = true)
 |    |    |-- PeriodTypeFR: string (nullable = true)
 |    |    |-- PeriodTypeId: integer (nullable = true)
 |    |    |-- SubDomainId: integer (nullable = true)
 |    |    |-- Type: string (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- OversightIndicatorID: string (nullable = true)
 |-- IndicatorEntities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- EntityId: integer (nullable = true)
 |    |    |-- IndicatorId: integer (nullable = true)
 |-- SubDomains: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- Comment: string (nullable = true)
 |    |    |-- Name: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Enabled: boolean (nullable = true)
 |    |    |-- DomainId: integer (nullable = true)
 |-- SubIndicators: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- IndicatorId: integer (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |    |    |-- Order: integer (nullable = true)
 |    |    |-- Type: string (nullable = true)
 |    |    |-- Unit: string (nullable = true)
 |    |    |-- ValueListNameId: integer (nullable = true)
 |    |    |-- IsMandatory: boolean (nullable = true)
 |    |    |-- IsGDPR: boolean (nullable = true)
 |    |    |-- OversightSubIndicatorID: string (nullable = true)
 |-- ValueLists: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |    |    |-- Value: integer (nullable = true)
 |    |    |-- ValueListNameId: integer (nullable = true)
 |-- ValueListNames: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- NameEN: string (nullable = true)
 |    |    |-- NameFR: string (nullable = true)
 |-- Comments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Id: integer (nullable = true)
 |    |    |-- Code: string (nullable = true)
 |    |    |-- Definition: string (nullable = true)
 |-- CommentValues: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- CampaignId: integer (nullable = true)
 |    |    |-- CommentId: integer (nullable = true)
 |    |    |-- Value: string (nullable = true)

打印数据框:

+--------------------------------------+-------------+---------+---------------+----------------------------+-----------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+---------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------+-----------------+----------------+
|AppUsers                              |BusinessLines|Campaigns|CampaignDomains|CampaignDomainEntityComments|CampaignEntities       |CampaignDomainEntities    |Domains                   |Entities                                                                                                                                                                                  |Groups                             |GroupEntities|Indicators                                                                                                                                                                                                                                                                                                                                                                   |IndicatorEntities|SubDomains                        |SubIndicators                                                  |ValueLists                                                 |ValueListNames                       |Comments         |CommentValues   |
+--------------------------------------+-------------+---------+---------------+----------------------------+-----------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+---------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------+-----------------+----------------+
|[[1,null,JEROEN,SOMERS,JEROEN.SOMERS]]|[[1,PRIV]]   |null     |[[1,2]]        |[[122,1,9,add comments ]]   |[[1,1,2018-08-24,null]]|[[1,11,1,Draft,null,null]]|[[1,1,1,INCIDENTS,1,true]]|[[1,0071300000,Outil central (FORCE),1,SGPB MONACO GESTION PRIVEE,PRIV,000423,PRIV Monaco,Outil central (FORCE),true,0071300000,SOCIETE GENERALE PRIVATE BANKING (MONACO),20664,PRIV/MON]]|[[1,1,null,SGPB GROUPE,true,false]]|[[1,1]]      |[[18174,D3E_I1,EndUser,false,Rappel : les instructions transposées doivent être validées par la Conformité IBFS avant d'être soumises à la validation du Management de votre entité.,Flow,IBFS 000449 - IBFS Compliance Manual - published on 01/29/2015,IBFS 000449 - Manuel de conformité IBFS - publié le 29/01/2015,1,Global,Monthly,Mensuel,1,440,Complex,true,FCC.1.1]]|[[1,1]]          |[[1,18,null,Key Points,1,true,18]]|[[1,18.1,1,Entity,Entity,111,Text,,null,false,false,FCC.1.1.1]]|[[1,Discretionary management,Discretionary management,1,1]]|[[1,Compliance Item,Compliance Item]]|[[4,Priv-1,null]]|[[13,4,112323 ]]|
+--------------------------------------+-------------+---------+---------------+----------------------------+-----------------------+--------------------------+--------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+----------------------------------+---------------------------------------------------------------+-----------------------------------------------------------+-------------------------------------+-----------------+----------------+


Indicators 中的 Definition 字段带有换行符和一些不需要的字符,例如 "、, 和 ;

我想从 Definition 子列中删除那些不需要的字符,并保持数据框的结构不变。对于扁平结构我已经完成了此操作,但不清楚如何处理嵌套结构。

为简单起见,我删除了大多数字段,仅保留了要对其应用转换的字段

示例输入:

{
  "AppUsers": [
    {
      "Id": 1,
      "UserName": "abc.bcd"
    }
  ],
  "Indicators": [
    {
      "Definition": "Rappel ;;;;; , \n"
    }
  ]
}

预期输出:

{
  "AppUsers": [
    {
      "Id": 1,
      "UserName": "abc.bcd"
    }
  ],
  "Indicators": [
    {
      "Definition": "Rappel"
    }
  ]
}

必须从 Indicators.Definition 列中删除这些不需要的字符。请帮忙,谢谢。

1 个答案:

答案 0 :(得分:0)

也许您可以尝试访问您的列,并使用regexp_replace删除不需要的字符。以下是一个示例。

from pyspark.sql.functions import col, regexp_replace, transform

# Characters to strip from Indicators.Definition. Spark evaluates the pattern
# as a Java regular expression, so it is a bare character class: no
# JavaScript-style /.../g delimiters (they would be matched literally).
# A raw triple-quoted string lets the class contain both ' and " unescaped.
# NOTE: \s removes every whitespace character (line breaks AND ordinary
# spaces between words); narrow it to [\r\n] if only line breaks must go.
UNWANTED_CHARS = r"""[~%&;:"',<>?#\s]"""

# Indicators is array<struct<...>>, so each element is rewritten in place:
# transform() maps over the array and withField() replaces only the
# Definition field, leaving every other field and the nesting untouched.
# Requires Spark 3.1+ (functions.transform and Column.withField).
df = df.withColumn(
    'Indicators',
    transform(
        col('Indicators'),
        lambda indicator: indicator.withField(
            'Definition',
            regexp_replace(indicator['Definition'], UNWANTED_CHARS, ''),
        ),
    ),
)