我写的一个小的PySpark脚本正在寻找一个名为resource_tags_user_engagement的值。
如果该值为空,为空或包含一个单词,则应将其替换为默认值。但是,不仅仅是替换空白,空值或单词,而是替换所有值:
import sys
import pyspark.sql.functions as f
from pyspark.context import SparkContext
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
# Set Glue Context
glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
spark.sql("set spark.sql.parquet.enableVectorizedReader=false")
# Create Dynamic Data Frame from table in the glue database
cost_allocation = glueContext.create_dynamic_frame.from_catalog(database="company_cost_allocation", table_name="company_cost_allocation")
# Convert dynamic frame to dta frame
cost_allocation_df = cost_allocation.toDF()
# Set default engagements
cost_allocation_df = cost_allocation_df.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789101', '123456789102', '123456789103', '123456789104', '123456789105', '123456789106', '123456789107', '123456789108', '123456789109' )) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008378'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
((f.col('line_item_usage_account_id') == f.lit('123456789110')) |
(f.col('line_item_usage_account_id') == f.lit('123456789111'))) &
(f.col('resource_tags_user_engagement') == f.lit('') ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000401'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789112', '123456789113', '123456789114')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000412'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789115', '123456789116', '123456789117', '123456789118', '123456789119', '123456789120', '123456789121', '123456789122', '123456789123')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008692'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789124', '123456789125', '123456789126')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '807000000412'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id').isin('123456789127', '123456789128', '123456789129', '123456789130', '123456789131')) &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '808000000298'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
(f.col('line_item_usage_account_id') == '123456789132') &
(f.col('resource_tags_user_engagement') == '' ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '803000006453'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
((f.col('line_item_usage_account_id') == f.lit('123456789133')) |
(f.col('line_item_usage_account_id') == f.lit('123456789134'))) &
(f.col('resource_tags_user_engagement') == f.lit('') ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '400000008426'
)) \
.withColumn('resource_tags_user_engagement',
f.when(
((f.col('line_item_usage_account_id') == f.lit('123456789135')) |
(f.col('line_item_usage_account_id') == f.lit('123456789136'))) &
(f.col('resource_tags_user_engagement') == f.lit('') ) |
(f.col('resource_tags_user_engagement').isNull()) |
(f.col('resource_tags_user_engagement').rlike('^[a-zA-Z]')), '800000047650'
).otherwise(f.col('resource_tags_user_engagement')))
# Convert back to a DynamicFrame for further processing.
partitioned_dynamicframe = DynamicFrame.fromDF(cost_allocation_df, glueContext, "partitioned_df")
# Repartition the dynamic frame before writing to S3
cost_allocation_df = cost_allocation_df.repartition(5)
# Write to S3
output_dir = "s3://company-cur-reports/company-costs-transformed-legacy-billing"
datasink = glueContext.write_dynamic_frame.from_options(frame = partitioned_dynamicframe, connection_type = "s3", connection_options = {"path": output_dir}, format = "parquet", transformation_ctx = "datasink")
为什么这样做?如何获取脚本以仅替换空,空白或其中包含单词的值?