When I run a workflow with two jobs, where the first job is the data source for the second, I hit a problem: the second job does not read all of the data written by the first, and if I run the second job again it picks up the data that was missed. Has anyone run into this?
It looks as if my first job has not finished writing while my second job is already trying to read from it. More details below.
I have a workflow that reads a bucket and only performs the initial conversion from JSON to Parquet. That bucket is then crawled to refresh the schema, which is made available in a database called DB_RAW. The second job in the same workflow reads the raw table and applies some transformations; the result is also written to a bucket and crawled. That second table is the data model the data analysts use to build reports in a data visualization tool. I am using job bookmarks to control when new files should be picked up.
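For context, the workflow is chained with conditional triggers so that the first job runs, then the crawler that refreshes DB_RAW, then the second job. A simplified boto3 sketch of that wiring (the names below are placeholders, not my actual workflow, job and crawler names):

import boto3
glue = boto3.client("glue")
WORKFLOW = "data-pipeline"              # placeholder workflow name
JOB_RAW = "job-json-to-parquet"         # placeholder: first job (JSON -> Parquet)
CRAWLER_RAW = "crawler-db-raw"          # placeholder: crawler that refreshes DB_RAW
JOB_ANALYTICS = "job-raw-to-analytics"  # placeholder: second job (raw -> data model)
# run the crawler only after the first job succeeds
glue.create_trigger(
    Name="after-raw-job", WorkflowName=WORKFLOW, Type="CONDITIONAL", StartOnCreation=True,
    Predicate={"Conditions": [{"LogicalOperator": "EQUALS", "JobName": JOB_RAW, "State": "SUCCEEDED"}]},
    Actions=[{"CrawlerName": CRAWLER_RAW}])
# run the second job only after the crawler succeeds
glue.create_trigger(
    Name="after-raw-crawler", WorkflowName=WORKFLOW, Type="CONDITIONAL", StartOnCreation=True,
    Predicate={"Conditions": [{"LogicalOperator": "EQUALS", "CrawlerName": CRAWLER_RAW, "CrawlState": "SUCCEEDED"}]},
    Actions=[{"JobName": JOB_ANALYTICS}])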
First job code
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import Relationalize, ResolveChoice
from awsglue.job import Job
from pyspark.sql.functions import input_file_name, from_unixtime, unix_timestamp, year, col, when, struct
from pyspark.sql.types import *
import boto3
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'Environment'])
env = args["Environment"]
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session.builder.appName('Raw - Data Pipeline').getOrCreate()
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
#get data from s3
connection_options = {"paths": [ "s3://bucket/landing/consolidated/"]}
dyf_vpm_consolidated_raw = glueContext.create_dynamic_frame.from_options(connection_type="s3", connection_options=connection_options, format="json", format_options={"multiline":"true"}, transformation_ctx="dyf_vpm_consolidated_raw")
#convert dynamic frame to data frame and add the required columns
df_vpm_consolidated_raw = dyf_vpm_consolidated_raw.toDF()
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("filename", input_file_name())
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("date", from_unixtime(unix_timestamp(col("filename").substr(-14,8), "yyyyMMdd")))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("year", year(df_vpm_consolidated_raw["date"]))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("type", when(col("filename").contains("28days"), "28").otherwise("7"))
#convert the data frame to dynamic frame
dyf_vpm_consolidated_staging = DynamicFrame.fromDF(df_vpm_consolidated_raw, glueContext, "dyf_vpm_consolidated_staging")
#relationalize the dynamic frame to flatten the nested columns
dyf_vpm_consolidated_rationalised_collection = Relationalize.apply(frame = dyf_vpm_consolidated_staging, staging_path = "s3://bucket/scripts/temporary-glue-files/", name = "root", transformation_ctx = "dyf_vpm_consolidated_rationalised_collection")
dyf_vpm_consolidated_staging = dyf_vpm_consolidated_rationalised_collection.select("root")
#convert to df again and rename the columns
df_vpm_consolidated_staging = dyf_vpm_consolidated_staging.toDF()
df_vpm_consolidated_staging = df_vpm_consolidated_staging.withColumnRenamed("columx", "columx")
#(the same withColumnRenamed call is repeated for each of the remaining flattened columns; real column names redacted)
#convert to final dynamic frame
dyf_vpm_consolidated_final = DynamicFrame.fromDF(df_vpm_consolidated_staging, glueContext, "dyf_vpm_consolidated_final")
#write the file
connection_options = {"path": "s3://bucket/raw/consolidated/", "partitionKeys": ["year", "type"]}
glueContext.write_dynamic_frame.from_options(frame=dyf_vpm_consolidated_final, connection_type="s3", connection_options=connection_options, format="parquet")
job.commit()
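Both jobs run with job bookmarks enabled through the standard Glue default argument, nothing custom; roughly, the relevant part of each job definition looks like this (the Environment value is a placeholder):

# DefaultArguments set on both jobs (excerpt)
default_arguments = {
    "--job-bookmark-option": "job-bookmark-enable",
    "--Environment": "prod",   # placeholder value
}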
Second job code
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.transforms import Relationalize, ResolveChoice
from awsglue.job import Job
from pyspark.sql.functions import monotonically_increasing_id, regexp_replace, col, lit, explode, array, struct, substring, when
from pyspark.sql.types import DoubleType
import boto3
args = getResolvedOptions(sys.argv, ["JOB_NAME", "Environment"])
env = args["Environment"]
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session.builder.appName("Transform - Data Pipeline").getOrCreate()
job = Job(glueContext)
job.init(args["JOB_NAME"], args)
dyf_vpm_consolidated_analytics = glueContext.create_dynamic_frame.from_catalog(database = "db_insights_raw", table_name = "tb_consolidated", transformation_ctx = "dyf_vpm_consolidated_analytics")
#dyf_vpm_consolidated_analytics = glueContext.create_dynamic_frame_from_catalog(database = "db_insights_raw", table_name = "tb_consolidated", transformation_ctx = "dyf_vpm_consolidated_analytics")
#convert dynamic frame to data frame
df_vpm_consolidated_raw = dyf_vpm_consolidated_analytics.toDF()
print("Count row table 0:" + str(df_vpm_consolidated_raw.count()))
#prepare data to create the Dimensions
#df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("id", monotonically_increasing_id())
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("episodeName", regexp_replace("episodeName", ",", ""))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("episodeName", regexp_replace("episodeName", "\n", " "))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("genre", regexp_replace("genre", ",", ""))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("seriesName", regexp_replace("seriesName", ",", ""))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("programName", regexp_replace("programName", ",", ""))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("classification", regexp_replace("classification", ",", ""))
df_vpm_consolidated_raw = df_vpm_consolidated_raw.withColumn("minutesViewed", (col("millisecondsViewed") / 60000).cast(DoubleType()))
connection_options = {"path": "s3://bucket/analytics/consolidated/vpm/", "partitionKeys": ["year", "type"]}
glueContext.write_dynamic_frame.from_options(frame=dyf_vpm_consolidated, connection_type="s3", connection_options=connection_options, format="parquet")
job.commit()
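When the row count comes up short, re-running only the second job (outside the workflow) picks up the rows that were missed, e.g. from the CLI (job name and argument value are placeholders):

aws glue start-job-run --job-name job-raw-to-analytics --arguments '{"--Environment":"prod"}'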