我的作业日志里反复出现下面这几行。作业一直跑不完;而且与同事在 5 分钟内就能跑完的同类作业相比,我最终得到的表和数据也明显少了很多。
18/06/14 12:01:38 DEBUG Client: client token: N/A diagnostics: N/A ApplicationMaster host: 999.99.99.99 ApplicationMaster RPC port: 0 queue: default start time: 1528976771600 final status: UNDEFINED tracking URL: http://ip-999-99-9-999.ec2.internal:20888/proxy/application_1528975883916_0001/ user: root
由于 Glue 的相关文档仍然相当稀少,我不确定应该从哪里查起。
代码如下:
import sys
from awsglue.transforms import *
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.window import *
import pyspark.sql.functions as func
from pyspark.sql import SparkSession
from pprint import pprint
# Job setup: JOB_NAME is injected by Glue; TempDir is the S3 staging
# directory used for Redshift-backed catalog reads below.
args = getResolvedOptions(sys.argv, ["JOB_NAME", "TempDir"])
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
## @type: DataSource
## @args: [database = "snapshots", redshift_tmp_dir = args["TempDir"]]
# Load the three source snapshots from the Glue Data Catalog.
# BUG FIX: every call reused the literal placeholder "<transformation_ctx>".
# transformation_ctx is the key Glue uses for job bookmarks, so it must be a
# unique, stable name per transform; duplicated placeholders make bookmark
# state collide across sources and can silently skip or re-read data.
data_visitor_activitys = glueContext.create_dynamic_frame.from_catalog(
    database="snapshots",
    table_name="data_visitor_activity",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="src_data_visitor_activity",
)
data_prospects = glueContext.create_dynamic_frame.from_catalog(
    database="snapshots",
    table_name="data_prospect",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="src_data_prospect",
)
sf_contacts = glueContext.create_dynamic_frame.from_catalog(
    database="snapshots",
    table_name="sf_contactfile",
    redshift_tmp_dir=args["TempDir"],
    transformation_ctx="src_sf_contactfile",
)
# Filter visitor activities down to the event types of interest (6 and 11)
# and the columns needed downstream.
dataactivities_df = data_visitor_activitys.toDF()
dataactivities_df.createOrReplaceTempView("dataactivitiestable")
# BUG FIX: the original query did not select prospect_id, yet the later
# Join.apply(..., keys1=['prospect_id'], ...) joins on that column. Because
# the join key had been projected away, the join matched (next to) nothing,
# which is why the final output contained so few rows. Keep prospect_id.
dataactivities_dfsql = spark.sql(
    "select campaign_id, created_at, type, id, prospect_id "
    "from dataactivitiestable where type IN (6,11)"
)
data_visitor_activitys = DynamicFrame.fromDF(dataactivities_dfsql, glueContext, "new_dynamic_frame")
# Project the prospect snapshot down to its CRM linkage columns.
prospects_frame = data_prospects.toDF()
prospects_frame.createOrReplaceTempView("dataprospecttable")
projected_prospects = spark.sql(
    "select crm_lead_fid, crm_contact_fid, id from dataprospecttable "
)
data_prospects = DynamicFrame.fromDF(projected_prospects, glueContext, "new_dynamic_frame")
# Project the Salesforce contact file down to the account id and row id.
contacts_frame = sf_contacts.toDF()
contacts_frame.createOrReplaceTempView("contactstable")
projected_contacts = spark.sql("select AccountId,id from contactstable")
sf_contacts = DynamicFrame.fromDF(projected_contacts, glueContext, "new_dynamic_frame")
## @type: RenameField
# Rename snapshot columns to the Salesforce-style names used downstream.
# NOTE(review): old_name is matched against the frame's actual field names --
# "accountid" here assumes the catalog lower-cased "AccountId"; verify
# against the crawled schema.
for _old, _new in (
    ("campaign_id", "campaignidfull__c"),
    ("created_at", "CampaignAdoptedDate__c"),
    ("id", "CampMemId"),
):
    data_visitor_activitys = RenameField.apply(
        frame=data_visitor_activitys,
        old_name=_old,
        new_name=_new,
        transformation_ctx="<transformation_ctx>",
    )
data_prospects = RenameField.apply(
    frame=data_prospects,
    old_name="crm_lead_fid",
    new_name="LeadId",
    transformation_ctx="<transformation_ctx>",
)
for _old, _new in (
    ("accountid", "acctnumber"),
    ("id", "leadorcontactownerid"),
):
    sf_contacts = RenameField.apply(
        frame=sf_contacts,
        old_name=_old,
        new_name=_new,
        transformation_ctx="<transformation_ctx>",
    )
# Join activities to prospects on the prospect id.
# (Requires prospect_id to still be selected upstream in the visitor-activity
# query -- it is the keys1 column here.)
data_join_1 = Join.apply(
    frame1=data_visitor_activitys,
    frame2=data_prospects,
    keys1=["prospect_id"],
    keys2=["id"],
    transformation_ctx="join_activity_prospect",
)
# Intermediate dump kept for inspection/debugging, as in the original job.
datajoin1 = glueContext.write_dynamic_frame.from_options(
    frame=data_join_1,
    connection_type="s3",
    connection_options={"path": "s3://bucket/glue/prospect/MTA/datajoin1"},
    format="avro",
    transformation_ctx="sink_datajoin1",
)
# BUG FIX: sf_contacts' "id" column was renamed to "leadorcontactownerid"
# above, so joining on keys2=["id"] matched nothing. Join on the renamed key.
data_join_2 = Join.apply(
    frame1=data_join_1,
    frame2=sf_contacts,
    keys1=["crm_contact_fid"],
    keys2=["leadorcontactownerid"],
    transformation_ctx="join_prospect_contact",
)
# BUG FIX: the original recomputed the exact same join a second time here;
# reuse the already-computed result instead of doubling the work.
pardor_final_join = data_join_2
DataFINAL = glueContext.write_dynamic_frame.from_options(
    frame=data_join_2,
    connection_type="s3",
    connection_options={"path": "s3://bucket/glue/prospect/MTA/datafinal"},
    format="avro",
    transformation_ctx="sink_datafinal",
)