I'm trying to run parallel threads in a Spark job. This runs fine when I launch the Python script from the CLI, but my understanding is that it isn't really taking advantage of the EMR cluster's parallel processing. When I run it as a Spark job it doesn't actually save any data, and I'm not even sure it creates a Spark DataFrame.
I've also tried using map instead of parallel threads, but I couldn't get that working either.
If I can't get the parallelism as a Spark job, then it seems I might as well just run it on a single EC2 instance with parallel threads.
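For reference, the map approach I tried looked roughly like this (a sketch only; process_file stands in for the per-file conversion logic, and master_file_list is the same list used below):

# sketch of the map approach: ship the file list to the executors and let
# Spark fan the work out instead of using driver-side threads.
# NOTE: the mapped function runs on the executors, so it cannot touch sc or
# the DataFrame API itself - only plain Python (boto3, csv parsing, etc.).
def process_file(file):
    # placeholder for the per-file conversion
    return file

file_rdd = sc.parallelize(master_file_list, 10)  # 10 partitions ~ 10 blocks
results = file_rdd.map(process_file).collect()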
So the basic logic is this -
Loop through the file list with the following method:
from concurrent.futures import ThreadPoolExecutor, as_completed

# this is run for 10 blocks of 10 files each across the EMR cluster in parallel
def parquet_driver(self):
    max_threads = 20
    pool = ThreadPoolExecutor(max_threads)
    total_files_processed = 0
    while total_files_processed < len(self.master_file_list):
        futures = []
        i = total_files_processed
        # submit the next batch of up to max_threads files
        while i < min(total_files_processed + max_threads, len(self.master_file_list)):
            print('Processing %s' % self.master_file_list[i])
            futures.append(pool.submit(self.convert_to_parquet,
                                       self.master_file_list[i]))
            i += 1
        # block until every conversion in this batch has finished
        for x in as_completed(futures):
            pass
        # advance past the files submitted in this batch
        total_files_processed = i
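The manual batching above is roughly equivalent to letting the executor manage its own queue; a minimal sketch of that alternative, assuming it lives on the same class:

def parquet_driver(self):
    # let the executor queue the work itself; map() blocks until every
    # file has been converted, so no manual batch bookkeeping is needed
    with ThreadPoolExecutor(max_workers=20) as pool:
        list(pool.map(self.convert_to_parquet, self.master_file_list))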
Note that this passes each file to a method called "convert_to_parquet".
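The file strings are colon-delimited (bucket, prefix, file name, SQS receipt handle), which is why there are so many split(':') calls below; unpacked once, the descriptor looks like this sketch:

# unpack the colon-delimited file descriptor in one go
bucket_name, prefix, file_name, receipt_handle = file.split(':')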
# sc, s3 (boto3 resource), sqs (boto3 client), and Logger are assumed to be
# created at module level elsewhere in the script
def convert_to_parquet(self, file):
    log_file_name = file.split(':')[2].replace('.dat', '.log')
    logger = Logger(log_file_name).get()
    try:
        bucket = s3.Bucket(file.split(':')[0])
        file_name = file.split(':')[2]
        file_obj = bucket.Object(file.split(':')[1] + '/' + file.split(':')[2])
        partition_key = file.split(':')[2].split('.')[2]
        target_table = file.split(':')[2].split('.')[1]
        receipt_handle = file.split(':')[3]
        # decode the S3 body bytes so the '\n' split below works on text
        file_contents = file_obj.get()["Body"].read().decode('utf-8')
        if 'al1' not in file.split(':')[2]:
            logger.debug('Record type = %s, deleting from queue and returning ..' % target_table)
        else:
            logger.debug('Working on %s..' % target_table)
            app_name = file
            #sc = SparkContext(appName=app_name)
            print('Reading the following file from s3: %s' % file_name)
            print('Found the following file contents on s3: %s' % file_contents)
            rdd = sc.parallelize(file_contents.split('\n')).map(lambda line: line.split(','))
            # rdd = sc.textFile(csv_file).map(lambda line: line.split(','))
            # pd.read_csv(csv_file)
            sqlContext = sql.SQLContext(sc)
            if hasattr(rdd, "toDF"):
                df = rdd.toDF()
            else:
                # SparkSession on its own is just the class; build (or reuse) a session first
                spark = SparkSession.builder.getOrCreate()
                df = rdd.toDF()
            logger.debug("Partitioning data to: {0}".format(partition_key))
            # Go to redshift and get the data definition
            metadata = self.build_df_definition('al1')
            if 'cycle_date' in metadata['columns']:
                metadata['columns'].remove('cycle_date')
            if 'log_timestamp' in metadata['columns']:
                metadata['columns'].remove('log_timestamp')
            cols = metadata['columns']
            data_types = metadata['data_types']
            for idx in range(0, len(cols)):
                col_str = '_' + str(int(idx) + 1)
                df_field_value = regexp_replace(df[col_str], '"', '')
                df = df.withColumn(cols[idx], df_field_value.cast(data_types[idx]))
            df = df.withColumn("cycle_date", lit(partition_key))
            # this field will be pushed to the sqs queue
            df = df.withColumn("log_timestamp", lit(self.log_timestamp))
            full_cols = cols
            full_cols.append('cycle_date')
            full_cols.append('log_timestamp')
            print(full_cols)
            ref_df = df.select(full_cols)
            ref_df.show()
            partitionby = ['year', 'month', 'day']
            output = '/opt/data/' + '/' + target_table
            s3_loc = 's3://<bucket>/<prefix>/' + target_table
            codec = 'snappy'
            ref_df.write.partitionBy(['cycle_date']).format("parquet").save(s3_loc, mode="append")
            #sc.stop()
    except Exception as e:
        logger.debug(e)
        traceback.print_exc()
        # format_exc() returns the traceback as a string; print_exc() returns None
        open("/opt/logs/dump.log", "w").write(traceback.format_exc())
        exit()
    else:
        # Delete received message from queue
        sqs.delete_message(
            QueueUrl=self.queue_url,
            ReceiptHandle=receipt_handle
        )
        logger.debug('Received and deleted file: %s' % file)
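For context, this is the minimal DataFrame round trip I expected to work when the script is submitted as a Spark job (a sketch only, with placeholder data and paths, and the session built once up front rather than inside each thread):

from pyspark.sql import SparkSession

# build (or reuse) one session for the whole job
spark = SparkSession.builder.appName('parquet_conversion_test').getOrCreate()

lines = ['a,b,c', 'd,e,f']  # placeholder for file_contents.split('\n')
rdd = spark.sparkContext.parallelize(lines).map(lambda line: line.split(','))
df = rdd.toDF(['col1', 'col2', 'col3'])  # placeholder column names
df.write.mode('append').parquet('s3://<bucket>/<prefix>/test/')  # placeholder path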