I have a fairly large (~1 TB) Parquet dataset that is partitioned by the column database_id. I want to copy this dataset to a new one, keeping only those rows whose index column appears in a separate "special_indexes" table.
My current approach is:
import pyspark.sql.functions as F
big_table = spark.read.parquet("path/to/partitioned/big_table.parquet")
unique_indexes_table = spark.read.parquet("path/to/unique_indexes.parquet")
out = (
    big_table
    .join(F.broadcast(unique_indexes_table), on="index")
    .write
    .save(
        path="path/to/partitioned/big_table_2.parquet",
        format='parquet',
        mode='overwrite',
        partitionBy="database_id")
)
However, this triggers a shuffle and fails with a java.io.IOException: No space left on device error on my 10-node cluster, where each node has about 900 MB of disk space in SPARK_LOCAL_DIRS.
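For reference, this is a minimal sketch of the configuration knobs that would give the shuffle more room to spill; the app name, scratch path, and partition count are placeholders, not my actual settings, and spark.local.dir is ignored when the cluster manager already sets SPARK_LOCAL_DIRS:

from pyspark.sql import SparkSession

# Sketch only: app name, path, and partition count are placeholders.
# spark.local.dir is overridden by SPARK_LOCAL_DIRS when the cluster
# manager sets that environment variable.
spark = (
    SparkSession.builder
    .appName("copy_big_table")
    .config("spark.local.dir", "/path/to/larger/scratch")  # more room for shuffle spill files
    .config("spark.sql.shuffle.partitions", "2000")        # more, smaller shuffle partitions
    .getOrCreate()
)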
I have been trying to get this to work for several days without success. I am considering rewriting it with pyarrow, reading the partitions and performing the join one at a time, but I don't understand why pyspark can't do the same thing.
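To make the per-partition idea concrete, this is roughly what I mean, written with pyspark itself rather than pyarrow. It is only a sketch: it assumes an existing SparkSession named spark (as above), the standard database_id=<value> directory layout, and a filesystem that os.listdir can reach; the partition discovery below is illustrative.

import os

import pyspark.sql.functions as F

base = "path/to/partitioned/big_table.parquet"
out_base = "path/to/partitioned/big_table_2.parquet"

unique_indexes_table = spark.read.parquet("path/to/unique_indexes.parquet")

# Illustrative partition discovery: assumes database_id=<value> subdirectories.
database_ids = [
    name.split("=", 1)[1]
    for name in os.listdir(base)
    if name.startswith("database_id=")
]

# Filter and write one partition at a time, so no single job has to shuffle
# or hold more than one partition's worth of data.
for database_id in database_ids:
    partition = spark.read.parquet(os.path.join(base, "database_id=" + database_id))
    (
        partition
        .join(F.broadcast(unique_indexes_table), on="index")
        .write
        .parquet(
            os.path.join(out_base, "database_id=" + database_id),
            mode="overwrite",
        )
    )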
I have posted the SQL diagram and the query execution plan below. I believe the Exchange step is what is causing the problem, and I am not sure why it is necessary.
== Parsed Logical Plan ==
'InsertIntoHadoopFsRelationCommand file:/scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet, false, ['database_id], Parquet, Map(path -> /scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet), Overwrite, [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
+- AnalysisBarrier
   +- RepartitionByExpression [database_id#104], 200
      +- Project [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
         +- Join Inner, (__index_level_0__#77L = __index_level_0__#222L)
            :- Relation[uniparc_id#70,sequence#71,database#72,interpro_name#73,interpro_id#74,domain_start#75L,domain_end#76L,__index_level_0__#77L,domain_length#78L,structure_id#79,model_id#80,chain_id#81,pc_identity#82,alignment_length#83,mismatches#84,gap_opens#85,q_start#86,q_end#87,s_start#88,s_end#89,evalue_log10#90,bitscore#91,qseq#92,sseq#93,... 11 more fields] parquet
            +- ResolvedHint (broadcast)
               +- Relation[__index_level_0__#222L] parquet

== Analyzed Logical Plan ==
InsertIntoHadoopFsRelationCommand file:/scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet, false, [database_id#104], Parquet, Map(path -> /scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet), Overwrite, [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
+- RepartitionByExpression [database_id#104], 200
   +- Project [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
      +- Join Inner, (__index_level_0__#77L = __index_level_0__#222L)
         :- Relation[uniparc_id#70,sequence#71,database#72,interpro_name#73,interpro_id#74,domain_start#75L,domain_end#76L,__index_level_0__#77L,domain_length#78L,structure_id#79,model_id#80,chain_id#81,pc_identity#82,alignment_length#83,mismatches#84,gap_opens#85,q_start#86,q_end#87,s_start#88,s_end#89,evalue_log10#90,bitscore#91,qseq#92,sseq#93,... 11 more fields] parquet
         +- ResolvedHint (broadcast)
            +- Relation[__index_level_0__#222L] parquet

== Optimized Logical Plan ==
InsertIntoHadoopFsRelationCommand file:/scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet, false, [database_id#104], Parquet, Map(path -> /scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet), Overwrite, [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
+- RepartitionByExpression [database_id#104], 200
   +- Project [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
      +- Join Inner, (__index_level_0__#77L = __index_level_0__#222L)
         :- Filter isnotnull(__index_level_0__#77L)
         :  +- Relation[uniparc_id#70,sequence#71,database#72,interpro_name#73,interpro_id#74,domain_start#75L,domain_end#76L,__index_level_0__#77L,domain_length#78L,structure_id#79,model_id#80,chain_id#81,pc_identity#82,alignment_length#83,mismatches#84,gap_opens#85,q_start#86,q_end#87,s_start#88,s_end#89,evalue_log10#90,bitscore#91,qseq#92,sseq#93,... 11 more fields] parquet
         +- ResolvedHint (broadcast)
            +- Filter isnotnull(__index_level_0__#222L)
               +- Relation[__index_level_0__#222L] parquet

== Physical Plan ==
Execute InsertIntoHadoopFsRelationCommand InsertIntoHadoopFsRelationCommand file:/scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet, false, [database_id#104], Parquet, Map(path -> /scratch/username/datapkg_output_dir/uniparc-domain-wstructure/master/remove_duplicate_matches/adjacency_matrix.parquet), Overwrite, [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
+- Exchange hashpartitioning(database_id#104, 200)
   +- *(2) Project [__index_level_0__#77L, uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
      +- *(2) BroadcastHashJoin [__index_level_0__#77L], [__index_level_0__#222L], Inner, BuildRight
         :- *(2) Project [uniparc_id#70, sequence#71, database#72, interpro_name#73, interpro_id#74, domain_start#75L, domain_end#76L, __index_level_0__#77L, domain_length#78L, structure_id#79, model_id#80, chain_id#81, pc_identity#82, alignment_length#83, mismatches#84, gap_opens#85, q_start#86, q_end#87, s_start#88, s_end#89, evalue_log10#90, bitscore#91, qseq#92, sseq#93, ... 11 more fields]
         :  +- *(2) Filter isnotnull(__index_level_0__#77L)
         :     +- *(2) FileScan parquet [uniparc_id#70,sequence#71,database#72,interpro_name#73,interpro_id#74,domain_start#75L,domain_end#76L,__index_level_0__#77L,domain_length#78L,structure_id#79,model_id#80,chain_id#81,pc_identity#82,alignment_length#83,mismatches#84,gap_opens#85,q_start#86,q_end#87,s_start#88,s_end#89,evalue_log10#90,bitscore#91,qseq#92,sseq#93,... 11 more fields] Batched: false, Format: Parquet, Location: InMemoryFileIndex[file:/scratch/username/datapkg_output_dir/uniparc-domain-wstructure/v0.1/contru..., PartitionCount: 1373, PartitionFilters: [], PushedFilters: [IsNotNull(__index_level_0__)], ReadSchema: struct
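For reference, this is the kind of minimal check I have in mind for pinning down where the Exchange in the plan above comes from; it is only a sketch that reuses the DataFrames defined earlier, and explain(True) prints the parsed, analyzed, optimized, and physical plans without running the job:

import pyspark.sql.functions as F

joined = big_table.join(F.broadcast(unique_indexes_table), on="index")

# Plan of the join alone: the broadcast hint should give a BroadcastHashJoin,
# which broadcasts the small table instead of shuffling the large one.
joined.explain(True)

# One way an Exchange hashpartitioning(database_id, ...) node can appear is an
# explicit repartition on the partition column before writing.
joined.repartition("database_id").explain(True)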