I have 30 compressed data files, each about 2 GB, in an S3 location. I am trying to decompress each file, load it into a DataFrame, and then subset the data by column name. When I run the code, the process gets killed (see the error in the screenshot). Please help me understand how to convert these files into DataFrames.
import os
import subprocess
import tempfile
from concurrent import futures

import pandas as pd


def run():
    with futures.ThreadPoolExecutor(max_workers=len(source_keys)) as executor:
        jobs = [executor.submit(move_source_to_dest, source_keys[i], src_session)
                for i in range(len(source_keys))]
        jobs_categorize = futures.wait(jobs, timeout=6000)
        if jobs_categorize.not_done:
            print("one job didn't complete within the timeout")
        for job in jobs_categorize.done:
            key = job.result()
            if key:
                print("Completed {}".format(os.path.basename(key)))
def move_source_to_dest(key, src_session):
    with tempfile.TemporaryDirectory() as tempdir:
        try:
            print("downloading {}/{}".format(s3_src_bucket, key))
            src_session.client('s3').download_file(
                Bucket=s3_src_bucket, Key=key,
                Filename=os.path.join(tempdir, os.path.basename(key)))
            # Decompress the downloaded .bz2 file (-k keeps the original)
            command = "bzip2 -dk " + os.path.join(tempdir, os.path.basename(key))
            subprocess.call(command, shell=True)
            # Read all the column names from "ambs_ambi_ColumnsNames.txt",
            # stripping the trailing newlines that readlines() would keep
            with open('ambs_ambi_ColumnsNames.txt', 'r') as f:
                clist = [line.strip() for line in f]
            # key[:-4] drops the ".bz2" suffix removed by bzip2 -d
            filename = os.path.join(tempdir, os.path.basename(key[:-4]))
            fileout = filename + "-out.csv"
            print("Creating the DataFrame")
            df = pd.read_csv(filename, sep="\x01", names=clist)
            # The required output columns are listed in 'ambs_ambi_OutColumnsNames.txt'
            with open('ambs_ambi_OutColumnsNames.txt', 'r') as f:
                outcols = [line.strip() for line in f]
            print("subsetting the data")
            data = df.loc[:, outcols]
            data.to_csv(fileout, sep='\x01')
            print("Uploading to {}/{}".format(
                s3_dest_bucket,
                os.path.join(s3_dest_prefix, os.path.basename(fileout))))
            src_session.client('s3').upload_file(
                Filename=fileout,
                Bucket=s3_dest_bucket,
                Key=os.path.join(s3_dest_prefix, os.path.basename(fileout)),
                ExtraArgs={"ServerSideEncryption": "AES256"})
        except Exception as e:
            print("exception handling {}/{}".format(s3_src_bucket, key))
            raise e
        return key
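
The "Killed" message usually means the Linux out-of-memory killer stopped the process: a 2 GB bz2 file typically expands to several times that size once decompressed, pd.read_csv materialises the whole file in memory, df.loc[:, outcols] makes another copy, and max_workers=len(source_keys) processes all 30 files at once, multiplying the peak. Below is a minimal sketch of a lower-memory version of the read-subset-write step. It relies on pandas' built-in bz2 support (compression='bz2'), so the bzip2 subprocess and the decompressed temp file are not needed; subset_bz2_columns and chunk_rows are illustrative names, not part of the original code, and the column lists are assumed to be already stripped of newlines.

import pandas as pd

def subset_bz2_columns(src_path, clist, outcols, out_path, chunk_rows=500_000):
    # pandas decompresses bz2 on the fly, so the file is never fully
    # unpacked on disk or in memory; chunksize bounds the rows held at once.
    reader = pd.read_csv(src_path, sep="\x01", names=clist,
                         usecols=outcols,       # parse only the required columns
                         compression="bz2",
                         chunksize=chunk_rows)  # yields DataFrames of this many rows
    first = True
    for chunk in reader:
        # write the header once, then append subsequent chunks
        chunk.to_csv(out_path, sep="\x01", index=False,
                     mode="w" if first else "a", header=first)
        first = False

Even without chunking, passing usecols alone cuts memory considerably, since the unwanted columns are never built; capping max_workers (say, at 2-4) also keeps only a few files in flight instead of all 30.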