Question

我有30个，每个在s3位置有2 GB的数据压缩文件，我试图解压缩文件，将它们转换为数据帧，然后根据列名称对数据进行子集化。当我运行代码时，它会被杀死（请参阅屏幕截图中的错误）。请帮我解释如何将文件转换为数据框。

def run():
    with futures.ThreadPoolExecutor(max_workers=len(source_keys)) as executor:
        jobs = [executor.submit(move_source_to_dest, source_keys[i],src_session) for i in range(len(source_keys))]
        jobs_categorize = futures.wait(jobs, timeout=6000)
        if jobs_categorize.not_done:
            print("one job didn't complete with-in time-out")
        for job in jobs_categorize.done:
            key = job.result()
            if key:
                print("Compelted {}".format(os.path.basename(key)))



def move_source_to_dest(key, src_session):
    with tempfile.TemporaryDirectory() as tempdir:
        try:
            print("downloading {}/{}".format(s3_src_bucket, key))
            src_session.client('s3').download_file(Bucket=s3_src_bucket, Key=key,
                                                   Filename=os.path.join(tempdir, os.path.basename(key)))

        #Command to decompress the files
        command = "bzip2 -dk " + os.path.join(tempdir, os.path.basename(key))
        subprocess.call(command,shell = True)

        #Reading all the columns names from the file "ambs_ambi_ColumnsNames.txt"    
        file = open('ambs_ambi_ColumnsNames.txt','r')
        clist=file.readlines()

        Filename=os.path.join(tempdir, os.path.basename(key[:-4]))
        Fileout=os.path.join(tempdir, os.path.basename(key[:-4])) + "-out.csv" 
        print("Creating the DataFrame")

        df=pd.read_csv(Filename, sep="\x01", names=clist)

        #Give all the column names that are required in the file named 'ambs_ambi_OutColumnsNames.txt' 
        file = open('ambs_ambi_OutColumnsNames.txt','r')
        outcols=file.readlines()
        print("subseting the data")
        data=df.loc[:,outcols]
        data.to_csv(Fileout,sep='\x01')

        print("Uploading to {}/{}".format(s3_dest_bucket, os.path.join(s3_dest_prefix, os.path.basename(Fileout))))
        src_session.client('s3').upload_file(Filename=Fileout,
                                              Bucket=s3_dest_bucket,
                                              Key=os.path.join(s3_dest_prefix, os.path.basename(Fileout)),
                                              ExtraArgs={"ServerSideEncryption": "AES256"})
    except Exception as e:
        print("exception handling {}/{}".format(s3_src_bucket, key))
        raise e
return key

读取大文件时pd.read_csv被杀死

0 个答案: