获取不同的线程来改变pandas数据帧的不同部分

时间:2017-09-17 19:34:48

标签: python multithreading pandas python-multithreading

我是python中多线程的新手,所以我不确定如何设置它。我正在尝试生成一个大型输出数据框,其中填充了基于另一个输入数据帧的计算。输出数据帧类似于输入数据帧的列的邻接矩阵。

以下非多线程版本运行良好:

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
import json
import os
import time

def build_adjacency_matrix(DATA_MATRIX, OUT):

    # READS DATA: data must be a csv with a header and an index column
    my_data = pd.read_csv(DATA_MATRIX, index_col=0)

    # INITIALIZE EMPTY DF WITH COLSNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
    AM = pd.DataFrame(columns=my_data.columns, index = my_data.columns)

    y=0
    w=2
    for c1 in my_data.columns:
        print (c1)
        y+=1
        if y > w:
            time.sleep(1)   # GIVE THE PROCESSER A REST AFTER EACH 10 COLUMNS
            print(y)        #KEEP TRACK OF HOW MANY COLS HAVE BEEN PROCESSED
            w+=10
        for c2 in my_data.columns:
            if c1==c2: AM.loc[c1,c2]=0; continue
            sample_df = pd.DataFrame(my_data, columns=[c1,c2])
            # KEEP ONLY ROWS WITH 1s and 0s
            sample_df = sample_df[sample_df[c1] != 0.5]
            sample_df = sample_df[sample_df[c2] != 0.5]
            sample_df = sample_df.dropna()
            # CALCULATE ChiX
            # Contingency table.
            contingency = pd.crosstab(sample_df[c1], sample_df[c2])
            # Chi-square test of independence.
            try:
                chi2, p, ddof, expected = chi2_contingency(contingency)
                AM.loc[c1,c2] = p
            except:
                ValueError;
                # ASSIGN AS NOT SIGNIFICANT IF THERE IS A PROBLEM
                AM.loc[c1,c2] = 1

    AM.to_csv(OUT, sep=',')

    return

# FILES
data_matrix='input_test.csv'
out='output_mt_test.csv'

# FUNCTION CALL
build_adjacency_matrix(data_matrix, out)

以下是输入文件的前几行:

,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
SAMPLE1,1,0,0.5,1,1,0.5,0.5,1,0.5,0.5,0.5,0.5,0,0.5,0,0.5,0,0.5,0.5
SAMPLE2,0.5,0.5,0.5,1,1,0.5,0.5,1,0.5,0.5,0,1,0,0.5,0,0.5,0.5,0.5,0.5
SAMPLE3,0.5,0,0.5,1,1,0.5,0.5,1,0.5,0.5,1,0.5,0.5,0.5,0,1,0,0.5,0.5
SAMPLE4,1,0.5,0.5,1,1,0.5,0.5,0,0.5,0.5,0.5,0.5,0.5,0.5,1,1,0.5,0.5,1

这是输出文件的前几行:

    ,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,0.920167313802
VAR2,0.00326965769624,0,0.67328997966,0.714393037634,0.714393037634,0.829638099719,1.0,0.881545828869,1.0,1.0,0.504985075094,0.665005542102,0.672603817442,0.75946286538,0.365088814029,1.0,0.478520976544,0.698535358303,0.700311372937
VAR3,0.67328997966,0.67328997966,0,1.0,1.0,0.665005542102,1.0,0.672603817442,1.0,1.0,1.0,1.0,0.819476976778,1.0,0.324126587758,1.0,1.0,0.665005542102,0.608407800233

代码运行良好并产生测试文件的预期输出,但真正的输入文件(完全相同的文件结构,但有100行和1000s的cols)相当大,需要大约48小时才能运行,所以我需要使它更快。

我尝试了以下尝试实现多线程:

import pandas as pd
from scipy.stats import chi2_contingency
from threading import Thread


def build_adjacency_matrix(DATA_MATRIX, OUT, THREADS):

    # READS DATA: data must be a csv with a header and an index column
    my_data = pd.read_csv(DATA_MATRIX, index_col=0)

    # INITIALIZE EMPTY DF WITH COLSNAMES FROM INPUT AS COLUMNS AND INDEX (rownames)
    AM = pd.DataFrame(columns=my_data.columns, index = my_data.columns)
    print(len(my_data.columns))
    print(len(my_data.index))



    # BUILD THREAD GROUPS
    thread_groups={}
    chunk=int(len(AM.columns)/THREADS)
    i=0; j=chunk
    for t in range(THREADS): thread_groups[t]=list(range(i,j)); i+=chunk; j+=chunk; 
    # DELEGATE REMAINING COLS TO THE LAST THREAD
    if thread_groups[THREADS-1][-1] != len(AM.columns):
        thread_groups[THREADS-1] = thread_groups[THREADS-1] + \
                                   list(range((thread_groups[THREADS-1][-1]),len(AM.columns)))
    print(thread_groups)


    def populate_DF(section):

        for c1 in AM.columns[section]:
            for c2 in AM.columns:
                if c1==c2: AM.loc[c1,c2]=0; continue
                sample_df = pd.DataFrame(my_data, columns=[c1,c2])
                # KEEP ONLY ROWS WITH 1s and 0s
                sample_df = sample_df[sample_df[c1] != 0.5]
                sample_df = sample_df[sample_df[c2] != 0.5]
                sample_df = sample_df.dropna()
                # CALCULATE ChiX
                # Contingency table.
                contingency = pd.crosstab(sample_df[c1], sample_df[c2])
                #Chi-square test of independence.
                try:
                    # POPULATE AM WITH CHI-SQ p-value
                    chi2, p, ddof, expected = chi2_contingency(contingency)
                    AM.loc[c1,c2] = p
                except:
                    # ASSIGN A p-value OF 1.0 IF THERE IS A PROBLEM
                    ValueError;
                    AM.loc[c1,c2] = 1


    for tg in thread_groups:
        t = Thread(target=populate_DF, args=(thread_groups[tg],))
        print(tg)
        print(thread_groups[tg])
        t.start()


    AM.to_csv(OUT, sep=',')

    return


data_matrix='input_test.csv'
out='output_mt_test.csv'

build_adjacency_matrix(data_matrix, out, 4)

我不确定是否应该将输出数据帧作为全局变量?或者怎么做? “构建线程组”一节的目的是将输入文件中的列组委托给单独的线程,并将每个输出添加到最终的数据帧。我有多达16个内核可用,所以认为多线程解决方案在这里会有所帮助。代码原样会产生意外的,部分完整的输出:

    ,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,VAR8,VAR9,VAR10,VAR11,VAR12,VAR13,VAR14,VAR15,VAR16,VAR17,VAR18,VAR19
VAR1,0,0.00326965769624,0.67328997966,0.573642138098,0.573642138098,0.923724918398,0.556975806531,0.665485722686,1.0,0.545971722677,0.125786424639,0.665005542102,0.914326585297,0.843324894877,0.10024407707,0.37367830795,0.894229755473,0.711877649185,
VAR2,,,,,,,,,,,,,,,,,,,
VAR3,,,,,,,,,,,,,,,,,,,
VAR4,,,,,,,,,,,,,,,,,,,
VAR5,0.573642138098,0.714393037634,1.0,5.61531250139e-06,0,1.0,1.0,0.859350808026,0.819476976778,0.819476976778,1.0,1.0,0.805020272634,,,,,,
VAR6,,,,,,,,,,,,,,,,,,,
VAR7,,,,,,,,,,,,,,,,,,,
VAR8,,,,,,,,,,,,,,,,,,,
VAR9,1.0,1.0,1.0,0.819476976778,,,,,,,,,,,,,,,
VAR10,,,,,,,,,,,,,,,,,,,
VAR11,,,,,,,,,,,,,,,,,,,
VAR12,,,,,,,,,,,,,,,,,,,
VAR13,0.914326585297,,,,,,,,,,,,,,,,,,
VAR14,,,,,,,,,,,,,,,,,,,
VAR15,,,,,,,,,,,,,,,,,,,
VAR16,,,,,,,,,,,,,,,,,,,
VAR17,,,,,,,,,,,,,,,,,,,
VAR18,,,,,,,,,,,,,,,,,,,
VAR19,,,,,,,,,,,,,,,,,,,

我不确定这是否与多线程试图输出到同一个变量的问题有关,或者这是我如何分散工作负载的问题。我真的很感激如何解决这个问题或任何其他优化代码的方法?提前谢谢!

0 个答案:

没有答案