Make my EC2 instance use more of its CPU power

Posted: 2019-06-13 16:50:41

Tags: python-3.x amazon-ec2 parallel-processing cpu-usage

I chose a t2.2xlarge instance, which has 8 vCPUs and 32 GiB of memory. However, it seems to perform no better than the free-tier instance I previously used to run my Python script, and when I check the CPU usage on the machine it reports only 8%.

How can I make use of more of the CPU?
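As a side note, per-core figures are more telling than the overall 8% average. A minimal sketch of how to sample them, assuming the third-party psutil package is installed:

import psutil

# sample each core's utilization over one second; idle cores read near 0%
print(psutil.cpu_percent(interval=1, percpu=True))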

Here is the code I am currently running on these EC2 instances:

import pymysql
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sqlalchemy import create_engine


def connectToDB():
    databaseServerIP = "mydb.us-east-2.rds.amazonaws.com"  # hostname of the MySQL database server
    databaseUserName = "mydbUsername"                      # user name for the database server
    databaseUserPassword = "mypwd"                         # password for the database user

    cursorType = pymysql.cursors.DictCursor


    connectionInstance = pymysql.connect(host=databaseServerIP, 
                                         user=databaseUserName, 
                                         password=databaseUserPassword, 
                                         cursorclass=cursorType,
                                         autocommit=True)

    # Create a cursor object
    cursorInstance = connectionInstance.cursor()     

    return connectionInstance, cursorInstance 
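# note: every call to connectToDB() opens a fresh connection; the cursor is
# returned for completeness, but the queries below use the connection object
# directly via pd.read_sql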


def construct_each_company(tmpDF_forPeerGroup, ii):

    print(tmpDF_forPeerGroup['Name'].values[ii])

    # built from the module-level date range; not referenced again below
    finalBigDataframe = pd.DataFrame(date_generated, index=date_generated)

    #symbolToCheck = tmpDF_forPeerGroup['Symbol'].values[ii]

    idx = tmpDF_forPeerGroup.index[ii]

    #####################
    # query this company's rows from the Scores table in each of the four
    # databases; on any failure keep None as a placeholder
    tmpList_forThisCompany = []
    for dbName in ["DB1", "DB2", "DB3", "DB4"]:
        try:
            connectionInstance, cursorInstance = connectToDB()
            sql = "SELECT * FROM " + dbName + ".Scores WHERE company_idx = " + str(idx)
            tmpList_forThisCompany.append(pd.read_sql(sql, con=connectionInstance))
        except Exception:
            tmpList_forThisCompany.append(None)

    # merge for every input
    # this is not right though...

    return (ii, tmpList_forThisCompany)


def collect_result(result):
    global results
    results.append(result)
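# the callback runs in the parent process, on the pool's single result-handler
# thread, each time a worker finishes, so appending to the shared list here
# is safe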



import multiprocessing as mp

for elem_PeerGroup in sorted(finalDict):

    print(elem_PeerGroup)

    #elem_PeerGroup = 'Africa - Banks'
    ########################################
    ### FOR ALL COMPANIES IN THIS PEER GROUP

    tmpDF_forPeerGroup = finalDict[elem_PeerGroup]

    if len(tmpDF_forPeerGroup)!=0:

        ########################
        ## CREATE A FINAL LIST FOR COMPANIES
        #finalListForCompanies = []

        ########################
        ## CREATE DATETIME RANGE
        start = datetime.strptime("01-01-2004", "%d-%m-%Y")
        end = datetime.strptime("06-04-2019", "%d-%m-%Y")
        date_generated = [start + timedelta(days=x) for x in range(0, (end-start).days)]


        # each process will use each CPU
        #pool = mp.Pool(mp.cpu_count())
        pool = mp.Pool(2)
        results=[]

        for ii in range(0, len(tmpDF_forPeerGroup)):
            pool.apply_async(construct_each_company, args=(tmpDF_forPeerGroup, ii), callback=collect_result)

        pool.close()
        # postpones the execution of the next line of code until all processes in the queue are done.
        pool.join() 

        # sort results by company index so they match the input order
        results.sort(key=lambda x: x[0])
        finalListForCompanies = [r for (ii, r) in results]

    else:
        continue



    finalScores = []
    # for each dataframe, NORMALIZE the companies in the PEER GROUP
    for kk in range(4):
        #print(kk)
        tmpListForNormalisation=[]
        for elem in finalListForCompanies:
            tmpListForNormalisation.append(elem[kk])

        dict_of_dfs = dict(enumerate(tmpListForNormalisation))
        try:
            dframes = pd.concat(dict_of_dfs)
        except Exception:
            finalScores.append(None)
            continue
        dframes = dframes.iloc[:,1:]

        if len(dframes)==0:
            finalScores.append(None)
            continue


        # if each date appears only once, there is effectively only one
        # company's data here, so there is no peer group to normalise against
        if len(dframes) == len(dframes.groupby(level=1)):
            arrayTest=[]
            for k in range(len(tmpListForNormalisation)):
                if (tmpListForNormalisation[k] is None) or (len(tmpListForNormalisation[k])==0):
                    arrayTest.append(None)
                else:
                    arrayTest.append(tmpListForNormalisation[k])

            # put the final result into a list
            dict_of_dfs2 = dict(enumerate(arrayTest))
            finalScores.append(dict_of_dfs2)

        else:
            # z-score each column per date (index level 1) across the peer group
            test = dframes.groupby(level=1).pipe(lambda g: dframes.sub(g.mean(), level=1).div(g.std(), level=1))

            tmpListForNormalisation2=[]
            for date, new_df in test.groupby(level=0):
                tmpListForNormalisation2.append(new_df) 

            arrayTest=[]
            j=0
            for k in range(len(tmpListForNormalisation)):
                if (tmpListForNormalisation[k] is None) or (len(tmpListForNormalisation[k])==0):
                    arrayTest.append(None)
                else:
                    arrayTest.append(tmpListForNormalisation2[j])
                    j+=1


            # per-date minimum and maximum across the peer group
            test_min = test.min(level=1)
            test_max = test.max(level=1)

            dict_of_dfs2 = dict(enumerate(arrayTest))

            def nrm(d):
                # rescale to [0, 100] against the per-date min/max, after
                # dropping the company level from the index
                _d = d
                _d.index = _d.index.get_level_values(1)
                NewRange = np.array([0, 100])
                o = test_max - test_min
                n = NewRange[1] - NewRange[0]
                return (((_d - test_min) * n) / o) + NewRange[0]


            for k, d in dict_of_dfs2.items():
                if d is None:
                    continue

                # rescale, then smooth with a 7-day rolling mean
                d.loc[:] = nrm(d).rolling(window=7).mean()

            # put the final result into a list
            finalScores.append(dict_of_dfs2)



    # take the final MEAN for every company
    for ll in range(len(tmpDF_forPeerGroup)):

        namex = tmpDF_forPeerGroup['Name'].values[ll]
        print("Inserting to DB...", namex)
        company_idx = tmpDF_forPeerGroup['Company_idx'].values[ll]
        company_symbol = tmpDF_forPeerGroup['Symbol'].values[ll]
        industryName = tmpDF_forPeerGroup['GICS_Industry_Name'].values[ll]

        # pull this company's normalised frame from each of the four score
        # sets; anything missing becomes None
        tmpList = []
        for kk in range(4):
            try:
                tmpList.append(finalScores[kk][ll])
            except Exception:
                tmpList.append(None)
        tmpDf = dict(enumerate(tmpList))
        dframes = pd.concat(tmpDf)
        finfin = dframes.mean(level=1)

        # adjust according to its industry weights
        finfin = adjustWeights(industryName, finfin) 

        # take data from 01.01.2007 onwards only
        finfin = finfin['2007/01/01':]

        #####################
        # NOW PUT TO DATABASE
        # (a new engine is created on every iteration)
        engine = create_engine("mysql://mydb.us-east-2.rds.amazonaws.com/"+newDatabaseName)
        con = engine.connect()
        finfin['timestamp'] = finfin.index
        finfin['company_idx'] = [company_idx]*len(finfin)
        finfin['company_symbol'] = [company_symbol]*len(finfin)
        finfin.to_sql(name='Scores', con=con, if_exists='append', index=False)         


Given this, I don't understand why my VM uses only 8% of its CPU. I can't see anything wrong with the code, since it should iterate over the many different companies and give each one its own CPU.
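For reference, here is a minimal, self-contained sketch (the names are illustrative, not taken from my script) of how to check how many worker processes a pool actually spawns when it is sized to the machine's core count:

import multiprocessing as mp
import os

def work(ii):
    # report which worker process handled this task
    return (ii, os.getpid())

if __name__ == "__main__":
    # size the pool to the number of vCPUs the instance exposes
    with mp.Pool(mp.cpu_count()) as pool:
        results = pool.map(work, range(16))
    # the number of distinct PIDs shows how many workers actually ran
    print("workers used:", len({pid for _, pid in results}))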

0 Answers:
