How to use vectorize or apply instead of iterrows on a pandas DataFrame in Python

Time: 2017-12-06 17:11:46

Tags: python pandas dataframe vectorization apply

I have over 2,000 dataframes, each with two columns. I want to run ngrams on one of the columns and then build a new dataframe from those ngrams. Here is my code. It works correctly; it just takes a very long time.

I am currently using iterrows to iterate over every row of every dataframe in each file. Is there a simpler way to do this with vectorization or apply?


enter code here
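
For reference, the usual way to avoid iterrows for this kind of task is to move the per-row work into a plain function and apply it to the column. A minimal sketch, assuming two columns named category and element; word_ngrams is a hypothetical helper, and explode requires pandas >= 0.25:

import pandas as pd

def word_ngrams(text, n=2):
    # Hypothetical helper: word n-grams of one string.
    tokens = text.split()
    return [' '.join(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

df = pd.DataFrame({'category': ['a', 'b'],
                   'element': ['foo bar baz', 'spam ham eggs']})

# apply() calls the function once per value, replacing the iterrows loop;
# the result is a Series holding one list of ngrams per row.
df['ngrams'] = df['element'].apply(word_ngrams)

# explode() flattens the lists into one row per ngram, which gives the
# "new dataframe built from the ngrams" the question describes.
ngram_df = (df[['category', 'ngrams']]
            .explode('ngrams')
            .rename(columns={'ngrams': 'element'})
            .reset_index(drop=True))
print(ngram_df)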

1 Answer:

Answer 0: (score: 0)

# pylint: disable=I0011,C0111,C0301,C0103,W0612,W0611
import logging
import os
from os import listdir
from os.path import isfile, join
import itertools
import multiprocessing
import time
import gc
import pandas as pd


def create_combinations(file):
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    initial_path ='./training/pickles/standard and documentation/custom_elements/trial/'
    final_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    completed_file_path ='./training/pickles/standard and documentation/custom_elements_processed_trial/'
    custom = pd.read_pickle(initial_path+file, compression='gzip')
    custom = custom.drop_duplicates(subset=['category', 'element'])
    # Split camelCase words by inserting a space before each case change.
    custom['element'] = custom['element'].str.replace(r'([a-z](?=[A-Z])|[A-Z](?=[A-Z][a-z]))', r'\1 ', regex=True)
    total_rows = len(custom.index)
    logging.warning('Processing element: ' + file + '. Number of rows to combine: ' + str(total_rows))
    cat = []
    ele = []
    combined_df = pd.DataFrame(columns=['category', 'element'])
    logging.warning('creating combinations')
    k = 1
    for key, data in custom.iterrows():
        words = data['element']
        logging.warning(words)
        # Turn each space into a %s placeholder so string formatting can
        # generate every keep-the-space / drop-the-space combination.
        words2 = words.replace('%', '%%').replace(' ', '%s')
        logging.warning('Number of words to combine: ' + str(len(words.split())))
        for i in itertools.product((' ', ''), repeat=words.count(' ')):
            ele.append(words2 % i)
            cat.append(data['category'])
        # Flush to disk in chunks so the lists do not grow without bound,
        # then reset them so the next chunk starts empty.
        if len(ele) > 200000:
            combined_df = pd.DataFrame.from_records(zip(cat, ele), columns=['category', 'element'])
            cat = []
            ele = []
            combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
            combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
            gc.collect()
            k += 1
    # Write out whatever remains after the loop.
    combined_df = pd.DataFrame.from_records(zip(cat, ele), columns=['category', 'element'])
    del cat
    del ele
    combined_df.to_pickle(final_path + os.path.splitext(file)[0] + str(k) + '.pickle', compression='gzip')
    combined_df.to_csv(final_path + os.path.splitext(file)[0] + str(k) + '.csv')
    del combined_df
    gc.collect()
    del custom
    del words
    del words2
    logging.warning('completed ' + file)
    os.rename(initial_path+file, completed_file_path+file)
    os.rename(initial_path+os.path.splitext(file)[0]+'.csv', completed_file_path+os.path.splitext(file)[0]+'.csv')
    return True


if __name__ == "__main__":
    logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
    partitions = 1  # number of partitions to split the dataframe (currently unused)
    cores = 6  # number of worker processes; set to the core count of your machine
    path ='./training/pickles/standard and documentation/custom_elements/trial/'
    combi_path = './training/pickles/standard and documentation/custom_element_combination_trial/'
    processed_file_path ='./training/pickles/standard and documentation/custom_elements_processed_trial/'
    files = [f for f in listdir(path) if isfile(join(path, f))]
    pickle_files=[]
    for any_file in files:
        if any_file.endswith('.pickle'):
            if os.path.isfile(combi_path+any_file):
                os.rename(path+any_file, processed_file_path+any_file)
                os.rename(path+os.path.splitext(any_file)[0]+'.csv', processed_file_path+os.path.splitext(any_file)[0]+'.csv')
                logging.warning(any_file +' already processed.')
            else:
                df = pd.read_pickle(path+any_file, compression='gzip')
                rows = len(df.index)
                if rows > 0:
                    # optionally filter by size here, e.g. only queue files with rows < 500
                    pickle_files.append(any_file)
                else:
                    os.rename(path+any_file, processed_file_path+any_file)
                    os.rename(path+os.path.splitext(any_file)[0]+'.csv', processed_file_path+os.path.splitext(any_file)[0]+'.csv')
                del df
                del rows
                gc.collect()
    ctx = multiprocessing.get_context('spawn')
    p = ctx.Pool(processes=cores, maxtasksperchild=1000)
    start = time.time()
    async_result = p.map_async(create_combinations, pickle_files)
    p.close()
    p.join()
    async_result.get()  # surface any exception raised inside a worker
    print("Complete")
    end = time.time()
    print('total time (s)= ' + str(end-start))
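
On the original question of replacing iterrows: the per-row combination step inside create_combinations can be expressed with apply plus explode instead. A minimal sketch, not a drop-in replacement for the full function; spacing_variants reimplements the itertools.product trick above, and explode requires pandas >= 0.25:

import itertools
import pandas as pd

def spacing_variants(words):
    # All variants of `words` with each space either kept or removed.
    template = words.replace('%', '%%').replace(' ', '%s')
    return [template % combo
            for combo in itertools.product((' ', ''), repeat=words.count(' '))]

custom = pd.DataFrame({'category': ['x'], 'element': ['foo bar baz']})

# apply() builds the list of variants per row; explode() then flattens the
# lists into one row per variant, replacing the manual cat/ele lists.
combined_df = (custom.assign(element=custom['element'].apply(spacing_variants))
                     .explode('element')
                     .reset_index(drop=True))
print(combined_df)

For very large outputs, the chunked to_pickle/to_csv writes in the answer above would still be needed, since explode materializes every combination in memory at once.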