Question

下面的代码为pandas DataFrame生成一些虚拟变量（dummy variables，即取值为0/1的标志列）。对于只有4000行的数据集，这段代码运行起来却非常慢。

有没有更有效的方式来编写相同的代码？
如何替换for循环？
有没有办法减少if语句的数量？
是否可以使代码更具可读性？

代码：

import pandas as pd
def dummy():
    """Read DF.csv, add 0/1 flag columns for Region and Type_Property, write Dummied.csv.

    Every flag column is first created with the value 0 and then filled in
    one row at a time by the two loops below.
    """
    df=pd.read_csv('DF.csv',header=0)
    # One flag column per region, all starting at 0.
    df['North']=0
    df['South']=0
    df['Central']=0
    df['West']=0
    df['East']=0
    df['Remote']=0
    # Row-by-row pass over Region. `df[col][i] = 1` is chained indexing: it
    # first selects the column and then assigns into the selected object.
    # NOTE(review): pandas documents chained assignment as unreliable; together
    # with the Python-level loop it is the likely cause of the slowness -- confirm.
    # Indexing by `i` presumably relies on the default 0..n-1 index from read_csv.
    for i in range(len(df['Region'])):
        if df['Region'][i]=='North':
            df['North'][i]=1
        elif df['Region'][i]=='South':
            df['South'][i]=1
        elif df['Region'][i]=='East':
            df['East'][i]=1
        elif df['Region'][i]=='West':
            df['West'][i]=1
        elif df['Region'][i]=='Central':
            df['Central'][i]=1
        elif df['Region'][i]=='Remote':
            df['Remote'][i]=1
        # No else branch: a row whose Region matches none of the six names
        # keeps 0 in every region column.
    # One flag column per property type, all starting at 0.
    df['apartment']=0
    df['house']=0
    df['townhouse']=0
    df['unit']=0
    df['villa']=0
    df['acreage']=0
    df['other']=0
    # Row-by-row pass over Type_Property; some raw values share a column
    # ('flat' -> apartment, 'acreage+semi+rural' and 'terrance' -> acreage).
    for i in range(len(df['Type_Property'])):
        if df['Type_Property'][i]=='house':
            df['house'][i]=1
        elif df['Type_Property'][i]=='apartment' or df['Type_Property'][i]=='flat':
            df['apartment'][i]=1
        elif df['Type_Property'][i]=='townhouse':
            df['townhouse'][i]=1
        elif df['Type_Property'][i]=='villa':
            df['villa'][i]=1
        elif df['Type_Property'][i]=='acreage+semi+rural' or df['Type_Property'][i]=='terrance':
            df['acreage'][i]=1
        elif df['Type_Property'][i]=='unit':
            df['unit'][i]=1
        else:
            # NOTE(review): there is no `[i]` here, so this assigns 1 to the
            # whole 'other' column, not just the current row -- probably
            # unintended.
            df['other']=1
    # Write the frame, including the new flag columns, to Dummied.csv.
    df.to_csv('Dummied.csv')

dummy()

Answer 1

第一部分

# Quoted from the question: every flag column is created with its own
# assignment and starts at 0 (six region columns, then seven property-type
# columns).
df['North']=0
df['South']=0
df['Central']=0
df['West']=0
df['East']=0
df['Remote']=0
df['apartment']=0
df['house']=0
df['townhouse']=0
df['unit']=0
df['villa']=0
df['acreage']=0
df['other']=0

成为

# Every flag column (regions first, then property types) starts out as 0.
fields = [
    'North', 'South', 'Central', 'West', 'East', 'Remote',
    'apartment', 'house', 'townhouse', 'unit', 'villa', 'acreage', 'other',
]
for column_name in fields:
    df[column_name] = 0

这部分

# Quoted from the question: visit every row and set the flag column named
# after that row's Region to 1. Each branch writes to the column whose name
# equals the string it just compared against; a row matching none of the six
# names is left unchanged.
for i in range(len(df['Region'])):
        if df['Region'][i]=='North':
            df['North'][i]=1
        elif df['Region'][i]=='South':
            df['South'][i]=1
        elif df['Region'][i]=='East':
            df['East'][i]=1
        elif df['Region'][i]=='West':
            df['West'][i]=1
        elif df['Region'][i]=='Central':
            df['Central'][i]=1
        elif df['Region'][i]=='Remote':
            df['Remote'][i]=1

可以替换为

# Set each region's flag column with one boolean-mask assignment per region
# instead of visiting every row. Rows whose Region is missing or is not one of
# these six names are left untouched, exactly like the if/elif chain, whereas
# indexing df by the raw Region value would raise KeyError for them.
# Using .loc also avoids chained assignment (df[col][i] = ...), which pandas
# does not guarantee will write back to df.
for region in ['North', 'South', 'East', 'West', 'Central', 'Remote']:
    df.loc[df['Region'] == region, region] = 1

另一个for循环

# Quoted from the question: visit every row and set the flag column that
# corresponds to its Type_Property. 'flat' shares the apartment column, and
# 'acreage+semi+rural' / 'terrance' share the acreage column.
for i in range(len(df['Type_Property'])):
        if df['Type_Property'][i]=='house':
            df['house'][i]=1
        elif df['Type_Property'][i]=='apartment' or df['Type_Property'][i]=='flat':
            df['apartment'][i]=1
        elif df['Type_Property'][i]=='townhouse':
            df['townhouse'][i]=1
        elif df['Type_Property'][i]=='villa':
            df['villa'][i]=1
        elif df['Type_Property'][i]=='acreage+semi+rural' or df['Type_Property'][i]=='terrance':
            df['acreage'][i]=1
        elif df['Type_Property'][i]=='unit':
            df['unit'][i]=1
        else:
            # NOTE(review): without `[i]` this sets the entire 'other' column
            # to 1, not only the current row -- probably unintended.
            df['other']=1

可能是

# Raw Type_Property value -> flag column it belongs to. Several raw values can
# share one column ('flat' counts as apartment, 'terrance' as acreage).
type_to_flag = {
    'house': 'house',
    'apartment': 'apartment',
    'flat': 'apartment',
    'townhouse': 'townhouse',
    'villa': 'villa',
    'unit': 'unit',
    'acreage+semi+rural': 'acreage',
    'terrance': 'acreage',
}
# Translate the whole column at once; values that are missing or not listed
# above become 'other', which is what the else branch of the loop stood for.
flags = df['Type_Property'].map(type_to_flag).fillna('other')
# One boolean-mask assignment per flag column that actually occurs. Only the
# matching rows are set, so 'other' is no longer written to every row the way
# `df['other']=1` did.
for flag in flags.unique():
    df.loc[flags == flag, flag] = 1

Answer 2

这是另一种方案：利用pandas的.loc按条件批量赋值，而不是逐行循环，因此速度会快得多。我还用列表把区域（regions）和房产类型（property_types）分别归在一起。

def dummy():
    """Add 0/1 flag columns for Region and Type_Property using .loc masks.

    Reads 'path_to_file.csv', creates one flag column per region and per
    property type (all initialised to 0), and sets each flag to 1 on the rows
    whose raw value maps to it. Rows whose Type_Property matches no known
    value are flagged as 'other'; rows whose Region matches no known region
    keep 0 in every region column.

    Returns:
        The DataFrame with the flag columns added.
    """
    df = pd.read_csv('path_to_file.csv', header=0)

    regions = ['North', 'South', 'Central', 'West', 'East', 'Remote']
    property_types = ['apartment', 'house', 'townhouse', 'unit', 'villa', 'acreage', 'other']
    columns_to_update = regions + property_types

    for column in columns_to_update:
        df[column] = 0

    # Compare the column itself (not the .str accessor) to build the row mask.
    for region in regions:
        df.loc[df['Region'] == region, region] = 1

    # Tracks rows already claimed by a known property type, so that whatever
    # is left over can be flagged as 'other' in one step at the end.
    matched = pd.Series(False, index=df.index)
    for property_type in property_types:
        if property_type == 'other':
            continue
        if property_type == 'apartment':
            possible_types = [property_type, 'flat']
        elif property_type == 'acreage':
            # Also accepts the literal 'acreage', in addition to the two raw
            # spellings the question's loop mapped to this column.
            possible_types = [property_type, 'acreage+semi+rural', 'terrance']
        else:
            possible_types = [property_type]

        # Series.isin lives on the Series, not on the .str accessor.
        hits = df['Type_Property'].isin(possible_types)
        df.loc[hits, property_type] = 1
        matched |= hits

    df.loc[~matched, 'other'] = 1
    return df

此外，我想分享一个有助于优化pandas代码的链接：A Beginner’s Guide to Optimizing Pandas Code for Speed

Answer 3

我清理了它，不确定如何提高效率

cordova clean

如何优化pandas DataFrame的处理并提高代码的可读性？

3 个答案: