Python 3:运行时间过长——这段代码还能改进吗?

时间:2017-08-01 12:00:51

标签: performance python-3.x pandas dataframe database-performance

因为工作需要,我写了一个 Python 脚本来关联两个文件。我是自学编程的,身边也没有会写代码的同事,所以在这里提问。

我的代码运行起来需要极长的时间。下面的代码是否可以改进?(我确信可以,但我不知道该怎么做。)

背景:根据建筑物可能的位置地址,把两个各有数十万行的文件关联起来。由于这两个数据库都是人工录入的,所以匹配起来并不简单。

目标:得到一个文件,其中记录两个数据库之间的地址对应关系(地址完全相同、街道相同,或街道相同但街道类型的称谓不同,例如 rue 与 avenue)。

谢谢大家的建议!

import unicodedata
import pandas as pd
import nltk
from nltk .corpus import stopwords

def strip_accents(s):
    '''Return ``s`` with its accent marks removed.

    The string is first decomposed to Unicode NFD form, then every character
    whose Unicode category is ``Mn`` is dropped and the rest is re-joined.
    '''
    decomposed = unicodedata.normalize('NFD', s)
    kept = [char for char in decomposed
            if unicodedata.category(char) != 'Mn']
    return ''.join(kept)


def tokenize(string):
    '''Return the list of word/number tokens found in ``string``.

    A token is a maximal run of ``\\w`` characters (letters, digits and
    underscore); everything else acts as a separator.  An empty string
    yields an empty list.

    The previous version built a new ``nltk.RegexpTokenizer`` on every call
    only to apply the pattern ``\\w+``.  This function is called once per
    address inside a nested loop, so the tokens are now extracted directly
    with the standard ``re`` module, which produces the same list.
    '''
    # Function-scope import keeps this block self-contained; ``re`` caches the
    # compiled pattern, so repeated calls do not recompile it.
    import re
    return re.findall(r'\w+', string)


# Cache for the NLTK French stop-word list, filled on first use so the corpus
# is read only once instead of once per call.
_STOPWORD_CACHE = {}


def french_stopwords(my_list, update=None):
    '''Return the tokens of ``my_list`` that are not French stop words.

    my_list -- iterable of tokens to filter; it is not modified.
    update  -- optional iterable of extra words to treat as stop words in
               addition to ``stopwords.words('french')``.  ``None`` (the
               default) adds nothing.

    Returns a new list that keeps the order of ``my_list``.
    '''
    base = _STOPWORD_CACHE.get('french')
    if base is None:
        # frozenset: the cached words are shared by every call and must never
        # be mutated by one of them.
        base = _STOPWORD_CACHE['french'] = frozenset(stopwords.words('french'))
    excluded = base if update is None else base.union(update)
    return [token for token in my_list if token not in excluded]


def set_column_sequence(dataframe, seq, front=True):
    '''Return ``dataframe`` with the columns listed in ``seq`` moved together.

    With ``front`` true, ``seq`` comes first and the remaining columns follow
    in their original order.  With ``front`` false, ``seq`` comes last and the
    remaining columns precede it; because each one is inserted at position 0
    they end up in the reverse of their original order.

    ``seq`` itself is left untouched: the work is done on a slice copy.
    '''
    ordered = seq[:]
    for name in dataframe.columns:
        if name in ordered:
            continue
        # Inserting at the end is an append; inserting at 0 pushes the
        # columns of ``seq`` towards the back.
        position = len(ordered) if front else 0
        ordered.insert(position, name)
    return dataframe[ordered]


# IPIC data
# No ``encoding`` argument: an Excel workbook is not a plain-text file, and
# newer pandas releases reject that keyword in ``read_excel``.
df = pd.read_excel('all_files_IPIC_2_test.xls')
df_ipic = df[(df['Dernier'] == 1)]  # select only last data entry (dernier= 1)
df_ipic = df_ipic.reset_index(drop=True)
df_ipic = df_ipic.fillna('')        # remove all Nan values
# Drop "(...)" and "[...]" annotations from the address.  ``regex=True`` is
# explicit because pandas 2.0 switched the default of ``str.replace`` to
# literal matching, which would silently leave the annotations in place.
df_ipic['Rue_01_Diff'] = df_ipic['Rue_01_Diff'].str.replace(r"\(.*\)", "", regex=True)
df_ipic['Rue_01_Diff'] = df_ipic['Rue_01_Diff'].str.replace(r"\[.*\]", "", regex=True)

# Adlib data
df_adlib = pd.read_csv('database_adlib_buildings_test.csv', encoding='latin1')
df_adlib = df_adlib.fillna('')  # remove all Nan values
# Rename the columns whose names contain spaces or parentheses so they can be
# reached with attribute access later on.
df_adlib = df_adlib.rename(columns={'object_type_(OB)': 'object_type_OB',
                                    'title and description':'title_and_description'})
# Same clean-up of "(...)" and "[...]" annotations as for the IPIC addresses.
df_adlib['current_location'] = df_adlib['current_location'].str.replace(r"\(.*\)", "", regex=True)
df_adlib['current_location'] = df_adlib['current_location'].str.replace(r"\[.*\]", "", regex=True)

# words to add to stopwords
words_list = ['a', 'dite', 'dit']

# Result lists.  The matching loop below only fills the ``temp*`` lists; the
# other names are kept in case code outside this section relies on them.
ipic_id = []       # column name = CodeInt
ipic_adress = []   # column name = Rue_01_Diff
adlib_adress = []  # column name = current_location
adlib_street = []  # derived from this script
adlib_street_changed = []
adlib_object_number = []  # column name = object_number
osm_adress = []
osm_id = []

temp1 = []    # exact match: Adlib address
temp2 = []    # street match: Adlib street name (numbers removed)
temp3 = []    # street-type match: Adlib address
temp_1 = []   # exact match: IPIC CodeInt
temp_2 = []   # street match: IPIC CodeInt
temp_22 = []  # street match: Adlib object_number
temp_3 = []   # street-type match: IPIC CodeInt
temp_33 = []  # street-type match: Adlib object_number
temp_11 = []  # exact match: Adlib object_number
temp_osm = []

# Street-type words that are all replaced by the generic word before the
# third comparison.  Built once instead of once per pair of rows.
groupe_semantique = {'nom_generique': 'rue',
                     'liste':['rue', 'avenue', 'boulevard', 'autoroute', 'chaussée']}
generic_type = groupe_semantique['nom_generique']
# The tokens being compared have had their accents stripped, so the reference
# words must be stripped too; otherwise 'chaussée' can never match a token.
street_types = {strip_accents(x) for x in groupe_semantique['liste']}


def _prepare_address(address):
    '''Return everything the matching loop needs to know about one address.

    The result is a tuple ``(tokens, token_set, street, street_set,
    has_type, generic)`` where ``tokens`` is the lower-cased, accent-free,
    stop-word-free token list, ``street`` is ``tokens`` without the purely
    numeric ones, ``has_type`` tells whether ``street`` contains a street-type
    word and ``generic`` is ``street`` joined with spaces after replacing
    every street-type word by the generic one.
    '''
    tokens = french_stopwords(tokenize(strip_accents(address.lower())),
                              update=words_list)
    street = [x for x in tokens if not x.isdigit()]
    has_type = any(x in street_types for x in street)
    generic = ' '.join(generic_type if x in street_types else x for x in street)
    return tokens, set(tokens), street, set(street), has_type, generic


# -------------------------------------------------------------------
# Normalise every Adlib address ONCE.  Doing it inside the nested loop repeated
# the same work for every IPIC row and was the main cost of the script.
adlib_rows = [
    (adlib, adlib_number, _prepare_address(adlib))
    for adlib, adlib_number in zip(df_adlib['current_location'],
                                   df_adlib['object_number'])
]

# Set mirrors of temp_22 / temp_33 so the "already matched" tests do not scan
# an ever-growing list.
seen_street = set(temp_22)
seen_changed = set(temp_33)

# NOTE(review): the inner scan is still one pass over all Adlib rows per IPIC
# row.  Indexing ``adlib_rows`` by ``frozenset`` of tokens would avoid it, but
# the order of the printed lines would have to be re-checked.
for ipic, ipic_code in zip(df_ipic['Rue_01_Diff'], df_ipic['CodeInt']):
    ipic_tok, ipic_set, a, a_set, a_has_type, a_generic = _prepare_address(ipic)
    print(ipic_tok)

    for adlib, adlib_number, prepared in adlib_rows:
        adlib_tok, adlib_set, b, b_set, b_has_type, b_generic = prepared

        # raw matching: same set of tokens, house numbers included
        if adlib_set == ipic_set:
            print('|'.join(adlib_tok))
            temp1.append(adlib)
            temp_1.append(ipic_code)
            temp_11.append(adlib_number)

        # only street name: same tokens once the numbers are ignored; each
        # Adlib object is recorded for the first matching IPIC row only
        if a_set == b_set and adlib_number not in seen_street:
            print('-'.join(b))
            temp2.append(' '.join(b))
            temp_2.append(ipic_code)
            temp_22.append(adlib_number)
            seen_street.add(adlib_number)

        # change street denomination: both sides name a street type and are
        # identical, token for token, once that type is made generic
        if (a_has_type and b_has_type and a_generic == b_generic
                and adlib_number not in seen_changed):
            print('AAAAAAAAAAAA')
            temp3.append(adlib)
            temp_3.append(ipic_code)
            temp_33.append(adlib_number)
            seen_changed.add(adlib_number)



# One de-duplicated table per kind of match, each built from the parallel
# lists filled by the matching loop.
df1 = pd.DataFrame({'id_ipic': temp_1,
                    'adlib_adress': temp1,
                    'adlib_object_number': temp_11}).drop_duplicates()
df2 = pd.DataFrame({'id_ipic': temp_2,
                    'adlib_street': temp2,
                    'adlib_object_number': temp_22}).drop_duplicates()
df3 = pd.DataFrame({'id_ipic': temp_3,
                    'adlib_type_street_changed': temp3,
                    'adlib_object_number': temp_33}).drop_duplicates()

# information from xls files: the descriptive columns of each source, under
# the names used in the final table
data_ipic = pd.DataFrame({'id_ipic': df_ipic.CodeInt,
                          'ipic_adress': df_ipic.Rue_01_Diff,
                          'Libelle_Diff': df_ipic.Libelle_Diff})
data_adlib = pd.DataFrame({'adlib_object_name': df_adlib.object_name,
                           'adlib_object_number': df_adlib.object_number,
                           'object_type_OB': df_adlib.object_type_OB,
                           'title_and_description': df_adlib.title_and_description})

# final dataframe: outer-merge the three match tables, inner-merge the
# descriptive columns of both sources, then drop duplicate rows and renumber
df_recap = (
    df1.merge(df2, how='outer')
       .merge(df3, how='outer')
       .merge(data_ipic, how='inner')
       .merge(data_adlib, how='inner')
       .drop_duplicates()
       .reset_index(drop=True)
)

# change columns order
seq = ['id_ipic', 'Libelle_Diff', 'ipic_adress', 'adlib_adress', 'adlib_street',
       'adlib_type_street_changed', 'adlib_object_name', 'adlib_object_number',
       'object_type_OB', 'title_and_description']
df_recap = set_column_sequence(df_recap, seq, front=True)

0 个答案:

没有答案