在工作中,我写了一个 Python 脚本来关联两个文件。由于我是自学编程的,身边也没有会写代码的同事,所以在这里提出这个问题。
我的代码运行起来耗时长得难以置信。能否改进下面的代码?(我确信这是可以做到的,但我不知道该怎么做。)
背景:根据建筑物可能所在的地址,把两个各有数十万行的文件关联起来。由于这两个数据库都是手工录入的,所以这并不简单。
目标:得到一个文件,其中包含地址匹配的信息(地址完全相同、街道相同,或者是同一条街道但街道类型名称不同)。
谢谢大家的建议!
import functools
import re
import unicodedata

import nltk
import pandas as pd
from nltk.corpus import stopwords
def strip_accents(s):
    '''Return s with all combining accent marks removed.

    s is decomposed with Unicode NFD so that an accented letter becomes a
    base letter followed by its combining mark(s); every character whose
    Unicode category is 'Mn' (nonspacing mark) is then dropped.
    Characters without a decomposition are returned unchanged.
    '''
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed
                   if unicodedata.category(c) != 'Mn')
# Compiled once at import time: tokenize() is called for every address.
_WORD_RE = re.compile(r'\w+')  # runs of word characters (letters, digits, "_")


def tokenize(string):
    '''Return the list of words and numbers found in string.

    A token is a maximal run of word characters (regex ``\\w+``); any other
    character (spaces, punctuation, apostrophes, hyphens) separates tokens
    and is discarded. An empty string yields an empty list.
    '''
    return _WORD_RE.findall(string)
@functools.lru_cache(maxsize=None)
def _french_stopword_base():
    '''Return stopwords.words('french') as a frozenset, fetched only once.'''
    return frozenset(stopwords.words('french'))


def french_stopwords(my_list, update=()):
    '''Return the tokens of my_list that are not French stopwords.

    my_list: iterable of tokens (strings).
    update: iterable of extra words to treat as stopwords for this call,
        in addition to stopwords.words('french').
    Returns a new list that keeps the remaining tokens in their original
    order; my_list and update are not modified.
    '''
    # The base set is cached so the corpus is not queried on every call;
    # union() builds a new set and leaves the cached one untouched.
    excluded = _french_stopword_base().union(update)
    return [token for token in my_list if token not in excluded]
def set_column_sequence(dataframe, seq, front=True):
    '''Return dataframe with the columns listed in seq grouped at one end.

    dataframe: pandas DataFrame to reorder (it is not modified).
    seq: iterable of column labels, in the order they should appear.
    front: if True, seq comes first and the remaining columns follow in
        their original order. If False, seq comes last and the remaining
        columns are placed before it, in reverse of their original order
        (each one is inserted at position 0).
    Returns dataframe[cols]; a label of seq that is not a column of
    dataframe therefore raises KeyError.
    '''
    cols = list(seq)  # copy so we don't mutate seq (also accepts tuples)
    placed = set(cols)  # labels already present in cols
    for column in dataframe.columns:
        if column in placed:
            continue
        placed.add(column)
        if front:
            # "seq" stays in front: remaining columns go to the end
            cols.append(column)
        else:
            # "seq" stays last: remaining columns go to the front
            cols.insert(0, column)
    return dataframe[cols]
def _strip_bracketed(series):
    '''Return series with "(...)" and "[...]" spans removed from each string.'''
    # regex=True is explicit because pandas >= 2.0 treats the pattern as a
    # literal string by default, which would leave the text unchanged.
    # NOTE(review): ".*" is greedy, so with several bracketed groups in one
    # value everything from the first opening to the last closing bracket is
    # removed - confirm this is intended.
    series = series.str.replace(r"\(.*\)", "", regex=True)
    return series.str.replace(r"\[.*\]", "", regex=True)


# IPIC data
# read_excel is called without `encoding`: pandas >= 1.0 rejects that keyword.
df = pd.read_excel('all_files_IPIC_2_test.xls')
df_ipic = df[df['Dernier'] == 1]  # select only last data entry (dernier = 1)
df_ipic = df_ipic.reset_index(drop=True)
df_ipic = df_ipic.fillna('')  # replace all NaN values by ''
df_ipic['Rue_01_Diff'] = _strip_bracketed(df_ipic['Rue_01_Diff'])
# Adlib data
df_adlib = pd.read_csv('database_adlib_buildings_test.csv', encoding='latin1')
df_adlib = df_adlib.fillna('')  # replace all NaN values by ''
# rename the columns whose names cannot be used as attributes
df_adlib = df_adlib.rename(columns={'object_type_(OB)': 'object_type_OB',
                                    'title and description': 'title_and_description'})
df_adlib['current_location'] = _strip_bracketed(df_adlib['current_location'])
# extra words removed from the addresses together with the French stopwords
words_list = ['a', 'dite', 'dit']
# result lists (each name is bound to its own empty list)
ipic_id, ipic_adress = [], []  # column names = CodeInt, Rue_01_Diff
adlib_adress = []  # column name = current_location
adlib_street = []  # derived from this script
adlib_street_changed = []
adlib_object_number = []  # column name = object_number
osm_adress, osm_id = [], []
# accumulators: tempN holds the matched text, temp_N the IPIC id and
# temp_NN the Adlib object number of the same match
temp1, temp2, temp3 = [], [], []
temp_1, temp_2, temp_3 = [], [], []
temp_11, temp_22, temp_33 = [], [], []
temp_osm = []
# -------------------------------------------------------------------
# Street-type words that are treated as interchangeable when comparing streets.
groupe_semantique = {'nom_generique': 'rue',
                     'liste': ['rue', 'avenue', 'boulevard', 'autoroute', 'chaussée']}
# Addresses are accent-stripped before tokenizing, so the street types must be
# accent-stripped too; otherwise 'chaussée' can never equal a token.
_street_types = {strip_accents(x) for x in groupe_semantique['liste']}
_generic_type = groupe_semantique['nom_generique']


def _address_keys(address):
    '''Tokenize one address and derive the values the three match levels use.

    Returns (tokens, street, has_type, generic):
    tokens   -- lower-cased, accent-stripped tokens without stopwords
    street   -- the tokens that are not made of digits only
    has_type -- True if street contains one of the street-type words
    generic  -- street joined by spaces, with every street-type word
                replaced by the generic one
    '''
    tokens = french_stopwords(tokenize(strip_accents(address.lower())),
                              update=words_list)
    street = [x for x in tokens if not x.isdigit()]
    has_type = any(x in _street_types for x in street)
    generic = ' '.join(_generic_type if x in _street_types else x
                       for x in street)
    return tokens, street, has_type, generic


# Tokenize every Adlib address once (instead of once per IPIC row) and index
# the rows by the two keys an IPIC address can match on.
_adlib_rows = []
_by_street = {}   # frozenset of street tokens -> positions in _adlib_rows
_by_generic = {}  # generic street string -> positions (rows with a street type)
for _adlib, _number in zip(df_adlib.current_location, df_adlib.object_number):
    _tokens, _street, _has_type, _generic = _address_keys(_adlib)
    _street_set = frozenset(_street)
    _pos = len(_adlib_rows)
    _adlib_rows.append((_adlib, _number, _tokens, frozenset(_tokens),
                        _street, _street_set, _has_type, _generic))
    _by_street.setdefault(_street_set, []).append(_pos)
    if _has_type:
        _by_generic.setdefault(_generic, []).append(_pos)

# Set mirrors of temp_22 / temp_33 so the "already linked" test is O(1).
_seen_street = set(temp_22)
_seen_generic = set(temp_33)

for _ipic, _code_int in zip(df_ipic.Rue_01_Diff, df_ipic.CodeInt):
    _ipic_tok, _a, _ipic_has_type, _a_generic = _address_keys(_ipic)
    print(_ipic_tok)
    _ipic_set = frozenset(_ipic_tok)
    _a_set = frozenset(_a)
    # Only rows sharing the street-token set or the generic street string can
    # pass any of the three tests below (equal token sets imply equal street
    # sets), so only those rows are visited, in their original order.
    _candidates = set(_by_street.get(_a_set, ()))
    if _ipic_has_type:
        _candidates.update(_by_generic.get(_a_generic, ()))
    for _pos in sorted(_candidates):
        (_adlib, _number, _adlib_tok, _adlib_set,
         _b, _b_set, _adlib_has_type, _b_generic) = _adlib_rows[_pos]
        # raw matching: same set of tokens, numbers included
        if _adlib_set == _ipic_set:
            print('|'.join(_adlib_tok))
            temp1.append(_adlib)
            temp_1.append(_code_int)
            temp_11.append(_number)
        # only street name: same set of non-numeric tokens; each Adlib object
        # is recorded for the first matching IPIC row only
        if _a_set == _b_set and _number not in _seen_street:
            print('-'.join(_b))
            temp2.append(' '.join(_b))
            temp_2.append(_code_int)
            temp_22.append(_number)
            _seen_street.add(_number)
        # change street denomination: both sides name a street type and are
        # identical once that type is replaced by the generic one
        if (_ipic_has_type and _adlib_has_type
                and _number not in _seen_generic
                and _a_generic == _b_generic):
            print('AAAAAAAAAAAA')
            temp3.append(_adlib)
            temp_3.append(_code_int)
            temp_33.append(_number)
            _seen_generic.add(_number)
# One frame per list triple filled by the matching loop, without duplicate rows.
df1 = pd.DataFrame({
    'id_ipic': temp_1,
    'adlib_adress': temp1,
    'adlib_object_number': temp_11,
}).drop_duplicates()
df2 = pd.DataFrame({
    'id_ipic': temp_2,
    'adlib_street': temp2,
    'adlib_object_number': temp_22,
}).drop_duplicates()
df3 = pd.DataFrame({
    'id_ipic': temp_3,
    'adlib_type_street_changed': temp3,
    'adlib_object_number': temp_33,
}).drop_duplicates()
# descriptive columns taken from the two source tables
data_ipic = pd.DataFrame({
    'id_ipic': df_ipic.CodeInt,
    'ipic_adress': df_ipic.Rue_01_Diff,
    'Libelle_Diff': df_ipic.Libelle_Diff,
})
data_adlib = pd.DataFrame({
    'adlib_object_name': df_adlib.object_name,
    'adlib_object_number': df_adlib.object_number,
    'object_type_OB': df_adlib.object_type_OB,
    'title_and_description': df_adlib.title_and_description,
})
# final dataframe: outer-merge the three match frames, attach the
# descriptive columns with inner merges, then drop duplicates and renumber
df_recap = (
    df1.merge(df2, how='outer')
    .merge(df3, how='outer')
    .merge(data_ipic, how='inner')
    .merge(data_adlib, how='inner')
    .drop_duplicates()
    .reset_index(drop=True)
)
# change columns order
seq = [
    'id_ipic', 'Libelle_Diff', 'ipic_adress',
    'adlib_adress', 'adlib_street', 'adlib_type_street_changed',
    'adlib_object_name', 'adlib_object_number',
    'object_type_OB', 'title_and_description',
]
df_recap = set_column_sequence(df_recap, seq, front=True)