我最近写了一些代码,它能生成我想要的输出,但运行时间长得离谱……我有70万条客户记录,每条都要与435个仓库逐一计算距离,而仅处理其中10万条就花了3个小时。
我知道这个问题可能有些含糊,但我不确定它为什么运行得如此缓慢。我怀疑是嵌套的for循环造成的;另外,客户数据必须分块读取,否则无法加载。
import pandas as pd
import geopy.distance
# Tunables, named so the magic numbers used below are explained in one place.
CHUNK_SIZE = 10 ** 3  # customer rows read per pandas chunk
MAX_MILES = 300       # warehouses farther away than this get a zero index
PPM_COUNT = 13        # the last PPM_COUNT warehouse rows form the "PPM index"


def distance_index(distance_miles, max_miles=MAX_MILES):
    """Return the distance index of one customer/warehouse pair.

    The index falls linearly from 1 at zero distance to 0 at ``max_miles``
    and is 0 for anything farther away.

    The out-of-range case is tested directly. The previous code signalled it
    by overwriting the distance with 0 and then mapping an index of exactly 1
    back to 0, which also zeroed a warehouse located exactly at the customer.

    Args:
        distance_miles: Distance between customer and warehouse, in miles.
        max_miles: Cut-off distance; must be non-zero.

    Returns:
        ``(max_miles - distance_miles) / max_miles``, or ``0`` when
        ``distance_miles`` exceeds ``max_miles``.
    """
    if distance_miles > max_miles:
        return 0
    return (max_miles - distance_miles) / max_miles


def win_chance_row(indexes, ppm_count=PPM_COUNT):
    """Build one output row from a customer's per-warehouse distance indexes.

    Args:
        indexes: List of distance indexes, one per warehouse, in warehouse
            file order.
        ppm_count: How many trailing entries of ``indexes`` make up the PPM
            index; must be positive.

    Returns:
        A list holding the independent win chance of each of the last
        ``ppm_count`` warehouses (its index divided by the sum of all
        indexes), followed by that sum of all indexes, followed by the sum
        of the listed win chances.
    """
    total = sum(indexes)
    if total == 0:
        # No warehouse in range: divide by 1 so every chance is 0 instead of
        # raising ZeroDivisionError. The row then reports 1 as the total.
        total = 1
    chances = [value / total for value in indexes[-ppm_count:]]
    # Sum the chances before the two summary values are appended to the row.
    ppm_total = sum(chances)
    return [*chances, total, ppm_total]


def score_customer(lat, lon, warehouse_coords):
    """Return the output row for a single customer location.

    Args:
        lat: Customer latitude (the ``Lat`` column).
        lon: Customer longitude (the ``Lon`` column).
        warehouse_coords: Sequence of ``(lat, lon)`` pairs, one per warehouse.

    Returns:
        The list produced by ``win_chance_row`` for this customer.
    """
    customer = (lat, lon)
    indexes = [
        distance_index(geopy.distance.distance(customer, warehouse).miles)
        for warehouse in warehouse_coords
    ]
    return win_chance_row(indexes)


def main():
    """Score every customer against every warehouse and write the result.

    Reads ``dfware.csv`` once, streams ``dfcust.csv`` in chunks of
    ``CHUNK_SIZE`` rows, and writes one row per customer to
    ``customers1234.csv``. Progress (chunk number, then row number within
    the chunk) is printed to stdout.
    """
    dfware = pd.read_csv('dfware.csv', encoding="ISO-8859-1")
    # The warehouse coordinates never change, so extract them once instead of
    # calling DataFrame.iterrows() on dfware again for every customer.
    warehouse_coords = list(zip(dfware['Lat'], dfware['Lon']))

    rows = []
    # Only the chunked reader touches dfcust.csv; the file is no longer also
    # loaded in full up front.
    chunks = pd.read_csv(
        r'dfcust.csv', encoding="ISO-8859-1", chunksize=CHUNK_SIZE
    )
    for chunk_number, chunk in enumerate(chunks, start=1):
        # The chunk counter is no longer bumped inside the warehouse loop, so
        # it really counts chunks.
        print(chunk_number)
        customers = zip(chunk['Lat'], chunk['Lon'])
        for row_number, (lat, lon) in enumerate(customers):
            print(row_number)
            rows.append(score_customer(lat, lon, warehouse_coords))

    # NOTE(review): the geodesic call per customer/warehouse pair is still the
    # dominant cost. A vectorised great-circle formula would be far faster but
    # would change the distances slightly, so it is not applied here.
    pd.DataFrame(rows).to_csv('customers1234.csv')


if __name__ == "__main__":
    main()
    # Keep the script's explicit termination without relying on the `site`
    # module's interactive `exit()` helper.
    raise SystemExit