问题:我正在尝试通过API服务提取数据。单个请求可能需要3到10秒。 Pandas DataFrame中大约有20,000行数据要输入到API调用中。我通过多处理设法加快了速度,但它仍然运行得很慢。有什么建议?
代码:
def scored_card_features2(source, n_batches):
"""Multiprocessing version of Scored Card Features Function
Returns reason for rating
"""
# read in source data and convert to list of lists for inputs
data = pd.read_excel(source)
data = data[['primary_bank_report_id', 'primary_tu_credit_report_id', 'purpose']]
inputs = data.values.tolist()
def scored_card_map(i):
"""form request to scored card service and retrieve values"""
url = "url/FourthGen?bank_report_id=%s&credit_report_id=%s&" \
"&loan_purpose=%s" % (i[0], i[1], i[2].replace(" ", "%20"))
r = requests.get(url)
try:
d = json.loads(r.text)
l = [d['probability_of_default'],
d['condition'],
d['purpose_of_loan'],
d['rating'],
d['bank_report_id'],
d['reason_for_rating'],
d['credit_report_id']]
return l
except:
l = [np.nan] * 7
return l
# inititate multithreading
with Pool(n_batches) as p:
vals = p.map(scored_card_map, inputs)
result = pd.DataFrame(vals, columns=['Probability of Default', 'Condition', 'Purpose of Loan', 'Rating', 'Bank Report ID',
'Reason for Rating', 'Credit Report ID'])
result = result.dropna(how='all')
return result
if __name__ == '__main__':
# model features
start = time.time()
df = scored_card_features2('BankCreditPortalIDsPurpose.xlsx', multiprocessing.cpu_count()-1)
df.to_csv('scored_card_features.csv', index=False)
end = time.time()
print(end-start)