我有一个从 API 中提取数据的脚本:它通过 requests.get(url=url, auth=(user, password)).json() 获取响应,然后用下面的循环把所有结果整理成一个 DataFrame:
# Build one frame per record from its 'children' entries and tag every row
# with the parent record's contactId / origin / description.
frames = []
for record in all_results:
    df = pd.DataFrame(record.get('children', {}))
    df['contactId'] = record.get('contactId')
    df['origin'] = record.get('origin')
    df['description'] = record.get('description')
    frames.append(df)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; concatenate once instead. pd.concat raises on an
# empty list, so fall back to an empty frame when there are no records.
final_df = pd.concat(frames) if frames else pd.DataFrame()
- 完整脚本 -
# API credentials (blank in this listing; fill in before running).
user = ''
password = ""

# Starting values: offset of the first row to request and the page size.
start = 0
rows = 1500
base_url = 'https://....?start={0}&rows={1}'

print("Connecting to API..")
url = base_url.format(start, rows)
req = requests.get(url=url, auth=(user, password))
# Stop on an HTTP error status instead of trying to parse an error body as JSON.
req.raise_for_status()

print("Extracting data..")
out = req.json()
total_records = out['other']['numFound']
print("Total records found: " + str(total_records))

results = out['resultList']
# Copy the first page so extending all_results does not mutate the list
# held inside the first response.
all_results = list(results)
print("First " + str(rows) + " rows were extracted")

# resultList is an empty list once no more results are found, which ends the loop.
while results:
    start += rows  # advance the offset to the next page
    url = base_url.format(start, rows)  # rebuild the url for the new offset
    req = requests.get(url=url, auth=(user, password))
    req.raise_for_status()
    out = req.json()
    results = out['resultList']
    # Only report progress when the page actually contained rows; the final
    # (empty) page is just the stop signal.
    if results:
        all_results += results
        print("Next " + str(rows) + " rows were extracted")

# all_results now contains the rows from every page that was requested.
print("Total records returned from API: " + str(len(all_results)))  # should equal number of records in response
# Build one frame per record from its 'children' entries and tag every row
# with the parent record's contactId / origin / description.
frames = []
for record in all_results:
    df = pd.DataFrame(record.get('children', {}))
    df['contactId'] = record.get('contactId')
    df['origin'] = record.get('origin')
    df['description'] = record.get('description')
    frames.append(df)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration; concatenate once instead. pd.concat raises on an
# empty list, so fall back to an empty frame when there are no records.
final_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

final_df['ticker'] = final_df['identifier'].str.split('@').str.get(0)  # extract ticker (anything before @)

# drop_duplicates returns a new frame rather than modifying in place, so the
# result must be assigned back. Renumber afterwards so the index stays
# contiguous once duplicate rows are gone.
final_df = final_df.drop_duplicates(keep='first').reset_index(drop=True)
print('DataFrame from API created succesfully\n')