我正在抓取一个需要大量查询的网站(USGA GHIN 系统):查询 30 至 60 名高尔夫球手的差点(handicap),并将结果存入 Excel 表格。我认为代码本身没有问题——我怀疑该网站检测到来自同一来源的大量请求后,就会直接拒绝访问。这种情况通常出现在第 32 个请求左右:前 31 个请求运行正常,然后返回 404。遇到这种情况时,我会关闭 Selenium,等待 30 秒,然后重新启动。这样做有时有效,有时无效。希望有人能帮我让应用程序不再受此问题困扰。我已经让程序在检测到三次错误后保存已获取的数据并退出。如果稍等片刻再重新启动,它会从中断处继续运行,并且通常不会再遇到 404,一直运行到任务结束。
我能否更改所发送请求头(header)的某一部分,改变网站看到的身份,从而让我的应用程序每次都能一直运行到任务完成?
我已经描述了我尝试过的解决方法。
#
# This Module Retrieves and Stores Handicaps
# The Selenium driver is passed in; GHIN numbers are read from the workbook
#
def _submit_ghin_lookup(br, ghin_number):
    """Type *ghin_number* into the single-golfer lookup form and submit it.

    The lookup page must already be loaded in *br*. The driver is switched
    into the first iframe on the page before the form is located.
    """
    br.switch_to.frame(br.find_element(By.TAG_NAME, "iframe"))
    # Wait up to 10 seconds for the GHIN entry field to appear.
    entry_field = WebDriverWait(br, 10).until(EC.presence_of_element_located(
        (By.XPATH, '//*[@id="ctl00_bodyMP_tcLookupModes_tpSingle_tbGHIN"]')))
    entry_field.send_keys(ghin_number)
    br.find_element(
        By.CSS_SELECTOR,
        "#ctl00_bodyMP_tcLookupModes_tpSingle_btnSubmit1").click()


def Retrieve_Handicaps(br):
    """Look up each golfer's handicap on the GHIN site and store it in Excel.

    Workbook layout used by this function ("Handicaps" sheet):
      * row 1, column 2  - number of players
      * row 3, column 3  - last stored revision date
      * rows 4+, column 2 - GHIN numbers, terminated by 0 (or an empty cell)
      * rows 4+, column 3 - stored handicap; 100 marks "not yet retrieved"

    The first golfer is looked up to read the current revision date. If it
    differs from the stored one, every handicap is reset to 100. Each golfer
    still marked 100 is then looked up and the result stored. A failed lookup
    saves the workbook, restarts the browser after a 30 second pause and
    retries the same golfer; the third failure saves and gives up.

    Args:
        br: Selenium WebDriver with the GHIN lookup page already loaded.

    Returns:
        None. Results are written to the workbook on disk.

    Side effects:
        Assigns the module globals ``next_row``, ``ghin_number`` and
        ``workbook``. Reads ``retrieved_value`` after each successful
        ``Look_With_Wait`` call (presumably set by that helper - confirm).

    NOTE(review): after a restart the local ``br`` is rebound to the driver
    returned by ``Initialize_Start_Chrome()``; the driver object the caller
    passed in has been quit. Confirm the caller does not reuse it.
    """
    global next_row, ghin_number, workbook

    # Give up once this many lookups have failed in a single run.
    max_failures = 3
    failures = 0
    # First data row; rows 1-3 hold the player count and revision date.
    next_row = 4

    path1 = "C:/Users/Steve/Desktop/SaturdayGolf.xlsx"
    workbook = openpyxl.load_workbook(path1)
    sheet = workbook["Handicaps"]

    # Look up the first golfer only to read the current revision date.
    ghin_number = sheet.cell(row=next_row, column=2).value
    print("Processsing", ghin_number)
    _submit_ghin_lookup(br, ghin_number)
    result = Look_With_Wait(
        br, '//*[@id="ctl00_bodyMP_grdClubs"]/tbody/tr[1]/td[3]')
    if result == failure:
        print("Unable to retrieve revision date")
        workbook.close()
        return
    revision_date = retrieved_value.text
    print("Revision date = ", revision_date)

    # A new revision date invalidates every stored handicap.
    if sheet.cell(row=3, column=3).value != revision_date:
        print("new date found")
        sheet.cell(row=3, column=3).value = revision_date
        no_of_players = int(sheet.cell(row=1, column=2).value)
        for player_row in range(4, no_of_players + 4):
            sheet.cell(row=player_row, column=3).value = 100
        workbook.save(path1)
        print("Handicaps blanked out")

    # Main loop: one pass per row until the end-of-list marker is reached.
    while True:
        ghin_number = sheet.cell(row=next_row, column=2).value
        if not ghin_number:
            # 0 is the explicit end marker; an empty cell (None) also ends
            # the list so a missing marker cannot loop forever.
            workbook.save(path1)
            workbook.close()
            return

        if sheet.cell(row=next_row, column=3).value != 100:
            # Handicap already stored for this golfer - skip.
            next_row += 1
            continue

        # Reload the lookup page and query this golfer.
        print("Going to GHIN page")
        br.get(GHIN_URL)
        print("back from GHIN load")
        _submit_ghin_lookup(br, ghin_number)
        result = Look_With_Wait(
            br, '//*[@id="ctl00_bodyMP_grdClubs"]/tbody/tr/td[2]')

        if result == success:
            handicap = retrieved_value.text
            print("Loop #", next_row - 3, "GHIN # =", ghin_number,
                  "Handicap found = ", handicap)
            sheet.cell(row=next_row, column=3).value = handicap
            next_row += 1
            continue

        failures += 1
        if failures == max_failures:
            print("3 Failed attempts to work through the list")
            workbook.save(path1)
            workbook.close()
            return

        # Checkpoint progress, then restart the browser and retry this row.
        # The workbook stays open because the loop keeps writing to it.
        workbook.save(path1)
        print("workbook saved - returning to mainline")
        br.close()
        br.quit()
        sleep(30)
        print("restarting, failures =", failures)
        br = Initialize_Start_Chrome()